install
source · Clone the upstream repo
git clone https://github.com/mdbabumiamssm/LLMs-Universal-Life-Science-and-Clinical-Skills-
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/mdbabumiamssm/LLMs-Universal-Life-Science-and-Clinical-Skills- "$T" && mkdir -p ~/.claude/skills && cp -r "$T/Skills/Genomics/copy-number/cnv-annotation" ~/.claude/skills/mdbabumiamssm-llms-universal-life-science-and-clinical-skills-cnv-annotation && rm -rf "$T"
manifest:
Skills/Genomics/copy-number/cnv-annotation/SKILL.mdsource content
<!--
# COPYRIGHT NOTICE
# This file is part of the "Universal Biomedical Skills" project.
# Copyright (c) 2026 MD BABU MIA, PhD <md.babu.mia@mssm.edu>
# All Rights Reserved.
#
# This code is proprietary and confidential.
# Unauthorized copying of this file, via any medium is strictly prohibited.
#
# Provenance: Authenticated by MD BABU MIA
-->
name: bio-copy-number-cnv-annotation description: Annotate CNVs with genes, pathways, and clinical significance. Use when interpreting CNV calls or identifying affected genes from copy number analysis. tool_type: mixed primary_tool: bedtools measurable_outcome: Execute skill workflow successfully with valid output within 15 minutes. allowed-tools:
- read_file
- run_shell_command
CNV Annotation
Annotate with Gene Names (bedtools)
# Convert CNV segments to BED awk 'NR>1 {print $1"\t"$2"\t"$3"\t"$5"\t"$6}' sample.cns > sample.cnv.bed # Intersect with gene annotations bedtools intersect -a sample.cnv.bed -b genes.bed -wa -wb > cnv_genes.txt # Get genes overlapping CNVs bedtools intersect -a genes.bed -b sample.cnv.bed -u > affected_genes.bed
CNVkit Gene Annotation
# Annotate during analysis cnvkit.py batch tumor.bam --normal normal.bam \ --targets targets.bed \ --annotate refFlat.txt \ --fasta reference.fa \ -o results/ # Genes are included in output CNS file
Python: Comprehensive Annotation
import pandas as pd import pybedtools as pbt def annotate_cnvs(cns_file, gene_bed, output=None): '''Annotate CNV segments with overlapping genes.''' cns = pd.read_csv(cns_file, sep='\t') # Create BED from segments cns_bed = pbt.BedTool.from_dataframe( cns[['chromosome', 'start', 'end', 'log2']].rename( columns={'chromosome': 'chrom'})) genes = pbt.BedTool(gene_bed) # Intersect intersect = cns_bed.intersect(genes, wa=True, wb=True) # Parse results results = [] for interval in intersect: results.append({ 'chrom': interval[0], 'start': int(interval[1]), 'end': int(interval[2]), 'log2': float(interval[3]), 'gene_chrom': interval[4], 'gene_start': int(interval[5]), 'gene_end': int(interval[6]), 'gene_name': interval[7] if len(interval) > 7 else 'NA' }) df = pd.DataFrame(results) # Aggregate genes per CNV cnv_genes = df.groupby(['chrom', 'start', 'end', 'log2'])['gene_name'].apply( lambda x: ','.join(sorted(set(x)))).reset_index() if output: cnv_genes.to_csv(output, sep='\t', index=False) return cnv_genes
Annotate with Cancer Gene Census
import pandas as pd def annotate_cancer_genes(cnv_genes, cgc_file): '''Flag cancer-associated genes in CNVs.''' cgc = pd.read_csv(cgc_file, sep='\t') cancer_genes = set(cgc['Gene Symbol'].tolist()) def check_cancer_genes(genes): if pd.isna(genes): return '' gene_list = genes.split(',') cancer = [g for g in gene_list if g in cancer_genes] return ','.join(cancer) cnv_genes['cancer_genes'] = cnv_genes['gene_name'].apply(check_cancer_genes) cnv_genes['n_cancer_genes'] = cnv_genes['cancer_genes'].apply( lambda x: len(x.split(',')) if x else 0) return cnv_genes
Annotate with ACMG/ClinVar
def annotate_clinvar_cnvs(cnv_bed, clinvar_vcf): '''Annotate CNVs with ClinVar variants.''' import pysam cnv = pd.read_csv(cnv_bed, sep='\t', header=None, names=['chrom', 'start', 'end', 'log2']) vcf = pysam.VariantFile(clinvar_vcf) results = [] for _, row in cnv.iterrows(): chrom = row['chrom'].replace('chr', '') pathogenic = [] for rec in vcf.fetch(chrom, row['start'], row['end']): clnsig = rec.info.get('CLNSIG', [''])[0] if 'pathogenic' in clnsig.lower(): gene = rec.info.get('GENEINFO', 'NA').split(':')[0] pathogenic.append(gene) results.append({ 'chrom': row['chrom'], 'start': row['start'], 'end': row['end'], 'log2': row['log2'], 'clinvar_pathogenic': ','.join(set(pathogenic)) }) return pd.DataFrame(results)
GISTIC2 for Recurrent CNVs
# Export segments for GISTIC cnvkit.py export seg *.cns -o cohort.seg # Run GISTIC2 gistic2 \ -b results/ \ -seg cohort.seg \ -refgene hg38.refgene.mat \ -genegistic 1 \ -smallmem 1 \ -broad 1 \ -brlen 0.5 \ -conf 0.90 \ -armpeel 1 \ -savegene 1 # Output: significant regions with genes
AnnotSV for Comprehensive Annotation
# Export CNVs to VCF cnvkit.py export vcf sample.cns -o sample.cnv.vcf # Run AnnotSV AnnotSV \ -SVinputFile sample.cnv.vcf \ -genomeBuild GRCh38 \ -outputFile sample_annotated \ -SVminSize 1000 # Output includes: genes, DGV, gnomAD-SV, ClinVar, OMIM
R: Gene Enrichment of CNV Regions
library(clusterProfiler) library(org.Hs.eg.db) # Get genes in amplified regions amp_genes <- unique(cnv_annotation$gene_name[cnv_annotation$log2 > 0.5]) # Convert to Entrez IDs entrez_ids <- mapIds(org.Hs.eg.db, keys=amp_genes, keytype='SYMBOL', column='ENTREZID') entrez_ids <- na.omit(entrez_ids) # GO enrichment go_results <- enrichGO( gene=entrez_ids, OrgDb=org.Hs.eg.db, ont='BP', pAdjustMethod='BH', qvalueCutoff=0.05 ) # KEGG enrichment kegg_results <- enrichKEGG( gene=entrez_ids, organism='hsa', pAdjustMethod='BH' )
Interpret CNV States
def interpret_cnv(log2, purity=1.0, ploidy=2): '''Interpret CNV log2 ratio as copy number state.''' # Adjusted for purity cn = 2 * (2 ** log2) if cn < 0.5: return 'homozygous_deletion' elif cn < 1.5: return 'heterozygous_deletion' elif cn < 2.5: return 'diploid' elif cn < 3.5: return 'single_copy_gain' else: return 'amplification' def summarize_cnvs(cns_annotated): '''Summarize CNV calls.''' cns_annotated['cnv_type'] = cns_annotated['log2'].apply(interpret_cnv) summary = { 'total_cnvs': len(cns_annotated), 'amplifications': (cns_annotated['cnv_type'] == 'amplification').sum(), 'deletions': (cns_annotated['cnv_type'].str.contains('deletion')).sum(), 'total_genes': cns_annotated['gene_name'].str.split(',').explode().nunique(), 'cancer_genes': cns_annotated['cancer_genes'].str.split(',').explode().nunique() } return summary
Output Report
def generate_cnv_report(cns_annotated, output_prefix): '''Generate CNV annotation report.''' # Full annotation table cns_annotated.to_csv(f'{output_prefix}_full.tsv', sep='\t', index=False) # High-impact CNVs high_impact = cns_annotated[ (cns_annotated['n_cancer_genes'] > 0) | (abs(cns_annotated['log2']) > 1) ] high_impact.to_csv(f'{output_prefix}_high_impact.tsv', sep='\t', index=False) # Gene-level summary genes = cns_annotated.explode('gene_name') gene_summary = genes.groupby('gene_name').agg({ 'log2': 'mean', 'chrom': 'first', 'start': 'min', 'end': 'max' }).reset_index() gene_summary.to_csv(f'{output_prefix}_genes.tsv', sep='\t', index=False) return high_impact
Related Skills
- copy-number/cnvkit-analysis - Generate CNV calls
- copy-number/cnv-visualization - Visualize annotated CNVs
- pathway-analysis/go-enrichment - Enrichment of CNV genes
- genome-intervals/bed-file-basics - BED file operations