LLMs-Universal-Life-Science-and-Clinical-Skills- cnvkit-analysis

<!--

install
source · Clone the upstream repo
git clone https://github.com/mdbabumiamssm/LLMs-Universal-Life-Science-and-Clinical-Skills-
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/mdbabumiamssm/LLMs-Universal-Life-Science-and-Clinical-Skills- "$T" && mkdir -p ~/.claude/skills && cp -r "$T/Skills/Genomics/copy-number/cnvkit-analysis" ~/.claude/skills/mdbabumiamssm-llms-universal-life-science-and-clinical-skills-cnvkit-analysis && rm -rf "$T"
manifest: Skills/Genomics/copy-number/cnvkit-analysis/SKILL.md
source content
<!-- # COPYRIGHT NOTICE # This file is part of the "Universal Biomedical Skills" project. # Copyright (c) 2026 MD BABU MIA, PhD <md.babu.mia@mssm.edu> # All Rights Reserved. # # This code is proprietary and confidential. # Unauthorized copying of this file, via any medium is strictly prohibited. # # Provenance: Authenticated by MD BABU MIA -->

name: bio-copy-number-cnvkit-analysis
description: Detect copy number variants from targeted/exome sequencing using CNVkit. Supports tumor-normal pairs, tumor-only, and germline CNV calling. Use when detecting CNVs from WES or targeted panel sequencing data.
tool_type: cli
primary_tool: cnvkit
measurable_outcome: Execute skill workflow successfully with valid output within 15 minutes.
allowed-tools:
  - read_file
  - run_shell_command

CNVkit CNV Analysis

Basic Workflow

# Complete pipeline for tumor-normal pair
cnvkit.py batch tumor.bam \
    --normal normal.bam \
    --targets targets.bed \
    --fasta reference.fa \
    --output-reference my_reference.cnn \
    --output-dir results/

Build Reference from Normal Samples

# Step 1: Build reference from multiple normals (recommended)
cnvkit.py batch \
    --normal normal1.bam normal2.bam normal3.bam \
    --targets targets.bed \
    --fasta reference.fa \
    --output-reference pooled_reference.cnn

# Step 2: Run on tumor samples using pre-built reference
cnvkit.py batch tumor1.bam tumor2.bam \
    --reference pooled_reference.cnn \
    --output-dir results/

Flat Reference (No Matched Normal)

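# When no matched normal is available: pass --normal with no BAM files
# so that CNVkit builds a "flat" reference (equal expected coverage in all bins)
cnvkit.py batch tumor.bam \
    --normal \
    --targets targets.bed \
    --fasta reference.fa \
    --output-reference flat_reference.cnn \
    --output-dir results/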

WGS Mode

# For whole genome sequencing (no targets file)
cnvkit.py batch tumor.bam \
    --normal normal.bam \
    --fasta reference.fa \
    --method wgs \
    --output-dir results/

Step-by-Step Pipeline

# 1. Generate target and antitarget regions
cnvkit.py target targets.bed --annotate refFlat.txt -o targets.target.bed
cnvkit.py antitarget targets.bed -o targets.antitarget.bed

# 2. Calculate coverage
cnvkit.py coverage tumor.bam targets.target.bed -o tumor.targetcoverage.cnn
cnvkit.py coverage tumor.bam targets.antitarget.bed -o tumor.antitargetcoverage.cnn
cnvkit.py coverage normal.bam targets.target.bed -o normal.targetcoverage.cnn
cnvkit.py coverage normal.bam targets.antitarget.bed -o normal.antitargetcoverage.cnn

# 3. Build reference
cnvkit.py reference normal.targetcoverage.cnn normal.antitargetcoverage.cnn \
    --fasta reference.fa -o reference.cnn

# 4. Fix and call
cnvkit.py fix tumor.targetcoverage.cnn tumor.antitargetcoverage.cnn reference.cnn -o tumor.cnr
cnvkit.py segment tumor.cnr -o tumor.cns
cnvkit.py call tumor.cns -o tumor.call.cns

Segmentation Options

# Default CBS (Circular Binary Segmentation)
cnvkit.py segment sample.cnr -o sample.cns

# Use HMM-based segmentation as an alternative to the default CBS
cnvkit.py segment sample.cnr --method hmm -o sample.cns

# Adjust smoothing
cnvkit.py segment sample.cnr --smooth-cbs -o sample.cns

CNV Calling with Ploidy/Purity

# Specify tumor purity and ploidy
cnvkit.py call sample.cns \
    --purity 0.7 \
    --ploidy 2 \
    -o sample.call.cns

# With B-allele frequencies (from VCF)
cnvkit.py call sample.cns \
    --vcf sample.vcf \
    --purity 0.7 \
    -o sample.call.cns

Export Results

# Export to BED format
cnvkit.py export bed sample.call.cns -o sample.cnv.bed

# Export to VCF
cnvkit.py export vcf sample.call.cns -o sample.cnv.vcf

# Export segments for GISTIC2
cnvkit.py export seg *.cns -o samples.seg

# Export for Nexus
cnvkit.py export nexus-basic sample.cnr -o sample.nexus.txt

Visualization

# Scatter plot with segments
cnvkit.py scatter sample.cnr -s sample.cns -o sample_scatter.png

# Single chromosome
cnvkit.py scatter sample.cnr -s sample.cns -c chr17 -o sample_chr17.png

# Diagram (ideogram style)
cnvkit.py diagram sample.cnr -s sample.cns -o sample_diagram.pdf

# Heatmap across samples
cnvkit.py heatmap *.cns -o heatmap.pdf

Key Output Files

| Extension | Description |
|-----------|-------------|
| .cnn | Reference or coverage file |
| .cnr | Copy ratios (log2) per bin |
| .cns | Segmented copy ratios |
| .call.cns | Called copy number states |

Python API

import cnvlib

# Load data
cnr = cnvlib.read('sample.cnr')
cns = cnvlib.read('sample.cns')

# Filter by chromosome
chr17 = cnr[cnr.chromosome == 'chr17']

# Get amplifications
amps = cns[cns['log2'] > 0.5]

# Get deletions
dels = cns[cns['log2'] < -0.5]

# Export (the underlying pandas DataFrame is exposed as .data)
cnr.data.to_csv('sample.cnr.tsv', sep='\t', index=False)

Quality Control

# Check reference quality
cnvkit.py metrics *.cnr -s *.cns

# Check for gender mismatches
cnvkit.py sex *.cnr *.cnn

# Values to review in the `cnvkit.py metrics` output:
#   - Median absolute deviation (lower is better)
#   - Biweight midvariance (sample heterogeneity)

Key Parameters

| Parameter | Default | Description |
|-----------|---------|-------------|
| --method | hybrid | hybrid, wgs, amplicon |
| --segment-method | cbs | cbs, hmm, haar, none |
| --drop-low-coverage | on | Drop low-coverage bins |
| --purity | 1.0 | Tumor purity (0-1) |
| --ploidy | 2 | Sample ploidy |
| --thresholds | -1.1,-0.25,0.2,0.7 | CN state thresholds |

Related Skills

  • alignment-files/bam-statistics - QC of input BAMs
  • copy-number/cnv-visualization - Advanced plotting
  • copy-number/cnv-annotation - Gene-level annotation
  • long-read-sequencing/structural-variants - Complementary SV calling
<!-- AUTHOR_SIGNATURE: 9a7f3c2e-MD-BABU-MIA-2026-MSSM-SECURE -->