install
source · Clone the upstream repo
git clone https://github.com/FreedomIntelligence/OpenClaw-Medical-Skills
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/FreedomIntelligence/OpenClaw-Medical-Skills "$T" && mkdir -p ~/.claude/skills && cp -r "$T/skills/variant-interpretation-acmg/bioSkills/vcf-statistics" ~/.claude/skills/freedomintelligence-openclaw-medical-skills-vcf-statistics && rm -rf "$T"
OpenClaw · Install into ~/.openclaw/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/FreedomIntelligence/OpenClaw-Medical-Skills "$T" && mkdir -p ~/.openclaw/skills && cp -r "$T/skills/variant-interpretation-acmg/bioSkills/vcf-statistics" ~/.openclaw/skills/freedomintelligence-openclaw-medical-skills-vcf-statistics && rm -rf "$T"
manifest:
skills/variant-interpretation-acmg/bioSkills/vcf-statistics/SKILL.mdsource content
<!--
# COPYRIGHT NOTICE
# This file is part of the "Universal Biomedical Skills" project.
# Copyright (c) 2026 MD BABU MIA, PhD <md.babu.mia@mssm.edu>
# All Rights Reserved.
#
# This code is proprietary and confidential.
# Unauthorized copying of this file, via any medium is strictly prohibited.
#
# Provenance: Authenticated by MD BABU MIA
-->
name: bio-vcf-statistics description: Generate variant statistics, sample concordance, and quality metrics using bcftools stats and gtcheck. Use when evaluating variant quality, comparing samples, or summarizing VCF contents. tool_type: cli primary_tool: bcftools measurable_outcome: Execute skill workflow successfully with valid output within 15 minutes. allowed-tools:
- read_file
- run_shell_command
VCF Statistics
Generate statistics and quality metrics using bcftools.
Statistics Tools
| Command | Purpose |
|---|---|
| Comprehensive variant statistics |
| Sample concordance and relatedness |
| Custom summaries |
bcftools stats
Basic Statistics
bcftools stats input.vcf.gz > stats.txt
View Key Metrics
bcftools stats input.vcf.gz | grep "^SN"
Output sections:
- Summary numbersSN
- Transitions/transversionsTSTV
- Singleton statsSiS
- Allele frequency distributionAF
- Quality distributionQUAL
- Indel distributionIDD
- Substitution typesST
- Depth distributionDP
Summary Numbers (SN)
bcftools stats input.vcf.gz | grep "^SN" | cut -f3-
Reports:
- Number of samples
- Number of records
- Number of SNPs
- Number of indels
- Number of multiallelic sites
- Number of multiallelic SNPs
Transition/Transversion Ratio
bcftools stats input.vcf.gz | grep "^TSTV"
Expected Ti/Tv ratio:
- Whole genome: ~2.0-2.1
- Exome: ~2.8-3.3
Per-Sample Statistics
bcftools stats -s - input.vcf.gz > per_sample.txt
Compare Two VCFs
bcftools stats input1.vcf.gz input2.vcf.gz > comparison.txt
Region-Specific Stats
bcftools stats -r chr1:1000000-2000000 input.vcf.gz > region_stats.txt
Exome Statistics
bcftools stats -R exome.bed input.vcf.gz > exome_stats.txt
Plotting Statistics
Generate Plots
bcftools stats input.vcf.gz > stats.txt plot-vcfstats -p output_dir stats.txt
Creates:
output_dir/summary.pdf- Individual PNG files
Comparison Plots
bcftools stats file1.vcf.gz file2.vcf.gz > comparison.txt plot-vcfstats -p comparison_dir comparison.txt
bcftools gtcheck
Check Sample Identity
bcftools gtcheck -g reference.vcf.gz query.vcf.gz
Reports concordance between samples.
Detect Sample Swaps
bcftools gtcheck -G 1 input.vcf.gz > relatedness.txt
Compares all samples pairwise.
Output Format
DC 0 sample1 sample2 0.95 1234 1200
Fields:
- DC: Data type (discordance)
- Index
- Sample 1
- Sample 2
- Discordance rate
- Sites compared
- Discordant sites
Check Against Reference Panel
bcftools gtcheck -g 1000genomes.vcf.gz unknown_sample.vcf.gz
Quick Statistics with Query
Count Variants
bcftools view -H input.vcf.gz | wc -l
Count by Type
# SNPs bcftools view -v snps -H input.vcf.gz | wc -l # Indels bcftools view -v indels -H input.vcf.gz | wc -l
Count PASS Variants
bcftools view -f PASS -H input.vcf.gz | wc -l
Quality Distribution
bcftools query -f '%QUAL\n' input.vcf.gz | \ awk '{sum+=$1; count++} END {print "Mean QUAL:", sum/count}'
Depth Distribution
bcftools query -f '%INFO/DP\n' input.vcf.gz | \ awk '{sum+=$1; count++} END {print "Mean DP:", sum/count}'
Genotype Counts
# Count heterozygous sites per sample bcftools query -f '[%GT\t]\n' input.vcf.gz | \ awk -F'\t' '{for(i=1;i<=NF;i++) if($i=="0/1" || $i=="0|1") het[i]++} END {for(i in het) print "Sample", i, "het:", het[i]}'
Allele Frequency Spectrum
bcftools query -f '%INFO/AF\n' input.vcf.gz | \ awk '{ if($1<0.01) rare++ else if($1<0.05) low++ else if($1<0.5) common++ else freq++ } END { print "Rare (<1%):", rare print "Low (1-5%):", low print "Common (5-50%):", common print "Frequent (>50%):", freq }'
Sample Statistics
List Samples
bcftools query -l input.vcf.gz
Count Samples
bcftools query -l input.vcf.gz | wc -l
Per-Sample Variant Counts
for sample in $(bcftools query -l input.vcf.gz); do count=$(bcftools view -s "$sample" -H input.vcf.gz | \ bcftools view -c 1 -H | wc -l) echo "$sample: $count" done
Missing Genotypes per Sample
bcftools stats -s - input.vcf.gz | grep "^PSC"
cyvcf2 Statistics
Basic Counts
from cyvcf2 import VCF stats = {'snps': 0, 'indels': 0, 'other': 0} for variant in VCF('input.vcf.gz'): if variant.is_snp: stats['snps'] += 1 elif variant.is_indel: stats['indels'] += 1 else: stats['other'] += 1 print(f'SNPs: {stats["snps"]}') print(f'Indels: {stats["indels"]}') print(f'Other: {stats["other"]}')
Quality Statistics
from cyvcf2 import VCF import numpy as np quals = [] for variant in VCF('input.vcf.gz'): if variant.QUAL: quals.append(variant.QUAL) quals = np.array(quals) print(f'Mean QUAL: {np.mean(quals):.1f}') print(f'Median QUAL: {np.median(quals):.1f}') print(f'Min QUAL: {np.min(quals):.1f}') print(f'Max QUAL: {np.max(quals):.1f}')
Genotype Distribution
from cyvcf2 import VCF vcf = VCF('input.vcf.gz') samples = vcf.samples hom_ref = [0] * len(samples) het = [0] * len(samples) hom_alt = [0] * len(samples) missing = [0] * len(samples) for variant in vcf: for i, gt in enumerate(variant.gt_types): if gt == 0: hom_ref[i] += 1 elif gt == 1: het[i] += 1 elif gt == 3: hom_alt[i] += 1 else: missing[i] += 1 for i, sample in enumerate(samples): print(f'{sample}: HOM_REF={hom_ref[i]}, HET={het[i]}, HOM_ALT={hom_alt[i]}, MISS={missing[i]}')
Transition/Transversion Calculation
from cyvcf2 import VCF transitions = 0 transversions = 0 ti_pairs = {('A', 'G'), ('G', 'A'), ('C', 'T'), ('T', 'C')} for variant in VCF('input.vcf.gz'): if not variant.is_snp: continue ref = variant.REF alt = variant.ALT[0] if (ref, alt) in ti_pairs: transitions += 1 else: transversions += 1 ratio = transitions / transversions if transversions > 0 else 0 print(f'Transitions: {transitions}') print(f'Transversions: {transversions}') print(f'Ti/Tv ratio: {ratio:.2f}')
Common Workflows
Quality Control Report
# Generate stats bcftools stats input.vcf.gz > stats.txt # Extract key metrics echo "=== VCF Summary ===" grep "^SN" stats.txt | cut -f3- echo "" echo "=== Ti/Tv Ratio ===" grep "^TSTV" stats.txt | cut -f5 # Generate plots plot-vcfstats -p qc_plots stats.txt
Compare Before/After Filtering
bcftools stats raw.vcf.gz filtered.vcf.gz > comparison.txt echo "=== Before Filtering ===" grep "^SN.*raw" comparison.txt | cut -f3- echo "" echo "=== After Filtering ===" grep "^SN.*filtered" comparison.txt | cut -f3-
Sample Relatedness Check
bcftools gtcheck -G 1 cohort.vcf.gz > relatedness.txt cat relatedness.txt
Quick Reference
| Task | Command |
|---|---|
| Full stats | |
| Summary only | |
| Ti/Tv ratio | |
| Per-sample | |
| Compare VCFs | |
| Sample check | |
| Plot stats | |
Common Errors
| Error | Cause | Solution |
|---|---|---|
| Empty VCF | Check if VCF has variants |
| Not installed | Install with bcftools |
| Invalid VCF | Check file format |
Related Skills
- vcf-basics - View and query VCF files
- filtering-best-practices - Evaluate filter impact
- vcf-manipulation - Compare call sets
- variant-calling - Assess calling quality