LLMs-Universal-Life-Science-and-Clinical-Skills- tumor-mutational-burden

<!--

install
source · Clone the upstream repo
git clone https://github.com/mdbabumiamssm/LLMs-Universal-Life-Science-and-Clinical-Skills-
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/mdbabumiamssm/LLMs-Universal-Life-Science-and-Clinical-Skills- "$T" && mkdir -p ~/.claude/skills && cp -r "$T/Skills/Clinical/Clinical_Databases/tumor-mutational-burden" ~/.claude/skills/mdbabumiamssm-llms-universal-life-science-and-clinical-skills-tumor-mutational-b && rm -rf "$T"
manifest: Skills/Clinical/Clinical_Databases/tumor-mutational-burden/SKILL.md
source content
<!-- # COPYRIGHT NOTICE # This file is part of the "Universal Biomedical Skills" project. # Copyright (c) 2026 MD BABU MIA, PhD <md.babu.mia@mssm.edu> # All Rights Reserved. # # This code is proprietary and confidential. # Unauthorized copying of this file, via any medium is strictly prohibited. # # Provenance: Authenticated by MD BABU MIA -->

name: bio-clinical-databases-tumor-mutational-burden description: Calculate tumor mutational burden from panel or WES data with proper normalization and clinical thresholds. Use when assessing immunotherapy eligibility or characterizing tumor immunogenicity. tool_type: python primary_tool: cyvcf2 measurable_outcome: Execute skill workflow successfully with valid output within 15 minutes. allowed-tools:

  • read_file
  • run_shell_command

Tumor Mutational Burden

TMB Calculation from VCF

from cyvcf2 import VCF

def calculate_tmb(vcf_path, panel_size_mb):
    '''Calculate TMB (mutations per megabase)

    Args:
        vcf_path: Path to somatic VCF
        panel_size_mb: Capture region size in megabases

    Returns:
        TMB value (mutations/Mb)
    '''
    vcf = VCF(vcf_path)
    mutation_count = 0

    for variant in vcf:
        # Count nonsynonymous coding mutations
        # Adjust filters based on VCF annotation format
        if is_coding_nonsynonymous(variant):
            mutation_count += 1

    tmb = mutation_count / panel_size_mb
    return tmb

def is_coding_nonsynonymous(variant):
    '''Check if variant is coding nonsynonymous

    Adjust logic based on your VCF annotation tool:
    - VEP: CSQ field
    - SnpEff: ANN field
    - Funcotator: FUNCOTATION field
    '''
    # Example for VEP annotation
    csq = variant.INFO.get('CSQ', '')
    if not csq:
        return False

    # Check consequence types
    nonsynonymous = ['missense_variant', 'nonsense', 'frameshift',
                     'inframe_insertion', 'inframe_deletion', 'stop_gained',
                     'stop_lost', 'start_lost']

    for transcript in csq.split(','):
        fields = transcript.split('|')
        consequence = fields[1] if len(fields) > 1 else ''
        if any(ns in consequence for ns in nonsynonymous):
            return True
    return False

Panel-Specific TMB

# Common panel sizes (in megabases)
# Check your specific panel's capture region size
PANEL_SIZES_MB = {
    'FoundationOne CDx': 0.8,
    'MSK-IMPACT': 1.14,
    'TruSight Oncology 500': 1.94,
    'Oncomine Comprehensive': 1.5,
    'WES (exome)': 30.0,  # Approximate coding region
    'WGS': 3000.0,        # Approximate
}

def calculate_tmb_panel(vcf_path, panel_name):
    '''Calculate TMB for known panel'''
    if panel_name not in PANEL_SIZES_MB:
        raise ValueError(f'Unknown panel: {panel_name}')
    return calculate_tmb(vcf_path, PANEL_SIZES_MB[panel_name])

TMB with Variant Filtering

def calculate_tmb_filtered(vcf_path, panel_size_mb, min_vaf=0.05, min_depth=100):
    '''Calculate TMB with quality filters

    Args:
        vcf_path: Path to somatic VCF
        panel_size_mb: Panel size in Mb
        min_vaf: Minimum variant allele frequency (default 5%)
        min_depth: Minimum read depth (default 100)

    Filters:
    - VAF >= 5%: Reduce false positives from sequencing errors
    - Depth >= 100: Ensure reliable variant calls
    - Exclude known polymorphisms (gnomAD AF > 1%)
    - Include only coding nonsynonymous
    '''
    vcf = VCF(vcf_path)
    mutation_count = 0

    for variant in vcf:
        # Quality filters
        depth = variant.INFO.get('DP', 0)
        vaf = get_vaf(variant)

        if depth < min_depth:
            continue
        if vaf < min_vaf:
            continue

        # Exclude germline polymorphisms
        gnomad_af = variant.INFO.get('gnomAD_AF', 0)
        if gnomad_af > 0.01:
            continue

        # Count coding nonsynonymous
        if is_coding_nonsynonymous(variant):
            mutation_count += 1

    return mutation_count / panel_size_mb

def get_vaf(variant):
    '''Extract variant allele frequency from variant'''
    # Format depends on caller (e.g., Mutect2, Strelka)
    # Mutect2 format: AD field in genotype
    try:
        ad = variant.format('AD')[0]  # First sample
        if sum(ad) > 0:
            return ad[1] / sum(ad)
    except:
        pass
    return 0

Clinical TMB Thresholds

def classify_tmb(tmb_value, threshold='FDA'):
    '''Classify TMB as high or low

    Clinical thresholds:
    - FDA (pembrolizumab): 10 mut/Mb
    - ESMO: 10 mut/Mb
    - Some studies use 16, 20 mut/Mb for specific cancers

    Note: Panel-specific thresholds may differ
    '''
    thresholds = {
        'FDA': 10,
        'conservative': 16,
        'strict': 20
    }

    cutoff = thresholds.get(threshold, 10)

    if tmb_value >= cutoff:
        return 'TMB-High'
    else:
        return 'TMB-Low'

# Example
tmb = 12.5
status = classify_tmb(tmb)
print(f'TMB: {tmb} mut/Mb -> {status}')

TMB by Variant Type

def detailed_tmb_analysis(vcf_path, panel_size_mb):
    '''Calculate TMB broken down by variant type'''
    vcf = VCF(vcf_path)

    counts = {
        'missense': 0,
        'nonsense': 0,
        'frameshift': 0,
        'inframe_indel': 0,
        'splice': 0,
        'synonymous': 0,
        'other': 0
    }

    for variant in vcf:
        vtype = classify_variant_type(variant)
        counts[vtype] = counts.get(vtype, 0) + 1

    # TMB typically excludes synonymous
    nonsynonymous_count = sum(v for k, v in counts.items()
                               if k != 'synonymous' and k != 'other')

    results = {
        'counts': counts,
        'total_nonsynonymous': nonsynonymous_count,
        'tmb': nonsynonymous_count / panel_size_mb,
        'panel_size_mb': panel_size_mb
    }
    return results

TMB vs MSI Comparison

def tmb_msi_concordance(tmb_value, msi_status):
    '''Compare TMB with MSI status

    MSI-H tumors typically have high TMB (>10-20 mut/Mb)
    TMB-H and MSI-H are correlated but not identical:
    - ~80% MSI-H are TMB-H
    - Many TMB-H are MSS (especially smoking-related)

    Both predict immunotherapy response
    '''
    tmb_high = tmb_value >= 10

    if msi_status == 'MSI-H' and tmb_high:
        return 'Concordant TMB-H/MSI-H'
    elif msi_status == 'MSI-H' and not tmb_high:
        return 'Discordant MSI-H/TMB-L (uncommon)'
    elif msi_status == 'MSS' and tmb_high:
        return 'TMB-H/MSS (e.g., smoking-related)'
    else:
        return 'TMB-L/MSS'

Batch TMB Calculation

import pandas as pd
from pathlib import Path

def batch_tmb(vcf_dir, panel_size_mb, output_file):
    '''Calculate TMB for multiple samples'''
    results = []

    for vcf_path in Path(vcf_dir).glob('*.vcf.gz'):
        sample_id = vcf_path.stem.replace('.vcf', '')
        tmb = calculate_tmb_filtered(str(vcf_path), panel_size_mb)
        status = classify_tmb(tmb)

        results.append({
            'sample': sample_id,
            'tmb': round(tmb, 2),
            'status': status
        })

    df = pd.DataFrame(results)
    df.to_csv(output_file, index=False)
    return df

Related Skills

  • variant-calling/somatic-variant-calling - Input variants
  • variant-calling/clinical-interpretation - ACMG/AMP classification
  • variant-calling/variant-annotation - VEP/SnpEff annotation
<!-- AUTHOR_SIGNATURE: 9a7f3c2e-MD-BABU-MIA-2026-MSSM-SECURE -->