LLMs-Universal-Life-Science-and-Clinical-Skills- neoantigen-prediction

<!--

install
source · Clone the upstream repo
git clone https://github.com/mdbabumiamssm/LLMs-Universal-Life-Science-and-Clinical-Skills-
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/mdbabumiamssm/LLMs-Universal-Life-Science-and-Clinical-Skills- "$T" && mkdir -p ~/.claude/skills && cp -r "$T/Skills/Immunology_Vaccines/bioSkills/neoantigen-prediction" ~/.claude/skills/mdbabumiamssm-llms-universal-life-science-and-clinical-skills-neoantigen-predict && rm -rf "$T"
manifest: Skills/Immunology_Vaccines/bioSkills/neoantigen-prediction/SKILL.md
source content
<!-- # COPYRIGHT NOTICE # This file is part of the "Universal Biomedical Skills" project. # Copyright (c) 2026 MD BABU MIA, PhD <md.babu.mia@mssm.edu> # All Rights Reserved. # # This code is proprietary and confidential. # Unauthorized copying of this file, via any medium is strictly prohibited. # # Provenance: Authenticated by MD BABU MIA -->

name: bio-immunoinformatics-neoantigen-prediction
description: Identify tumor neoantigens from somatic mutations using pVACtools for personalized cancer immunotherapy. Predict mutant peptides that bind patient HLA and may elicit T-cell responses. Use when identifying vaccine targets or checkpoint inhibitor response biomarkers from tumor sequencing data.
tool_type: python
primary_tool: pVACtools
measurable_outcome: Execute skill workflow successfully with valid output within 15 minutes.
allowed-tools:

  • read_file
  • run_shell_command

Neoantigen Prediction

pVACtools Pipeline

# Install pVACtools from PyPI into the currently active Python environment
pip install pvactools

# Or use conda for dependencies: create a dedicated Python 3.8 environment
# named "pvactools", activate it, then install pVACtools inside it
conda create -n pvactools python=3.8
conda activate pvactools
pip install pvactools

# Download IEDB tools
# NOTE(review): confirm that `download_iedb_tools` is an available pvactools
# subcommand in the installed release before relying on this step.
pvactools download_iedb_tools

pVACseq Workflow

# Run pVACseq on annotated VCF
# Positional arguments, in the order given below: input VCF, sample name,
# comma-separated HLA alleles, prediction algorithm names, output directory.
# The options after them select epitope lengths and the IEDB install path.
pvacseq run \
    annotated.vcf \
    sample_name \
    "HLA-A*02:01,HLA-A*24:02,HLA-B*07:02,HLA-B*44:02" \
    MHCflurry MHCnuggetsI \
    output_dir \
    -e1 8,9,10,11 \
    --iedb-install-directory /path/to/iedb

# Key parameters:
# -e1: Epitope lengths for MHC-I (8-11)
# -e2: Epitope lengths for MHC-II (15)
# --binding-threshold: IC50 cutoff (default 500)
# --percentile-threshold: Alternative cutoff

VCF Annotation Requirements

# pVACseq requires VEP-annotated VCF
# Must include transcript and amino acid changes

# Run VEP first: reads somatic.vcf and writes annotated.vcf in VCF format,
# which is the file name passed to `pvacseq run`.
# NOTE(review): the set of VEP plugins pVACseq expects may differ between
# pVACtools releases - confirm Downstream/Wildtype against the installed
# version's documentation.
vep -i somatic.vcf -o annotated.vcf \
    --cache --offline \
    --format vcf --vcf \
    --plugin Downstream \
    --plugin Wildtype \
    --terms SO \
    --symbol

Parse pVACseq Results

import pandas as pd

def parse_pvacseq_results(results_file, binding_threshold=500,
                          score_column='Median MT Score'):
    '''Load a pVACseq results table and keep only the strong binders.

    Key columns:
    - Mutation: Gene and amino acid change
    - HLA Allele: Patient HLA presenting this peptide
    - MT Epitope Seq: Mutant peptide sequence
    - WT Epitope Seq: Wild-type peptide sequence
    - Median MT Score: Binding affinity (nM)
    - Median WT Score: WT binding (for agretopicity)
    - Tumor DNA VAF: Variant allele frequency
    - Gene Expression: If RNA-seq available

    Args:
        results_file: Path or file-like object holding the tab-separated
            pVACseq output.
        binding_threshold: Rows are kept when their score is strictly
            below this value (default 500, matching the previous
            hard-coded cutoff).
        score_column: Name of the column holding the mutant binding
            score. NOTE(review): some pVACtools releases may name this
            column differently (e.g. 'Median MT IC50 Score') - confirm
            against the header of the file being parsed.

    Returns:
        A new DataFrame with only the rows passing the cutoff. It is an
        independent copy, so callers can add columns to it safely.

    Raises:
        KeyError: If ``score_column`` is not present in the file.
    '''
    df = pd.read_csv(results_file, sep='\t')

    # Fail with an actionable message instead of pandas' bare KeyError.
    if score_column not in df.columns:
        raise KeyError(
            f"{score_column!r} column not found in pVACseq results; "
            f"available columns: {list(df.columns)}"
        )

    # Filter by binding threshold; copy so later column assignments on the
    # result do not hit pandas' chained-assignment warning.
    strong_binders = df[df[score_column] < binding_threshold].copy()

    return strong_binders


def calculate_agretopicity(df):
    '''Add agretopicity (DAI) columns to a copy of the results table.

    Agretopicity is computed here as the wild-type binding score divided
    by the mutant binding score ('Median WT Score' / 'Median MT Score').

    Interpretation of the ratio:
    - >1: Mutant binds better (favorable)
    - ~1: Similar binding (less likely immunogenic)
    - <1: WT binds better (unfavorable)

    Returns a new DataFrame with two extra columns, 'agretopicity' and
    the boolean 'dai_favorable' (ratio strictly greater than 1). The
    input frame is left untouched.
    '''
    wt_over_mt = df['Median WT Score'] / df['Median MT Score']

    # assign() works on a copy, so the caller's frame is never modified.
    return df.assign(
        agretopicity=wt_over_mt,
        dai_favorable=wt_over_mt > 1,
    )

Prioritize Neoantigens

def prioritize_neoantigens(df, vaf_threshold=0.1, expression_threshold=1.0,
                           binding_threshold=500):
    '''Prioritize neoantigens for vaccine design

    Criteria for good neoantigen candidates:
    1. Strong MHC binding (IC50 < 500nM, ideally < 50nM)
    2. High agretopicity (MT binds better than WT)
    3. High tumor VAF (clonal, present in most tumor cells)
    4. Expressed in tumor (if RNA-seq available)
    5. Not in tolerogenic region (self-similarity check)

    Only criteria 1-4 are applied by this function; the VAF and expression
    filters are skipped when their columns are absent.

    Args:
        df: Results table; must contain 'Median MT Score'. Optional
            columns: 'Tumor DNA VAF', 'Gene Expression', 'agretopicity'.
        vaf_threshold: Minimum 'Tumor DNA VAF' (inclusive).
        expression_threshold: Minimum 'Gene Expression' (inclusive).
        binding_threshold: Rows are kept when 'Median MT Score' is
            strictly below this value (default 500, the previous
            hard-coded cutoff).

    Returns:
        A new DataFrame of surviving rows with an added 'priority_score'
        column, sorted from highest to lowest score. The input frame is
        not modified.
    '''
    # Build one combined mask so the frame is sliced (and copied) once.
    keep = df['Median MT Score'] < binding_threshold

    # Filter by VAF (clonal mutations preferred)
    if 'Tumor DNA VAF' in df.columns:
        keep = keep & (df['Tumor DNA VAF'] >= vaf_threshold)

    # Filter by expression
    if 'Gene Expression' in df.columns:
        keep = keep & (df['Gene Expression'] >= expression_threshold)

    # Explicit copy: assigning a column to a filtered slice would otherwise
    # trigger pandas' SettingWithCopyWarning.
    candidates = df[keep].copy()

    # Priority score: lower binding score, higher VAF and higher
    # agretopicity all push the score up. Missing optional columns
    # contribute a neutral factor of 1.
    candidates['priority_score'] = (
        (1 / candidates['Median MT Score'])
        * candidates.get('Tumor DNA VAF', 1)
        * candidates.get('agretopicity', 1)
    )

    return candidates.sort_values('priority_score', ascending=False)

Alternative: Manual Neoantigen Pipeline

def manual_neoantigen_pipeline(vcf_file, hla_alleles, reference_fasta,
                               peptide_extractor=None,
                               affinity_threshold=500):
    '''Simplified neoantigen prediction without pVACtools

    Steps:
    1. Extract coding mutations from VCF
    2. Generate mutant protein sequences
    3. Extract peptides around mutation
    4. Predict MHC binding

    Steps 1-3 depend on the annotation format, so they are delegated to
    ``peptide_extractor``. Previously the loop referenced an undefined
    ``mutant_peptides`` name and failed with NameError on the first
    annotated variant.

    Args:
        vcf_file: Path to the annotated VCF, opened with cyvcf2.
        hla_alleles: Iterable of HLA allele names; each peptide is scored
            against every allele separately.
        reference_fasta: Not read by this function; kept for interface
            compatibility.
        peptide_extractor: Callable ``(variant, csq) -> iterable of str``
            returning the mutant peptides for one variant, where ``csq``
            is the raw value of the variant's CSQ INFO field.
        affinity_threshold: Peptide/allele pairs are reported when the
            predicted affinity is strictly below this value (default 500).

    Returns:
        List of dicts with keys 'variant', 'peptide', 'allele', 'affinity'.

    Raises:
        NotImplementedError: If an annotated variant is encountered and no
            ``peptide_extractor`` was supplied.
    '''
    from cyvcf2 import VCF
    from mhcflurry import Class1PresentationPredictor

    vcf = VCF(vcf_file)
    predictor = Class1PresentationPredictor.load()

    neoantigens = []

    for variant in vcf:
        # Use INFO.get rather than `in`: a membership test on the INFO
        # object is not a reliable way to check for a key.
        csq = variant.INFO.get('CSQ')
        if csq is None:
            continue

        if peptide_extractor is None:
            raise NotImplementedError(
                'manual_neoantigen_pipeline needs a peptide_extractor '
                'callable to turn CSQ annotations into mutant peptides'
            )

        mutant_peptides = peptide_extractor(variant, csq)

        # For each mutant peptide, predict binding
        for peptide in mutant_peptides:
            for allele in hla_alleles:
                pred = predictor.predict(peptides=[peptide], alleles=[allele])
                # The Python API is expected to name the column 'affinity';
                # 'mhcflurry_affinity' is kept as a fallback for the
                # CLI-style prefixed name the original code used.
                column = ('affinity' if 'affinity' in pred.columns
                          else 'mhcflurry_affinity')
                affinity = pred[column].values[0]
                if affinity < affinity_threshold:
                    neoantigens.append({
                        'variant': f'{variant.CHROM}:{variant.POS}',
                        'peptide': peptide,
                        'allele': allele,
                        'affinity': affinity
                    })

    return neoantigens

Neoantigen Quality Metrics

def assess_neoantigen_quality(neoantigen):
    '''Assess multiple quality metrics for neoantigen

    Computes four sub-scores, each clamped to the range 0-1, and combines
    them into a weighted composite:
    - Binding affinity ('Median MT Score', default 500)
    - Agretopicity ('agretopicity', default 1)
    - Clonality ('Tumor DNA VAF', default 0.5)
    - Expression ('Gene Expression', default 1)

    Self-similarity is not scored by this function.

    Args:
        neoantigen: Mapping-like object with a ``.get(key, default)``
            method (e.g. a dict or a pandas row). Missing keys, ``None``
            and NaN values all fall back to the defaults listed above.

    Returns:
        Tuple ``(composite, scores)`` where ``scores`` is a dict with keys
        'binding', 'agretopicity', 'clonality' and 'expression', and
        ``composite`` is their weighted sum (weights 0.3/0.3/0.2/0.2).
    '''
    import math

    def _value(key, default):
        # A present-but-empty cell (None / NaN) would bypass .get()'s
        # default and poison the composite, so treat it as missing.
        raw = neoantigen.get(key, default)
        if raw is None or (isinstance(raw, float) and math.isnan(raw)):
            return default
        return raw

    def _clamp(x):
        # Keep every sub-score inside the documented 0-1 range.
        return max(0.0, min(x, 1.0))

    scores = {}

    # Binding (0-1, lower IC50 = higher score; 5000 or more scores 0)
    ic50 = _value('Median MT Score', 500)
    scores['binding'] = _clamp(1 - ic50 / 5000)

    # Agretopicity (0-1, saturates at a ratio of 10)
    dai = _value('agretopicity', 1)
    scores['agretopicity'] = _clamp(dai / 10)

    # Clonality (0-1)
    vaf = _value('Tumor DNA VAF', 0.5)
    scores['clonality'] = _clamp(vaf)

    # Expression (0-1, log scale, saturates at 999); negative values are
    # floored at 0 so log10 never sees a non-positive argument.
    expr = _value('Gene Expression', 1)
    scores['expression'] = _clamp(math.log10(max(expr, 0) + 1) / 3)

    # Composite score
    weights = {'binding': 0.3, 'agretopicity': 0.3, 'clonality': 0.2, 'expression': 0.2}
    composite = sum(scores[k] * weights[k] for k in weights)

    return composite, scores

Related Skills

  • immunoinformatics/mhc-binding-prediction - MHC binding details
  • immunoinformatics/immunogenicity-scoring - Prioritization
  • variant-calling/variant-calling - Input somatic mutations
<!-- AUTHOR_SIGNATURE: 9a7f3c2e-MD-BABU-MIA-2026-MSSM-SECURE -->