LLMs-Universal-Life-Science-and-Clinical-Skills- atac-qc

<!--

install

source · Clone the upstream repo

git clone https://github.com/mdbabumiamssm/LLMs-Universal-Life-Science-and-Clinical-Skills-

Claude Code · Install into ~/.claude/skills/

T=$(mktemp -d) && git clone --depth=1 https://github.com/mdbabumiamssm/LLMs-Universal-Life-Science-and-Clinical-Skills- "$T" && mkdir -p ~/.claude/skills && cp -r "$T/Skills/Epigenomics/atac-seq/atac-qc" ~/.claude/skills/mdbabumiamssm-llms-universal-life-science-and-clinical-skills-atac-qc && rm -rf "$T"

manifest: Skills/Epigenomics/atac-seq/atac-qc/SKILL.md

source content

name: bio-atac-seq-atac-qc description: Quality control metrics for ATAC-seq data including fragment size distribution, TSS enrichment, FRiP, and library complexity. Use when assessing ATAC-seq library quality before or after peak calling to identify problematic samples. tool_type: mixed primary_tool: deeptools measurable_outcome: Execute skill workflow successfully with valid output within 15 minutes. allowed-tools:

read_file
run_shell_command

ATAC-seq Quality Control

Fragment Size Distribution

# Using Picard
java -jar picard.jar CollectInsertSizeMetrics \
    I=sample.bam \
    O=insert_sizes.txt \
    H=insert_sizes.pdf \
    M=0.5

# Using samtools
samtools view -f 66 sample.bam | \
    awk '{print sqrt($9^2)}' | \
    sort | uniq -c | \
    awk '{print $2"\t"$1}' > fragment_sizes.txt

TSS Enrichment Score

# Using deepTools
# 1. Create TSS BED file (from GTF)
awk '$3=="transcript" {print $1"\t"$4-1"\t"$4"\t"$14"\t"0"\t"$7}' genes.gtf | \
    tr -d '";' | sort -k1,1 -k2,2n > tss.bed

# 2. Compute matrix around TSS
computeMatrix reference-point \
    -S sample.bw \
    -R tss.bed \
    -a 2000 -b 2000 \
    -o tss_matrix.gz

# 3. Plot TSS enrichment
plotProfile -m tss_matrix.gz \
    -o tss_enrichment.png \
    --perGroup

Calculate TSS Enrichment Score

import numpy as np
import pyBigWig

def calculate_tss_enrichment(bigwig_file, tss_bed, flank=2000):
    '''Calculate TSS enrichment score.'''
    bw = pyBigWig.open(bigwig_file)

    signals = []
    for line in open(tss_bed):
        fields = line.strip().split('\t')
        chrom, tss = fields[0], int(fields[1])
        strand = fields[5] if len(fields) > 5 else '+'

        try:
            vals = bw.values(chrom, max(0, tss - flank), tss + flank)
            if strand == '-':
                vals = vals[::-1]
            signals.append(vals)
        except:
            continue

    avg_signal = np.nanmean(signals, axis=0)

    # TSS enrichment = signal at TSS / background
    background = np.nanmean([avg_signal[:100], avg_signal[-100:]])
    tss_signal = np.nanmean(avg_signal[flank-50:flank+50])

    enrichment = tss_signal / background if background > 0 else 0

    return enrichment, avg_signal

enrichment, signal = calculate_tss_enrichment('sample.bw', 'tss.bed')
print(f'TSS Enrichment Score: {enrichment:.2f}')

FRiP (Fraction of Reads in Peaks)

# Total reads
total=$(samtools view -c -F 4 sample.bam)

# Reads in peaks
in_peaks=$(bedtools intersect -a sample.bam -b peaks.narrowPeak -u | \
    samtools view -c)

# FRiP
frip=$(echo "scale=4; $in_peaks / $total" | bc)
echo "FRiP: $frip"

# Good FRiP for ATAC-seq: >0.2 (20%)

Mitochondrial Read Fraction

# Mitochondrial reads
mt_reads=$(samtools view -c sample.bam chrM)
total_reads=$(samtools view -c sample.bam)

mt_frac=$(echo "scale=4; $mt_reads / $total_reads" | bc)
echo "Mitochondrial fraction: $mt_frac"

# Ideal: <20%, concerning: >50%

Library Complexity (NRF, PBC1, PBC2)

# Using Picard EstimateLibraryComplexity
java -jar picard.jar EstimateLibraryComplexity \
    I=sample.bam \
    O=complexity.txt

# Or calculate from BAM
# NRF = unique reads / total reads
# PBC1 = locations with exactly 1 read / locations with >= 1 read
# PBC2 = locations with exactly 1 read / locations with exactly 2 reads

import pysam

def calculate_complexity(bam_file):
    '''Calculate library complexity metrics.'''
    bam = pysam.AlignmentFile(bam_file, 'rb')

    positions = {}
    total = 0
    for read in bam.fetch():
        if read.is_unmapped or read.is_secondary:
            continue
        total += 1
        pos = (read.reference_name, read.reference_start)
        positions[pos] = positions.get(pos, 0) + 1

    distinct = len(positions)
    m1 = sum(1 for v in positions.values() if v == 1)
    m2 = sum(1 for v in positions.values() if v == 2)

    nrf = distinct / total if total > 0 else 0
    pbc1 = m1 / distinct if distinct > 0 else 0
    pbc2 = m1 / m2 if m2 > 0 else 0

    return {'NRF': nrf, 'PBC1': pbc1, 'PBC2': pbc2}

deepTools QC

# Fingerprint plot (assesses enrichment)
plotFingerprint \
    -b sample.bam \
    --labels sample \
    -o fingerprint.png

# Correlation between replicates
multiBamSummary bins \
    -b sample1.bam sample2.bam \
    -o results.npz

plotCorrelation \
    -in results.npz \
    --corMethod pearson \
    --whatToPlot heatmap \
    -o correlation.png

ATACseqQC (R)

library(ATACseqQC)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)

# Read BAM
bamfile <- 'sample.bam'

# Fragment size distribution
fragSizeDist(bamfile, 'fragment_size.pdf')

# TSS enrichment
tsse <- TSSEscore(bamfile, TxDb.Hsapiens.UCSC.hg38.knownGene)
print(paste('TSS Enrichment:', round(tsse$TSSEscore, 2)))

# Nucleosome positioning
nucs <- nucleosomePositioningScore(bamfile, TxDb.Hsapiens.UCSC.hg38.knownGene)

Comprehensive QC Report

import subprocess
import pandas as pd

def atac_qc_report(bam_file, peaks_file, output_prefix):
    '''Generate comprehensive ATAC-seq QC report.'''
    metrics = {}

    # Total reads
    result = subprocess.check_output(f'samtools view -c -F 4 {bam_file}', shell=True)
    metrics['total_reads'] = int(result.strip())

    # Mapped reads
    result = subprocess.check_output(f'samtools view -c -F 4 -F 256 {bam_file}', shell=True)
    metrics['mapped_reads'] = int(result.strip())

    # Mitochondrial reads
    result = subprocess.check_output(f'samtools view -c {bam_file} chrM', shell=True)
    metrics['mt_reads'] = int(result.strip())
    metrics['mt_fraction'] = metrics['mt_reads'] / metrics['total_reads']

    # Reads in peaks (FRiP)
    result = subprocess.check_output(
        f'bedtools intersect -a {bam_file} -b {peaks_file} -u | samtools view -c', shell=True)
    metrics['reads_in_peaks'] = int(result.strip())
    metrics['frip'] = metrics['reads_in_peaks'] / metrics['total_reads']

    # Peak count
    result = subprocess.check_output(f'wc -l < {peaks_file}', shell=True)
    metrics['peak_count'] = int(result.strip())

    # Write report
    with open(f'{output_prefix}_qc.txt', 'w') as f:
        for k, v in metrics.items():
            if isinstance(v, float):
                f.write(f'{k}: {v:.4f}\n')
            else:
                f.write(f'{k}: {v}\n')

    return metrics

QC Thresholds

Metric	Good	Acceptable	Poor
TSS Enrichment	>10	5-10	<5
FRiP	>0.3	0.1-0.3	<0.1
MT Fraction	<0.1	0.1-0.3	>0.3
NRF	>0.9	0.8-0.9	<0.8
PBC1	>0.9	0.7-0.9	<0.7

Related Skills

atac-seq/atac-peak-calling - Peak calling
alignment-files/bam-statistics - Alignment QC
chip-seq/chipseq-visualization - Visualization approaches