install
source · Clone the upstream repo
git clone https://github.com/mdbabumiamssm/LLMs-Universal-Life-Science-and-Clinical-Skills-
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/mdbabumiamssm/LLMs-Universal-Life-Science-and-Clinical-Skills- "$T" && mkdir -p ~/.claude/skills && cp -r "$T/Skills/Proteomics/bioSkills/proteomics-qc" ~/.claude/skills/mdbabumiamssm-llms-universal-life-science-and-clinical-skills-proteomics-qc && rm -rf "$T"
manifest:
Skills/Proteomics/bioSkills/proteomics-qc/SKILL.mdsource content
<!--
# COPYRIGHT NOTICE
# This file is part of the "Universal Biomedical Skills" project.
# Copyright (c) 2026 MD BABU MIA, PhD <md.babu.mia@mssm.edu>
# All Rights Reserved.
#
# This code is proprietary and confidential.
# Unauthorized copying of this file, via any medium is strictly prohibited.
#
# Provenance: Authenticated by MD BABU MIA
-->
name: bio-proteomics-proteomics-qc description: Quality control and assessment for proteomics data. Use when evaluating proteomics data quality before downstream analysis. Covers sample metrics, missing value patterns, replicate correlation, batch effects, and intensity distributions. tool_type: mixed primary_tool: pandas measurable_outcome: Execute skill workflow successfully with valid output within 15 minutes. allowed-tools:
- read_file
- run_shell_command
Proteomics Quality Control
Sample Quality Metrics
import pandas as pd import numpy as np def sample_qc_metrics(intensity_matrix): '''Calculate per-sample QC metrics''' metrics = pd.DataFrame(index=intensity_matrix.columns) metrics['n_proteins'] = intensity_matrix.notna().sum() metrics['median_intensity'] = intensity_matrix.median() metrics['mean_intensity'] = intensity_matrix.mean() metrics['cv'] = intensity_matrix.std() / intensity_matrix.mean() metrics['missing_pct'] = 100 * intensity_matrix.isna().sum() / len(intensity_matrix) return metrics qc = sample_qc_metrics(log2_intensities) print(qc)
Replicate Correlation
import seaborn as sns import matplotlib.pyplot as plt from scipy.stats import pearsonr def replicate_correlation(intensity_matrix, sample_groups): '''Calculate within-group correlations''' corr_matrix = intensity_matrix.corr(method='pearson') # Mask for within-group comparisons results = [] for group in sample_groups.unique(): group_samples = sample_groups[sample_groups == group].index for i, s1 in enumerate(group_samples): for s2 in group_samples[i+1:]: r = corr_matrix.loc[s1, s2] results.append({'group': group, 'sample1': s1, 'sample2': s2, 'correlation': r}) return pd.DataFrame(results) # Heatmap sns.clustermap(intensity_matrix.corr(), cmap='RdBu_r', center=0, vmin=-1, vmax=1, figsize=(10, 10), annot=False) plt.savefig('correlation_heatmap.pdf')
Missing Value Patterns
import missingno as msno def analyze_missing_patterns(intensity_matrix): '''Analyze missing value patterns''' # Missing value matrix visualization msno.matrix(intensity_matrix, figsize=(12, 8)) plt.savefig('missing_pattern.pdf') # Missing by sample missing_per_sample = intensity_matrix.isna().sum() / len(intensity_matrix) * 100 # Missing by protein missing_per_protein = intensity_matrix.isna().sum(axis=1) / intensity_matrix.shape[1] * 100 # Check for systematic patterns return {'per_sample': missing_per_sample, 'per_protein': missing_per_protein}
Batch Effect Detection with PCA
from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA def detect_batch_effects(intensity_matrix, sample_info, batch_col='batch'): '''PCA to detect batch effects''' # Impute for PCA (temporary) imputed = intensity_matrix.fillna(intensity_matrix.median()) scaled = StandardScaler().fit_transform(imputed.T) pca = PCA(n_components=5) pcs = pca.fit_transform(scaled) pc_df = pd.DataFrame(pcs, columns=[f'PC{i+1}' for i in range(5)], index=intensity_matrix.columns) pc_df = pc_df.join(sample_info) # Check batch association with PCs from scipy.stats import f_oneway for pc in ['PC1', 'PC2', 'PC3']: groups = [pc_df[pc_df[batch_col] == b][pc] for b in pc_df[batch_col].unique()] stat, pval = f_oneway(*groups) print(f'{pc} ~ {batch_col}: F={stat:.2f}, p={pval:.4f}') return pc_df, pca.explained_variance_ratio_
R: QC with limma
library(limma) library(ggplot2) # Intensity distribution plotDensities(protein_matrix, legend = FALSE, main = 'Intensity Distributions') # MA plots between samples for (i in 2:ncol(protein_matrix)) { plotMA(protein_matrix[, c(1, i)], main = paste('MA:', colnames(protein_matrix)[i])) } # MDS plot (similar to PCA) plotMDS(protein_matrix, col = as.numeric(sample_info$condition))
Coefficient of Variation
def calculate_cv(intensity_matrix, sample_groups): '''Calculate CV within groups''' cv_results = [] for group in sample_groups.unique(): group_samples = sample_groups[sample_groups == group].index group_data = intensity_matrix[group_samples] # CV per protein cv = group_data.std(axis=1) / group_data.mean(axis=1) * 100 cv_results.append({'group': group, 'median_cv': cv.median(), 'mean_cv': cv.mean()}) return pd.DataFrame(cv_results) # Technical replicates should have CV < 20% # Biological replicates typically 20-40%
Digestion Efficiency
def check_digestion(evidence_df): '''Check digestion efficiency from MaxQuant evidence.txt''' # Missed cleavages distribution mc_dist = evidence_df['Missed cleavages'].value_counts(normalize=True) * 100 print('Missed cleavage distribution:') print(mc_dist) # Good digestion: >80% with 0 missed cleavages if mc_dist.get(0, 0) < 80: print('Warning: Poor digestion efficiency (<80% fully cleaved)') return mc_dist
QC Report Summary
def generate_qc_report(intensity_matrix, sample_info): '''Generate comprehensive QC summary''' report = { 'n_samples': intensity_matrix.shape[1], 'n_proteins': intensity_matrix.shape[0], 'median_proteins_per_sample': intensity_matrix.notna().sum().median(), 'overall_missing_pct': 100 * intensity_matrix.isna().sum().sum() / intensity_matrix.size, 'median_correlation': intensity_matrix.corr().values[np.triu_indices_from(intensity_matrix.corr(), k=1)].mean(), } # Flags report['flags'] = [] if report['overall_missing_pct'] > 30: report['flags'].append('High missing values (>30%)') if report['median_correlation'] < 0.9: report['flags'].append('Low replicate correlation (<0.9)') return report
Related Skills
- data-import - Load data before QC
- quantification - Normalization after QC
- differential-abundance - Analysis after QC passes
- data-visualization/heatmaps-clustering - QC heatmaps