LLMs-Universal-Life-Science-and-Clinical-Skills- perturb-seq

<!--

install
source · Clone the upstream repo
git clone https://github.com/mdbabumiamssm/LLMs-Universal-Life-Science-and-Clinical-Skills-
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/mdbabumiamssm/LLMs-Universal-Life-Science-and-Clinical-Skills- "$T" && mkdir -p ~/.claude/skills && cp -r "$T/Skills/Genomics/Single_Cell/perturb-seq" ~/.claude/skills/mdbabumiamssm-llms-universal-life-science-and-clinical-skills-perturb-seq-1de5ae && rm -rf "$T"
manifest: Skills/Genomics/Single_Cell/perturb-seq/SKILL.md
source content
<!-- # COPYRIGHT NOTICE # This file is part of the "Universal Biomedical Skills" project. # Copyright (c) 2026 MD BABU MIA, PhD <md.babu.mia@mssm.edu> # All Rights Reserved. # # This code is proprietary and confidential. # Unauthorized copying of this file, via any medium is strictly prohibited. # # Provenance: Authenticated by MD BABU MIA -->

name: bio-single-cell-perturb-seq description: Analyze Perturb-seq and CROP-seq CRISPR screening data integrated with scRNA-seq. Use when identifying gene function through pooled genetic perturbations in single cells. tool_type: python primary_tool: Pertpy measurable_outcome: Execute skill workflow successfully with valid output within 15 minutes. allowed-tools:

  • read_file
  • run_shell_command

Perturb-seq Analysis

Load and Annotate Perturbations

import scanpy as sc
import pertpy as pt

adata = sc.read_h5ad('perturb_seq.h5ad')

# Guide assignments typically stored in obs
# Format: cell barcode -> guide identity -> target gene
adata.obs['guide_id'] = guide_assignments['guide_id']
adata.obs['target_gene'] = guide_assignments['target_gene']

# Mark non-targeting controls
adata.obs['is_control'] = adata.obs['target_gene'] == 'non-targeting'

Pertpy Analysis

# Initialize perturbation analysis
ps = pt.tl.PerturbationSpace(adata)

# Differential expression per perturbation vs control
de = pt.tl.PseudobulkDE(adata)
de.fit(
    groupby='target_gene',
    control='non-targeting',
    n_threads=8
)
results = de.results()

# Filter significant genes
sig_results = results[results['pval_adj'] < 0.05]

# Perturbation signatures (effect sizes)
ps = pt.tl.PerturbationSignature(adata)
ps.compute(groupby='target_gene', control='non-targeting')

# Get signature matrix
signatures = ps.get_signature_matrix()

Perturbation Embedding

# Compute perturbation-level embeddings
pt.tl.perturbation_embedding(adata, groupby='target_gene', method='mean')

# Cluster perturbations by phenotype
pt.tl.cluster_perturbations(adata, resolution=0.5)

# Find functionally related perturbations
pt.pl.perturbation_heatmap(adata, groupby='perturbation_cluster')

Mixscape (Seurat v5)

library(Seurat)
library(SeuratObject)

# Load Perturb-seq data
seurat <- Read10X('filtered_feature_bc_matrix/')
seurat <- CreateSeuratObject(seurat)

# Add perturbation metadata
seurat <- AddMetaData(seurat, metadata = perturbation_calls)

# Standard preprocessing
seurat <- NormalizeData(seurat)
seurat <- FindVariableFeatures(seurat)
seurat <- ScaleData(seurat)
seurat <- RunPCA(seurat)
seurat <- RunUMAP(seurat, dims = 1:30)

# Mixscape: Classify perturbed vs non-perturbed cells
seurat <- CalcPerturbSig(
    seurat,
    assay = 'RNA',
    slot = 'data',
    new.assay.name = 'PRTB',
    gd.class = 'gene',
    nt.cell.class = 'NT',
    num.neighbors = 20,
    reduction = 'pca',
    ndims = 15
)

# Run Mixscape classification
seurat <- RunMixscape(
    seurat,
    assay = 'PRTB',
    slot = 'scale.data',
    labels = 'gene',
    nt.class.name = 'NT',
    min.de.genes = 5,
    iter.num = 10,
    de.assay = 'RNA',
    prtb.type = 'KO'
)

# View classification results
table(seurat$mixscape_class.global)

Mixscape Visualization

# UMAP colored by perturbation
DimPlot(seurat, reduction = 'umap', group.by = 'mixscape_class', label = TRUE)

# Perturbation score distribution
VlnPlot(seurat, features = 'mixscape_class_p_ko', group.by = 'gene')

# DE genes for each perturbation
MixscapeHeatmap(seurat, ident.1 = 'TP53', ident.2 = 'NT', balanced = TRUE)

# LDA projection
seurat <- MixscapeLDA(seurat, labels = 'gene', nt.class.name = 'NT')
LDAPlot(seurat)

Guide Assignment from CRISPR Feature Barcode

import pandas as pd

# From Cell Ranger output (CRISPR Guide Capture)
guides = pd.read_csv('crispr_analysis/protospacer_calls_per_cell.csv')

# Clean up guide calls
guides['cell_barcode'] = guides['cell_barcode'].str.replace('-1', '')
guides = guides[guides['num_features'] == 1]  # Single guide per cell

# Merge with expression data
adata.obs = adata.obs.merge(
    guides[['cell_barcode', 'feature_call', 'target_gene']],
    left_index=True,
    right_on='cell_barcode',
    how='left'
)

Guide Quality Control

# Check guide representation
guide_counts = adata.obs['target_gene'].value_counts()
print(f'Guides per target: {guide_counts.mean():.1f}')
print(f'Cells per guide: {adata.obs.groupby("guide_id").size().mean():.1f}')

# Filter low-representation guides
# Standard: keep guides with >= 100 cells
min_cells = 100
valid_guides = guide_counts[guide_counts >= min_cells].index
adata = adata[adata.obs['target_gene'].isin(valid_guides)]

# Check for guide bias
sc.pl.violin(adata, keys='n_genes_by_counts', groupby='target_gene', rotation=90)

Multi-Guide Analysis

# Cells with multiple guides (MOI > 1)
multi_guide = adata.obs[adata.obs['num_guides'] > 1]
print(f'Multi-guide cells: {len(multi_guide) / len(adata):.1%}')

# Options:
# 1. Remove multi-guide cells
adata = adata[adata.obs['num_guides'] == 1]

# 2. Keep only cells where guides target same gene
# 3. Analyze combinatorial effects

Pseudobulk Differential Expression

# Aggregate to pseudobulk for robust DE
from pertpy.tools import PseudobulkDE

pb = PseudobulkDE(adata)
pb.fit(
    groupby='target_gene',
    control='non-targeting',
    method='deseq2',  # or 'edger', 'wilcoxon'
    min_cells=50
)

# Get results for specific perturbation
tp53_de = pb.results('TP53')
sig_genes = tp53_de[tp53_de['padj'] < 0.05].sort_values('log2FoldChange')

Pathway Enrichment

import decoupler as dc

# Get DE genes per perturbation
de_results = pb.results()

# Run pathway enrichment
dc.run_ora(
    mat=de_results,
    net=dc.get_resource('MSigDB'),
    source='geneset',
    target='gene'
)

# Visualize top pathways
dc.plot_barplot(de_results, 'TP53', top_n=20)

Screen QC Metrics

MetricGoodAcceptablePoor
Cells per guide>200100-200<100
Guide detection rate>90%80-90%<80%
Non-targeting cells5-15%15-25%>25%
Mixscape KO fraction>50%30-50%<30%

Related Skills

  • single-cell/preprocessing - scRNA-seq preprocessing
  • single-cell/markers-annotation - DE and marker gene concepts
  • single-cell/batch-integration - Multi-sample integration
  • crispr-screens/mageck-analysis - Bulk screen analysis
<!-- AUTHOR_SIGNATURE: 9a7f3c2e-MD-BABU-MIA-2026-MSSM-SECURE -->