install
source · Clone the upstream repo
git clone https://github.com/mdbabumiamssm/LLMs-Universal-Life-Science-and-Clinical-Skills-
Claude Code · Install into ~/.claude/skills/
# Shallow-clone the repo into a temp dir, copy the preprocessing skill into
# ~/.claude/skills/, then delete the temp dir. Each step runs only if the
# previous one succeeded.
T=$(mktemp -d) \
  && git clone --depth=1 https://github.com/mdbabumiamssm/LLMs-Universal-Life-Science-and-Clinical-Skills- "$T" \
  && mkdir -p ~/.claude/skills \
  && cp -r "$T/Skills/Genomics/Single_Cell/preprocessing" ~/.claude/skills/mdbabumiamssm-llms-universal-life-science-and-clinical-skills-preprocessing-68e943 \
  && rm -rf "$T"
manifest:
Skills/Genomics/Single_Cell/preprocessing/SKILL.md
source content
<!--
# COPYRIGHT NOTICE
# This file is part of the "Universal Biomedical Skills" project.
# Copyright (c) 2026 MD BABU MIA, PhD <md.babu.mia@mssm.edu>
# All Rights Reserved.
#
# This code is proprietary and confidential.
# Unauthorized copying of this file, via any medium is strictly prohibited.
#
# Provenance: Authenticated by MD BABU MIA
-->
name: bio-single-cell-preprocessing
description: Quality control, filtering, and normalization for single-cell RNA-seq using Seurat (R) and Scanpy (Python). Use for calculating QC metrics, filtering cells and genes, normalizing counts, identifying highly variable genes, and scaling data. Use when filtering, normalizing, and selecting features in single-cell data.
tool_type: mixed
primary_tool: Seurat
measurable_outcome: Execute skill workflow successfully with valid output within 15 minutes.
allowed-tools:
- read_file
- run_shell_command
Single-Cell Preprocessing
Quality control, filtering, normalization, and feature selection for scRNA-seq data.
Scanpy (Python)
Required Imports
import numpy as np
import scanpy as sc
Calculate QC Metrics
# Flag mitochondrial genes by the 'MT-' prefix of their names.
# NOTE(review): the match is case-sensitive -- confirm the prefix used by
# your annotation before relying on it.
mito_mask = adata.var_names.str.startswith('MT-')
adata.var['mt'] = mito_mask

# Compute QC metrics and write them onto adata in place.
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=['mt'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

# Key metrics added to adata.obs:
# - n_genes_by_counts: genes detected per cell
# - total_counts: total UMI counts per cell
# - pct_counts_mt: percentage mitochondrial
Visualize QC Metrics
import matplotlib.pyplot as plt

# Per-cell distributions of the three QC metrics, one panel each.
qc_keys = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
sc.pl.violin(adata, qc_keys, jitter=0.4, multi_panel=True)

# Total counts against mitochondrial percentage, then against genes detected.
for y_key in ('pct_counts_mt', 'n_genes_by_counts'):
    sc.pl.scatter(adata, x='total_counts', y=y_key)
Filter Cells and Genes
# Cell-level QC: keep cells with at least 200 and at most 5000 detected genes.
# filter_cells takes a single threshold per call, hence two calls.
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_cells(adata, max_genes=5000)

# Keep only cells below 20% mitochondrial counts.
low_mito = adata.obs['pct_counts_mt'] < 20
adata = adata[low_mito, :].copy()

# Gene-level QC: keep genes detected in at least 3 cells.
sc.pp.filter_genes(adata, min_cells=3)

print(f'After filtering: {adata.n_obs} cells, {adata.n_vars} genes')
Store Raw Counts
# Keep the un-normalized counts available before normalization overwrites X.

# Option 1: snapshot the current object on .raw
adata.raw = adata.copy()

# Option 2: keep a copy of the count matrix in a named layer
adata.layers['counts'] = adata.X.copy()
Normalization
# Library-size normalization: rescale every cell to 10,000 total counts.
sc.pp.normalize_total(adata, target_sum=1e4)

# Log-transform the normalized values (log1p).
sc.pp.log1p(adata)
Highly Variable Genes
# Select the top 2000 highly variable genes.
# The 'seurat_v3' flavor works on count data, so the call reads the 'counts'
# layer (which must already exist on adata) rather than the normalized X.
hvg_params = dict(n_top_genes=2000, flavor='seurat_v3', layer='counts')
sc.pp.highly_variable_genes(adata, **hvg_params)

# Visualize the selection.
sc.pl.highly_variable_genes(adata)

# Report how many genes were flagged.
print(f'Highly variable genes: {adata.var.highly_variable.sum()}')
Subset to HVGs (Optional)
# Build a separate object restricted to the highly variable genes;
# the full adata is left untouched.
hvg_mask = adata.var.highly_variable
adata_hvg = adata[:, hvg_mask].copy()
Scaling (Z-score)
# Z-score each gene (zero mean, unit variance); scaled values are clipped
# at max_value=10.
sc.pp.scale(adata, max_value=10)
Regress Out Confounders
# Regress out unwanted sources of variation -- here the total counts per cell
# and the mitochondrial percentage.
# NOTE(review): the Scanpy legacy tutorial applies regress_out to the
# log-normalized data and calls sc.pp.scale afterwards; confirm the order
# used in your workflow.
confounders = ['total_counts', 'pct_counts_mt']
sc.pp.regress_out(adata, confounders)
Complete Preprocessing Pipeline
import scanpy as sc

# Load a 10x Genomics filtered feature-barcode matrix directory.
adata = sc.read_10x_mtx('filtered_feature_bc_matrix/')

# QC: flag mitochondrial genes and compute per-cell metrics in place.
adata.var['mt'] = adata.var_names.str.startswith('MT-')
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], inplace=True)

# Filter cells first, then genes, using the same thresholds and order as the
# step-by-step "Filter Cells and Genes" section: 200-5000 detected genes and
# < 20% mitochondrial counts per cell, then genes seen in at least 3 of the
# remaining cells. Filtering genes last keeps the min_cells guarantee valid
# after low-quality cells have been removed.
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_cells(adata, max_genes=5000)
adata = adata[adata.obs['pct_counts_mt'] < 20, :].copy()
sc.pp.filter_genes(adata, min_cells=3)

# Store the un-normalized object before X is modified.
adata.raw = adata.copy()

# Normalize to 10,000 counts per cell, then log-transform.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Select the top 2000 highly variable genes.
sc.pp.highly_variable_genes(adata, n_top_genes=2000)

# Subset to the highly variable genes, then z-score (clipped at 10).
adata = adata[:, adata.var.highly_variable].copy()
sc.pp.scale(adata, max_value=10)
Seurat (R)
Required Libraries
library(Seurat)
library(ggplot2)
Calculate QC Metrics
# Add a per-cell 'percent.mt' column computed from features whose names
# match the pattern ^MT-.
seurat_obj[["percent.mt"]] <- PercentageFeatureSet(
  seurat_obj,
  pattern = "^MT-"
)

# Show the first rows of the per-cell metadata.
head(seurat_obj@meta.data)
Visualize QC Metrics
# Violin plots of the three QC metrics, side by side.
qc_features <- c("nFeature_RNA", "nCount_RNA", "percent.mt")
VlnPlot(seurat_obj, features = qc_features, ncol = 3)

# Scatter plots of total counts against mitochondrial percentage and against
# the number of detected features, shown together.
mt_vs_counts <- FeatureScatter(
  seurat_obj,
  feature1 = "nCount_RNA",
  feature2 = "percent.mt"
)
genes_vs_counts <- FeatureScatter(
  seurat_obj,
  feature1 = "nCount_RNA",
  feature2 = "nFeature_RNA"
)
mt_vs_counts + genes_vs_counts
Filter Cells
# Keep cells with more than 200 and fewer than 5000 detected features and
# less than 20% mitochondrial counts.
seurat_obj <- subset(
  seurat_obj,
  subset = nFeature_RNA > 200 & nFeature_RNA < 5000 & percent.mt < 20
)

cat("After filtering:", ncol(seurat_obj), "cells\n")
Normalization (Log Normalization)
# Standard log normalization with a scale factor of 10,000.
seurat_obj <- NormalizeData(
  seurat_obj,
  normalization.method = "LogNormalize",
  scale.factor = 10000
)
Normalization (SCTransform)
# SCTransform - recommended for most workflows.
# One call covers normalization, scaling, and variable-feature selection;
# the mitochondrial percentage is regressed out along the way.
seurat_obj <- SCTransform(
  seurat_obj,
  vars.to.regress = "percent.mt",
  verbose = FALSE
)
Find Variable Features
# Identify the 2000 most variable features with the 'vst' method
# (only needed when SCTransform is not used).
seurat_obj <- FindVariableFeatures(
  seurat_obj,
  selection.method = "vst",
  nfeatures = 2000
)

# Plot the variable features and label the first ten.
top10 <- head(VariableFeatures(seurat_obj), 10)
hvg_plot <- VariableFeaturePlot(seurat_obj)
hvg_plot_labeled <- LabelPoints(plot = hvg_plot, points = top10, repel = TRUE)
hvg_plot_labeled
Scaling
# Scaling is only needed when SCTransform is not used.

# Option A: scale every gene in the object.
all.genes <- rownames(seurat_obj)
seurat_obj <- ScaleData(seurat_obj, features = all.genes)

# Option B: scale only the variable features (faster).
seurat_obj <- ScaleData(seurat_obj)
Regress Out Confounders
# Remove unwanted variation as part of scaling: mitochondrial percentage and
# total counts per cell are regressed out.
confounders <- c("percent.mt", "nCount_RNA")
seurat_obj <- ScaleData(seurat_obj, vars.to.regress = confounders)
Complete Preprocessing Pipeline (Log Normalization)
library(Seurat)

# Load the 10x matrix and build the object, dropping features seen in fewer
# than 3 cells and cells with fewer than 200 features.
counts <- Read10X(data.dir = "filtered_feature_bc_matrix/")
seurat_obj <- CreateSeuratObject(
  counts = counts,
  min.cells = 3,
  min.features = 200
)

# QC: percentage of counts from features matching ^MT-.
seurat_obj[["percent.mt"]] <- PercentageFeatureSet(
  seurat_obj,
  pattern = "^MT-"
)

# Filter cells on feature count and mitochondrial percentage.
seurat_obj <- subset(
  seurat_obj,
  subset = nFeature_RNA > 200 & nFeature_RNA < 5000 & percent.mt < 20
)

# Normalize with the function defaults.
seurat_obj <- NormalizeData(seurat_obj)

# Select 2000 variable features.
seurat_obj <- FindVariableFeatures(seurat_obj, nfeatures = 2000)

# Scale with the function defaults.
seurat_obj <- ScaleData(seurat_obj)
Complete Preprocessing Pipeline (SCTransform)
library(Seurat)

# Load the 10x matrix and build the object, dropping features seen in fewer
# than 3 cells and cells with fewer than 200 features.
counts <- Read10X(data.dir = "filtered_feature_bc_matrix/")
seurat_obj <- CreateSeuratObject(
  counts = counts,
  min.cells = 3,
  min.features = 200
)

# QC: percentage of counts from features matching ^MT-.
seurat_obj[["percent.mt"]] <- PercentageFeatureSet(
  seurat_obj,
  pattern = "^MT-"
)

# Filter cells on feature count and mitochondrial percentage.
seurat_obj <- subset(
  seurat_obj,
  subset = nFeature_RNA > 200 & nFeature_RNA < 5000 & percent.mt < 20
)

# SCTransform (does normalization, HVG, and scaling), regressing out the
# mitochondrial percentage.
seurat_obj <- SCTransform(
  seurat_obj,
  vars.to.regress = "percent.mt",
  verbose = FALSE
)
QC Thresholds Reference
| Metric | Typical Range | Notes |
|---|---|---|
| min_genes | 200-500 | Remove empty droplets |
| max_genes | 2500-5000 | Remove doublets |
| max_mt | 5-20% | Remove dying cells (tissue-dependent) |
| min_cells | 3-10 | Remove rarely detected genes |
Method Comparison
| Step | Scanpy | Seurat (Standard) | Seurat (SCTransform) |
|---|---|---|---|
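| Normalize | `normalize_total` + `log1p` | `NormalizeData` | `SCTransform` |
| HVGs | `highly_variable_genes` | `FindVariableFeatures` | (included) |
| Scale | `scale` | `ScaleData` | (included) |
| Regress | `regress_out` | `ScaleData(vars.to.regress)` | `SCTransform(vars.to.regress)` |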
Related Skills
- data-io - Load data before preprocessing
- clustering - PCA and clustering after preprocessing
- markers-annotation - Find markers after clustering