install
source · Clone the upstream repo
git clone https://github.com/mdbabumiamssm/LLMs-Universal-Life-Science-and-Clinical-Skills-
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/mdbabumiamssm/LLMs-Universal-Life-Science-and-Clinical-Skills- "$T" && mkdir -p ~/.claude/skills && cp -r "$T/Skills/Transcriptomics/differential-expression/deseq2-basics" ~/.claude/skills/mdbabumiamssm-llms-universal-life-science-and-clinical-skills-deseq2-basics && rm -rf "$T"
manifest:
Skills/Transcriptomics/differential-expression/deseq2-basics/SKILL.mdsource content
<!--
# COPYRIGHT NOTICE
# This file is part of the "Universal Biomedical Skills" project.
# Copyright (c) 2026 MD BABU MIA, PhD <md.babu.mia@mssm.edu>
# All Rights Reserved.
#
# This code is proprietary and confidential.
# Unauthorized copying of this file, via any medium is strictly prohibited.
#
# Provenance: Authenticated by MD BABU MIA
-->
name: bio-de-deseq2-basics description: Perform differential expression analysis using DESeq2 in R/Bioconductor. Use for analyzing RNA-seq count data, creating DESeqDataSet objects, running the DESeq workflow, and extracting results with log fold change shrinkage. Use when performing DE analysis with DESeq2. tool_type: r primary_tool: DESeq2 measurable_outcome: Execute skill workflow successfully with valid output within 15 minutes. allowed-tools:
- read_file
- run_shell_command
DESeq2 Basics
Differential expression analysis using DESeq2 for RNA-seq count data.
Required Libraries
library(DESeq2) library(apeglm) # For lfcShrink with type='apeglm'
Installation
if (!require('BiocManager', quietly = TRUE)) install.packages('BiocManager') BiocManager::install('DESeq2') BiocManager::install('apeglm')
Creating DESeqDataSet
From Count Matrix
# counts: matrix with genes as rows, samples as columns # coldata: data frame with sample metadata (rownames must match colnames of counts) dds <- DESeqDataSetFromMatrix(countData = counts, colData = coldata, design = ~ condition)
From SummarizedExperiment
library(SummarizedExperiment) dds <- DESeqDataSet(se, design = ~ condition)
From tximport (Salmon/Kallisto)
library(tximport) txi <- tximport(files, type = 'salmon', tx2gene = tx2gene) dds <- DESeqDataSetFromTximport(txi, colData = coldata, design = ~ condition)
Standard DESeq2 Workflow
# Create DESeqDataSet dds <- DESeqDataSetFromMatrix(countData = counts, colData = coldata, design = ~ condition) # Pre-filter low count genes (recommended) keep <- rowSums(counts(dds)) >= 10 dds <- dds[keep,] # Set reference level for condition dds$condition <- relevel(dds$condition, ref = 'control') # Run DESeq2 pipeline (estimateSizeFactors, estimateDispersions, nbinomWaldTest) dds <- DESeq(dds) # Get results res <- results(dds) # Apply log fold change shrinkage (recommended for visualization/ranking) resLFC <- lfcShrink(dds, coef = 'condition_treated_vs_control', type = 'apeglm')
Design Formulas
# Simple two-group comparison design = ~ condition # Controlling for batch effects design = ~ batch + condition # Interaction model design = ~ genotype + treatment + genotype:treatment # Multi-factor without interaction design = ~ genotype + treatment
Specifying Contrasts
# See available coefficients resultsNames(dds) # Results by coefficient name res <- results(dds, name = 'condition_treated_vs_control') # Results by contrast (compare specific levels) res <- results(dds, contrast = c('condition', 'treated', 'control')) # Contrast with list format (for complex designs) res <- results(dds, contrast = list('conditionB', 'conditionA'))
Log Fold Change Shrinkage
# apeglm method (default, recommended) resLFC <- lfcShrink(dds, coef = 'condition_treated_vs_control', type = 'apeglm') # ashr method (alternative) resLFC <- lfcShrink(dds, coef = 'condition_treated_vs_control', type = 'ashr') # normal method (original, less recommended) resLFC <- lfcShrink(dds, coef = 'condition_treated_vs_control', type = 'normal')
Setting Significance Thresholds
# Default: padj < 0.1 res <- results(dds) # Custom alpha threshold res <- results(dds, alpha = 0.05) # With log fold change threshold res <- results(dds, lfcThreshold = 1) # |log2FC| > 1
Accessing DESeq2 Results
# Summary of results summary(res) # Get significant genes sig <- subset(res, padj < 0.05) # Order by adjusted p-value resOrdered <- res[order(res$padj),] # Order by log fold change resOrdered <- res[order(abs(res$log2FoldChange), decreasing = TRUE),] # Convert to data frame res_df <- as.data.frame(res)
Result Columns
| Column | Description |
|---|---|
| Mean of normalized counts across all samples |
| Log2 fold change (treatment vs control) |
| Standard error of log2 fold change |
| Wald statistic |
| Raw p-value |
| Adjusted p-value (Benjamini-Hochberg) |
Normalization and Counts
# Get normalized counts normalized_counts <- counts(dds, normalized = TRUE) # Get size factors sizeFactors(dds) # Variance stabilizing transformation (for visualization) vsd <- vst(dds, blind = FALSE) # Regularized log transformation (alternative, slower) rld <- rlog(dds, blind = FALSE)
Multi-Factor Designs
# Design with batch correction dds <- DESeqDataSetFromMatrix(countData = counts, colData = coldata, design = ~ batch + condition) dds <- DESeq(dds) # Extract condition effect (controlling for batch) res <- results(dds, name = 'condition_treated_vs_control')
Interaction Models
# Interaction between genotype and treatment dds <- DESeqDataSetFromMatrix(countData = counts, colData = coldata, design = ~ genotype + treatment + genotype:treatment) dds <- DESeq(dds) # Test interaction term res_interaction <- results(dds, name = 'genotypeKO.treatmentdrug') # Or use contrast for difference of differences res_interaction <- results(dds, contrast = list( c('genotypeKO.treatmentdrug'), c() ))
Likelihood Ratio Test
# Compare full vs reduced model dds <- DESeq(dds, test = 'LRT', reduced = ~ batch) # Results from LRT res <- results(dds)
Pre-Filtering Strategies
# Remove genes with low counts keep <- rowSums(counts(dds)) >= 10 dds <- dds[keep,] # Keep genes with at least n counts in at least k samples keep <- rowSums(counts(dds) >= 10) >= 3 dds <- dds[keep,] # Filter by expression level keep <- rowMeans(counts(dds, normalized = TRUE)) >= 10 dds <- dds[keep,]
Working with Existing Objects
# Update design formula design(dds) <- ~ batch + condition dds <- DESeq(dds) # Subset samples dds_subset <- dds[, dds$group == 'A'] # Subset genes dds_genes <- dds[rownames(dds) %in% gene_list,]
Exporting Results
# Write to CSV write.csv(as.data.frame(resOrdered), file = 'deseq2_results.csv') # Write normalized counts write.csv(as.data.frame(normalized_counts), file = 'normalized_counts.csv')
Common Errors
| Error | Cause | Solution |
|---|---|---|
| "design matrix not full rank" | Confounded variables or missing levels | Check coldata for confounding |
| "counts matrix should be integers" | Non-integer counts (e.g., from tximport) | Use DESeqDataSetFromTximport() |
| "all samples have 0 counts" | Gene filtering issue | Check count matrix format |
| "factor levels not in colData" | Typo in design formula | Verify column names in coldata |
Deprecated Features
| Feature | Status | Alternative |
|---|---|---|
| No-replicate designs | Removed (v1.22) | Require biological replicates |
| Deprecated | Use instead |
for large datasets | Not recommended | Use for >100 samples |
Quick Reference: Workflow Steps
# 1. Create DESeqDataSet dds <- DESeqDataSetFromMatrix(counts, coldata, design = ~ condition) # 2. Pre-filter keep <- rowSums(counts(dds)) >= 10 dds <- dds[keep,] # 3. Set reference level dds$condition <- relevel(dds$condition, ref = 'control') # 4. Run DESeq2 dds <- DESeq(dds) # 5. Get results with shrinkage res <- lfcShrink(dds, coef = resultsNames(dds)[2], type = 'apeglm') # 6. Filter significant genes sig_genes <- subset(res, padj < 0.05 & abs(log2FoldChange) > 1)
Related Skills
- edger-basics - Alternative DE analysis with edgeR
- de-visualization - MA plots, volcano plots, heatmaps
- de-results - Extract and export significant genes