LLMs-Universal-Life-Science-and-Clinical-Skills- deseq2-basics

<!--

install
source · Clone the upstream repo
git clone https://github.com/mdbabumiamssm/LLMs-Universal-Life-Science-and-Clinical-Skills-
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/mdbabumiamssm/LLMs-Universal-Life-Science-and-Clinical-Skills- "$T" && mkdir -p ~/.claude/skills && cp -r "$T/Skills/Transcriptomics/differential-expression/deseq2-basics" ~/.claude/skills/mdbabumiamssm-llms-universal-life-science-and-clinical-skills-deseq2-basics && rm -rf "$T"
manifest: Skills/Transcriptomics/differential-expression/deseq2-basics/SKILL.md
source content
<!-- # COPYRIGHT NOTICE # This file is part of the "Universal Biomedical Skills" project. # Copyright (c) 2026 MD BABU MIA, PhD <md.babu.mia@mssm.edu> # All Rights Reserved. # # This code is proprietary and confidential. # Unauthorized copying of this file, via any medium is strictly prohibited. # # Provenance: Authenticated by MD BABU MIA -->

name: bio-de-deseq2-basics description: Perform differential expression analysis using DESeq2 in R/Bioconductor. Use for analyzing RNA-seq count data, creating DESeqDataSet objects, running the DESeq workflow, and extracting results with log fold change shrinkage. Use when performing DE analysis with DESeq2. tool_type: r primary_tool: DESeq2 measurable_outcome: Execute skill workflow successfully with valid output within 15 minutes. allowed-tools:

  • read_file
  • run_shell_command

DESeq2 Basics

Differential expression analysis using DESeq2 for RNA-seq count data.

Required Libraries

library(DESeq2)
library(apeglm)  # For lfcShrink with type='apeglm'

Installation

if (!require('BiocManager', quietly = TRUE))
    install.packages('BiocManager')
BiocManager::install('DESeq2')
BiocManager::install('apeglm')

Creating DESeqDataSet

From Count Matrix

# counts: matrix with genes as rows, samples as columns
# coldata: data frame with sample metadata (rownames must match colnames of counts)
dds <- DESeqDataSetFromMatrix(countData = counts,
                               colData = coldata,
                               design = ~ condition)

From SummarizedExperiment

library(SummarizedExperiment)
dds <- DESeqDataSet(se, design = ~ condition)

From tximport (Salmon/Kallisto)

library(tximport)
txi <- tximport(files, type = 'salmon', tx2gene = tx2gene)
dds <- DESeqDataSetFromTximport(txi, colData = coldata, design = ~ condition)

Standard DESeq2 Workflow

# Create DESeqDataSet
dds <- DESeqDataSetFromMatrix(countData = counts,
                               colData = coldata,
                               design = ~ condition)

# Pre-filter low count genes (recommended)
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]

# Set reference level for condition
dds$condition <- relevel(dds$condition, ref = 'control')

# Run DESeq2 pipeline (estimateSizeFactors, estimateDispersions, nbinomWaldTest)
dds <- DESeq(dds)

# Get results
res <- results(dds)

# Apply log fold change shrinkage (recommended for visualization/ranking)
resLFC <- lfcShrink(dds, coef = 'condition_treated_vs_control', type = 'apeglm')

Design Formulas

# Simple two-group comparison
design = ~ condition

# Controlling for batch effects
design = ~ batch + condition

# Interaction model
design = ~ genotype + treatment + genotype:treatment

# Multi-factor without interaction
design = ~ genotype + treatment

Specifying Contrasts

# See available coefficients
resultsNames(dds)

# Results by coefficient name
res <- results(dds, name = 'condition_treated_vs_control')

# Results by contrast (compare specific levels)
res <- results(dds, contrast = c('condition', 'treated', 'control'))

# Contrast with list format (for complex designs)
res <- results(dds, contrast = list('conditionB', 'conditionA'))

Log Fold Change Shrinkage

# apeglm method (default, recommended)
resLFC <- lfcShrink(dds, coef = 'condition_treated_vs_control', type = 'apeglm')

# ashr method (alternative)
resLFC <- lfcShrink(dds, coef = 'condition_treated_vs_control', type = 'ashr')

# normal method (original, less recommended)
resLFC <- lfcShrink(dds, coef = 'condition_treated_vs_control', type = 'normal')

Setting Significance Thresholds

# Default: padj < 0.1
res <- results(dds)

# Custom alpha threshold
res <- results(dds, alpha = 0.05)

# With log fold change threshold
res <- results(dds, lfcThreshold = 1)  # |log2FC| > 1

Accessing DESeq2 Results

# Summary of results
summary(res)

# Get significant genes
sig <- subset(res, padj < 0.05)

# Order by adjusted p-value
resOrdered <- res[order(res$padj),]

# Order by log fold change
resOrdered <- res[order(abs(res$log2FoldChange), decreasing = TRUE),]

# Convert to data frame
res_df <- as.data.frame(res)

Result Columns

ColumnDescription
baseMean
Mean of normalized counts across all samples
log2FoldChange
Log2 fold change (treatment vs control)
lfcSE
Standard error of log2 fold change
stat
Wald statistic
pvalue
Raw p-value
padj
Adjusted p-value (Benjamini-Hochberg)

Normalization and Counts

# Get normalized counts
normalized_counts <- counts(dds, normalized = TRUE)

# Get size factors
sizeFactors(dds)

# Variance stabilizing transformation (for visualization)
vsd <- vst(dds, blind = FALSE)

# Regularized log transformation (alternative, slower)
rld <- rlog(dds, blind = FALSE)

Multi-Factor Designs

# Design with batch correction
dds <- DESeqDataSetFromMatrix(countData = counts,
                               colData = coldata,
                               design = ~ batch + condition)
dds <- DESeq(dds)

# Extract condition effect (controlling for batch)
res <- results(dds, name = 'condition_treated_vs_control')

Interaction Models

# Interaction between genotype and treatment
dds <- DESeqDataSetFromMatrix(countData = counts,
                               colData = coldata,
                               design = ~ genotype + treatment + genotype:treatment)
dds <- DESeq(dds)

# Test interaction term
res_interaction <- results(dds, name = 'genotypeKO.treatmentdrug')

# Or use contrast for difference of differences
res_interaction <- results(dds, contrast = list(
    c('genotypeKO.treatmentdrug'),
    c()
))

Likelihood Ratio Test

# Compare full vs reduced model
dds <- DESeq(dds, test = 'LRT', reduced = ~ batch)

# Results from LRT
res <- results(dds)

Pre-Filtering Strategies

# Remove genes with low counts
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]

# Keep genes with at least n counts in at least k samples
keep <- rowSums(counts(dds) >= 10) >= 3
dds <- dds[keep,]

# Filter by expression level
keep <- rowMeans(counts(dds, normalized = TRUE)) >= 10
dds <- dds[keep,]

Working with Existing Objects

# Update design formula
design(dds) <- ~ batch + condition
dds <- DESeq(dds)

# Subset samples
dds_subset <- dds[, dds$group == 'A']

# Subset genes
dds_genes <- dds[rownames(dds) %in% gene_list,]

Exporting Results

# Write to CSV
write.csv(as.data.frame(resOrdered), file = 'deseq2_results.csv')

# Write normalized counts
write.csv(as.data.frame(normalized_counts), file = 'normalized_counts.csv')

Common Errors

ErrorCauseSolution
"design matrix not full rank"Confounded variables or missing levelsCheck coldata for confounding
"counts matrix should be integers"Non-integer counts (e.g., from tximport)Use DESeqDataSetFromTximport()
"all samples have 0 counts"Gene filtering issueCheck count matrix format
"factor levels not in colData"Typo in design formulaVerify column names in coldata

Deprecated Features

FeatureStatusAlternative
No-replicate designsRemoved (v1.22)Require biological replicates
betaPrior = TRUE
DeprecatedUse
lfcShrink()
instead
rlog()
for large datasets
Not recommendedUse
vst()
for >100 samples

Quick Reference: Workflow Steps

# 1. Create DESeqDataSet
dds <- DESeqDataSetFromMatrix(counts, coldata, design = ~ condition)

# 2. Pre-filter
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]

# 3. Set reference level
dds$condition <- relevel(dds$condition, ref = 'control')

# 4. Run DESeq2
dds <- DESeq(dds)

# 5. Get results with shrinkage
res <- lfcShrink(dds, coef = resultsNames(dds)[2], type = 'apeglm')

# 6. Filter significant genes
sig_genes <- subset(res, padj < 0.05 & abs(log2FoldChange) > 1)

Related Skills

  • edger-basics - Alternative DE analysis with edgeR
  • de-visualization - MA plots, volcano plots, heatmaps
  • de-results - Extract and export significant genes
<!-- AUTHOR_SIGNATURE: 9a7f3c2e-MD-BABU-MIA-2026-MSSM-SECURE -->