install
source · Clone the upstream repo
git clone https://github.com/mdbabumiamssm/LLMs-Universal-Life-Science-and-Clinical-Skills-
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/mdbabumiamssm/LLMs-Universal-Life-Science-and-Clinical-Skills- "$T" && mkdir -p ~/.claude/skills && cp -r "$T/Skills/Hematology/Flow_Cytometry/bead-normalization" ~/.claude/skills/mdbabumiamssm-llms-universal-life-science-and-clinical-skills-bead-normalization && rm -rf "$T"
manifest:
Skills/Hematology/Flow_Cytometry/bead-normalization/SKILL.md — source content
<!--
# COPYRIGHT NOTICE
# This file is part of the "Universal Biomedical Skills" project.
# Copyright (c) 2026 MD BABU MIA, PhD <md.babu.mia@mssm.edu>
# All Rights Reserved.
#
# This code is proprietary and confidential.
# Unauthorized copying of this file, via any medium is strictly prohibited.
#
# Provenance: Authenticated by MD BABU MIA
-->
name: bio-flow-cytometry-bead-normalization
description: Bead-based normalization for CyTOF and high-parameter flow cytometry. Covers EQ bead normalization, signal drift correction, and batch normalization. Use when correcting instrument drift in CyTOF or harmonizing data across batches.
tool_type: r
primary_tool: CATALYST
measurable_outcome: Execute skill workflow successfully with valid output within 15 minutes.
allowed-tools:
- read_file
- run_shell_command
Bead Normalization
CyTOF EQ Bead Normalization
## --- Identify EQ normalization beads ---------------------------------------
## Fluidigm EQ beads carry fixed amounts of five lanthanides; events that are
## bright in all bead channels simultaneously are beads rather than cells.
## (Fluidigm ships a normalizer tool, but the same can be done in R.)
library(CATALYST)
library(flowCore)

# Load an FCS file acquired with the EQ beads still in the sample
ff <- read.FCS('cytof_with_beads.fcs')

# EQ beads contain known amounts of: Ce140, Eu151, Eu153, Ho165, Lu175
bead_channels <- c('Ce140Di', 'Eu151Di', 'Eu153Di', 'Ho165Di', 'Lu175Di')

# Score each event by its mean z-scored intensity across the bead channels;
# bead events sit in the far upper tail of this score.
bead_data <- exprs(ff)[, bead_channels]
bead_scores <- rowMeans(scale(bead_data))

# NOTE(review): the 0.99 cutoff assumes roughly 1% of events are beads --
# confirm against the actual bead fraction of the acquisition.
bead_threshold <- quantile(bead_scores, 0.99)
is_bead <- bead_scores > bead_threshold

cat('Identified', sum(is_bead), 'bead events (',
    round(mean(is_bead) * 100, 2), '%)\n')
Calculate Normalization Factors
## --- Per-channel normalization factors -------------------------------------

# Median bead intensity per channel for one acquisition; comparing these
# medians against a reference yields the normalization factor.
#
# ff            flowFrame for the acquisition
# bead_channels character vector of bead channel names
# bead_idx      logical/integer index selecting the bead events
# Returns a named numeric vector of per-channel median bead intensities.
calculate_norm_factors <- function(ff, bead_channels, bead_idx) {
  apply(exprs(ff)[bead_idx, bead_channels], 2, median)
}

# Reference intensities (from the first file, or known standards)
reference_beads <- c(Ce140 = 500, Eu151 = 600, Eu153 = 550,
                     Ho165 = 450, Lu175 = 400)

# Ratio of reference to observed medians = multiplicative correction factor
sample_beads <- calculate_norm_factors(ff, bead_channels, is_bead)
norm_factors <- reference_beads / sample_beads

cat('Normalization factors:\n')
print(round(norm_factors, 3))
Apply Normalization
## --- Apply normalization ----------------------------------------------------

# Marker channels = everything except time, event length, and the bead channels
marker_channels <- setdiff(colnames(ff), c('Time', 'Event_length', bead_channels))

# Rescale the listed channels by the geometric mean of the per-channel bead
# factors (a single global factor; substitute per-channel factors if you have
# channel-specific corrections).
#
# ff           flowFrame to normalize
# norm_factors named vector of per-channel bead factors
# channels     channels to rescale (scatter/time excluded by the caller)
# Returns the flowFrame with rescaled expression values.
normalize_cytof <- function(ff, norm_factors, channels) {
  global_factor <- exp(mean(log(norm_factors)))
  expr <- exprs(ff)
  expr[, channels] <- expr[, channels] * global_factor
  exprs(ff) <- expr
  ff
}

ff_normalized <- normalize_cytof(ff, norm_factors, marker_channels)

# Drop the bead events now that they have served their purpose
ff_clean <- ff_normalized[!is_bead, ]
cat('Final cell count:', nrow(ff_clean), '\n')
Time-Based Drift Correction
## --- Time-based drift correction --------------------------------------------

# Correct for signal drift over acquisition time, using bead events as an
# internal standard: bead signal is binned over time, a LOESS curve is fit to
# the per-bin medians, and every event is rescaled so the bead median stays
# flat across the run.
#
# ff           flowFrame containing both cells and beads
# time_channel name of the acquisition-time channel
# bead_idx     logical vector marking bead events; defaults to the `is_bead`
#              vector computed earlier in this workflow (previously an
#              implicit global dependency -- now an explicit argument)
# n_bins       number of time bins for estimating the drift curve
# span         LOESS smoothing span
# Returns the flowFrame with drift-corrected expression values.
correct_drift <- function(ff, time_channel = 'Time', bead_idx = is_bead,
                          n_bins = 20, span = 0.5) {
  expr <- exprs(ff)
  time <- expr[, time_channel]

  # Bin acquisition time into n_bins equal-width intervals
  time_bins <- cut(time, breaks = n_bins, labels = FALSE)

  corrected <- expr
  marker_cols <- setdiff(colnames(expr), c(time_channel, 'Event_length'))
  for (marker in marker_cols) {
    bin_medians <- tapply(expr[bead_idx, marker], time_bins[bead_idx], median)
    # Require enough occupied bins for a stable LOESS fit
    if (length(unique(time_bins[bead_idx])) > 3) {
      drift_data <- data.frame(
        time = as.numeric(names(bin_medians)),
        intensity = as.numeric(bin_medians)
      )
      loess_fit <- loess(intensity ~ time, data = drift_data, span = span)
      # Predict the drift level at every event's time bin
      correction <- predict(loess_fit, newdata = data.frame(time = time_bins))
      reference <- median(drift_data$intensity)
      # predict.loess returns NA outside the fitted range, and a zero or
      # negative fit would blow up the ratio -- leave such events
      # uncorrected rather than propagating NA/Inf into the data.
      bad <- !is.finite(correction) | correction <= 0
      correction[bad] <- reference
      corrected[, marker] <- expr[, marker] * (reference / correction)
    }
  }
  exprs(ff) <- corrected
  return(ff)
}

ff_drift_corrected <- correct_drift(ff)
Batch Normalization with CytoNorm
## --- Batch normalization with CytoNorm ---------------------------------------
# CytoNorm learns spline-based per-channel transformations from reference
# samples acquired on every batch (e.g. the same PBMC control), then applies
# them to harmonize the remaining files across batches.
library(CytoNorm)

# Reference acquisitions from the anchor batch
train_files <- list.files('batch1_reference/', pattern = '\\.fcs$',
                          full.names = TRUE)
# (fix: the original also read every training FCS into memory via read.FCS
# into an unused `train_data` -- CytoNorm.train reads the files itself)

# Train the normalization model on the reference samples
model <- CytoNorm.train(
  files = train_files,
  labels = rep('Reference', length(train_files)),
  channels = marker_channels,
  transformList = NULL,  # NULL if the data are already transformed
  nQ = 100,              # number of quantiles for the spline fit
  seed = 42
)

# Apply the learned transformation to a new batch
test_files <- list.files('batch2/', pattern = '\\.fcs$', full.names = TRUE)
normalized_files <- CytoNorm.normalize(
  model = model,
  files = test_files,
  labels = rep('Test', length(test_files)),
  outputDir = 'batch2_normalized/'
)
Quantile Normalization
## --- Quantile normalization ---------------------------------------------------

# Quantile-normalize a list of flowFrames so every sample shares the same
# per-channel intensity distribution.
#
# Bug fixed: the original built its "reference" by sorting each *channel* of
# the pooled matrix and taking colMeans, which collapses each channel's
# reference distribution to a single scalar; approx() then interpolated event
# ranks (up to n events) against only length(channels) support points. The
# standard construction is per channel: average the samples' quantile
# functions to get a reference distribution, then map each event's rank onto
# that distribution.
#
# fs          list of flowFrames
# channels    character vector of channel names to normalize
# n_quantiles number of grid points for the reference quantile function
#             (new optional argument; bounds memory for large samples)
# Returns the list of flowFrames with normalized expression values.
quantile_normalize <- function(fs, channels, n_quantiles = 1000) {
  probs <- seq(0, 1, length.out = n_quantiles)
  expr_list <- lapply(fs, function(ff) exprs(ff)[, channels, drop = FALSE])
  normalized_fs <- fs
  for (ch in channels) {
    # Reference distribution = mean of the per-sample quantile functions
    q_mat <- vapply(
      expr_list,
      function(m) quantile(m[, ch], probs = probs, names = FALSE),
      numeric(n_quantiles)
    )
    reference <- rowMeans(q_mat)
    for (i in seq_along(fs)) {
      x <- expr_list[[i]][, ch]
      # Map each event's (tie-averaged) rank to a probability in (0, 1),
      # then look up the reference value at that quantile
      p <- (rank(x, ties.method = 'average') - 0.5) / length(x)
      expr <- exprs(normalized_fs[[i]])
      expr[, ch] <- approx(probs, reference, xout = p, rule = 2)$y
      exprs(normalized_fs[[i]]) <- expr
    }
  }
  normalized_fs
}
CATALYST-Based Normalization
## --- CATALYST-based normalization --------------------------------------------
library(CATALYST)

# prepData() wraps the CyTOF preprocessing steps, including bead handling,
# arcsinh transformation, and time-dependent drift correction.
sce <- prepData(fs, panel, md,
                transform = TRUE,
                cofactor = 5,
                by_time = TRUE)  # correct time-dependent drift

# Alternative: manual bead gating within CATALYST
# sce <- prepData(fs, panel, md, FACS = FALSE)
# sce <- filterSCE(sce, !sce$is_bead)
Visualization
## --- Visualization ------------------------------------------------------------
library(ggplot2)

# Bead signal as a function of acquisition time (drift diagnostic)
drift_df <- data.frame(
  Time = exprs(ff)[is_bead, 'Time'],
  Ce140 = exprs(ff)[is_bead, 'Ce140Di'],
  Eu151 = exprs(ff)[is_bead, 'Eu151Di']
)
ggplot(drift_df, aes(x = Time, y = Ce140)) +
  geom_point(alpha = 0.1, size = 0.5) +
  geom_smooth(method = 'loess', color = 'red') +
  theme_bw() +
  labs(title = 'Bead Signal Over Time (Ce140)', x = 'Time', y = 'Intensity')
ggsave('bead_drift.png', width = 10, height = 4)

# Distribution of one marker before vs. after normalization
before_after <- data.frame(
  Value = c(exprs(ff)[, 'CD45'], exprs(ff_normalized)[, 'CD45']),
  Status = rep(c('Before', 'After'), each = nrow(ff))
)
ggplot(before_after, aes(x = Value, fill = Status)) +
  geom_histogram(bins = 100, alpha = 0.5, position = 'identity') +
  theme_bw() +
  labs(title = 'Normalization Effect on CD45')
Export Normalized Data
## --- Export normalized data ---------------------------------------------------

# Write the bead-free, normalized events back out as FCS
write.FCS(ff_clean, 'normalized_sample.fcs')

# For the CATALYST object:
# saveRDS(sce, 'normalized_sce.rds')
Related Skills
Workflow order: cytometry-qc → doublet-detection → bead-normalization → clustering
- cytometry-qc - Run first: identify drift and quality issues
- doublet-detection - Run before: remove doublets prior to normalization
- compensation-transformation - Initial data preprocessing
- clustering-phenotyping - Analysis after normalization
- differential-analysis - Batch-aware statistical testing