LLMs-Universal-Life-Science-and-Clinical-Skills- data-harmonization

<!--

install
source · Clone the upstream repo
git clone https://github.com/mdbabumiamssm/LLMs-Universal-Life-Science-and-Clinical-Skills-
Claude Code · Install into ~/.claude/skills/
T=$(mktemp -d) && git clone --depth=1 https://github.com/mdbabumiamssm/LLMs-Universal-Life-Science-and-Clinical-Skills- "$T" && mkdir -p ~/.claude/skills && cp -r "$T/Skills/Multi_Omics/multi-omics-integration/data-harmonization" ~/.claude/skills/mdbabumiamssm-llms-universal-life-science-and-clinical-skills-data-harmonization && rm -rf "$T"
manifest: Skills/Multi_Omics/multi-omics-integration/data-harmonization/SKILL.md
source content
<!-- # COPYRIGHT NOTICE # This file is part of the "Universal Biomedical Skills" project. # Copyright (c) 2026 MD BABU MIA, PhD <md.babu.mia@mssm.edu> # All Rights Reserved. # # This code is proprietary and confidential. # Unauthorized copying of this file, via any medium is strictly prohibited. # # Provenance: Authenticated by MD BABU MIA -->

name: bio-multi-omics-data-harmonization description: Preprocessing and harmonization of multi-omics data before integration. Covers normalization, batch correction, feature alignment, and missing value handling across data types. Use when preparing multi-omics datasets for integration analysis. tool_type: r primary_tool: MultiAssayExperiment measurable_outcome: Execute skill workflow successfully with valid output within 15 minutes. allowed-tools:

  • read_file
  • run_shell_command

Data Harmonization for Multi-Omics

MultiAssayExperiment Structure

library(MultiAssayExperiment)

# Load individual assays
rna <- SummarizedExperiment(assays = list(counts = rna_matrix), colData = sample_info)
protein <- SummarizedExperiment(assays = list(intensity = protein_matrix), colData = sample_info)
methylation <- SummarizedExperiment(assays = list(beta = meth_matrix), colData = sample_info)

# Create experiment list
exp_list <- ExperimentList(RNA = rna, Protein = protein, Methylation = methylation)

# Sample map (links samples to assays)
smap <- data.frame(
    assay = rep(c('RNA', 'Protein', 'Methylation'), each = nrow(sample_info)),
    primary = rep(sample_info$SampleID, 3),
    colname = c(colnames(rna_matrix), colnames(protein_matrix), colnames(meth_matrix))
)

# Create MAE
mae <- MultiAssayExperiment(experiments = exp_list, colData = sample_info, sampleMap = smap)

Normalization Per Assay

# RNA-seq: VST normalization
library(DESeq2)
dds <- DESeqDataSetFromMatrix(countData = assay(mae, 'RNA'),
                               colData = colData(mae),
                               design = ~ 1)
vst_rna <- assay(vst(dds))

# Proteomics: Log2 + median centering
log2_protein <- log2(assay(mae, 'Protein'))
log2_protein[is.infinite(log2_protein)] <- NA
medians <- apply(log2_protein, 2, median, na.rm = TRUE)
norm_protein <- sweep(log2_protein, 2, medians - median(medians))

# Methylation: M-value transformation
beta <- assay(mae, 'Methylation')
m_values <- log2(beta / (1 - beta))

Cross-Omics Batch Correction

library(sva)

# Combine normalized matrices for joint batch correction
# Only use common samples
common_samples <- Reduce(intersect, colnames(mae))

combined <- rbind(
    vst_rna[, common_samples],
    norm_protein[, common_samples],
    m_values[, common_samples]
)

# Add omics type as covariate
omics_type <- c(rep('RNA', nrow(vst_rna)),
                rep('Protein', nrow(norm_protein)),
                rep('Methylation', nrow(m_values)))

# ComBat for batch correction
batch <- colData(mae)[common_samples, 'Batch']
mod <- model.matrix(~ Condition, data = colData(mae)[common_samples, ])

corrected <- ComBat(dat = combined, batch = batch, mod = mod)

# Split back into separate matrices
idx_rna <- 1:nrow(vst_rna)
idx_prot <- (nrow(vst_rna) + 1):(nrow(vst_rna) + nrow(norm_protein))
idx_meth <- (nrow(vst_rna) + nrow(norm_protein) + 1):nrow(combined)

corrected_rna <- corrected[idx_rna, ]
corrected_protein <- corrected[idx_prot, ]
corrected_meth <- corrected[idx_meth, ]

Feature Alignment (Gene-Level)

library(biomaRt)

# Map protein IDs to gene symbols
ensembl <- useEnsembl(biomart = 'genes', dataset = 'hsapiens_gene_ensembl')

# Protein to gene mapping
protein_ids <- rownames(norm_protein)
protein_mapping <- getBM(attributes = c('uniprotswissprot', 'hgnc_symbol'),
                          filters = 'uniprotswissprot',
                          values = protein_ids,
                          mart = ensembl)

# Aggregate proteins to gene level (mean)
protein_gene <- norm_protein
rownames(protein_gene) <- protein_mapping$hgnc_symbol[match(rownames(protein_gene), protein_mapping$uniprotswissprot)]
protein_gene <- protein_gene[!is.na(rownames(protein_gene)), ]
protein_gene <- aggregate(. ~ rownames(protein_gene), data = as.data.frame(protein_gene), FUN = mean)

# Map methylation probes to genes
# (requires annotation package, e.g., IlluminaHumanMethylation450kanno.ilmn12.hg19)
library(IlluminaHumanMethylation450kanno.ilmn12.hg19)
anno <- getAnnotation(IlluminaHumanMethylation450kanno.ilmn12.hg19)
probe_genes <- anno[rownames(m_values), 'UCSC_RefGene_Name']

Missing Value Handling

# Per-assay missing value analysis
missing_summary <- function(mat) {
    data.frame(
        total_missing = sum(is.na(mat)),
        pct_missing = mean(is.na(mat)) * 100,
        samples_complete = sum(colSums(is.na(mat)) == 0),
        features_complete = sum(rowSums(is.na(mat)) == 0)
    )
}

lapply(list(RNA = vst_rna, Protein = norm_protein, Methylation = m_values), missing_summary)

# Filter features with too many missing values
filter_missing <- function(mat, max_missing_pct = 50) {
    keep <- rowMeans(is.na(mat)) * 100 < max_missing_pct
    mat[keep, ]
}

protein_filtered <- filter_missing(norm_protein, max_missing_pct = 30)

# Imputation (MinProb for proteomics)
impute_minprob <- function(mat) {
    for (i in 1:ncol(mat)) {
        nas <- is.na(mat[, i])
        if (any(nas)) {
            q01 <- quantile(mat[, i], 0.01, na.rm = TRUE)
            mat[nas, i] <- rnorm(sum(nas), mean = q01, sd = abs(q01) * 0.1)
        }
    }
    mat
}

protein_imputed <- impute_minprob(protein_filtered)

Sample Matching and Subsetting

# Find complete samples across all assays
complete_samples <- intersectColumns(mae)
cat('Samples in all assays:', ncol(complete_samples), '\n')

# Subset to common samples
mae_matched <- mae[, complete_samples, ]

# Alternative: keep samples with N-1 assays
subsetByColData(mae, mae$has_at_least_2_assays)

Scale and Center

# Z-score transformation (per feature)
scale_matrix <- function(mat) {
    t(scale(t(mat)))
}

scaled_rna <- scale_matrix(vst_rna)
scaled_protein <- scale_matrix(norm_protein)
scaled_meth <- scale_matrix(m_values)

# Verify scaling
cat('RNA mean:', mean(scaled_rna, na.rm = TRUE), 'sd:', sd(scaled_rna, na.rm = TRUE), '\n')

Export Harmonized Data

# Save as list for integration tools
harmonized <- list(
    RNA = scaled_rna,
    Protein = scaled_protein,
    Methylation = scaled_meth,
    sample_info = colData(mae)[common_samples, ]
)

saveRDS(harmonized, 'harmonized_multiomics.rds')

# Or as separate CSVs
write.csv(scaled_rna, 'harmonized_rna.csv')
write.csv(scaled_protein, 'harmonized_protein.csv')
write.csv(scaled_meth, 'harmonized_methylation.csv')

Related Skills

  • mofa-integration - Use harmonized data in MOFA2
  • mixomics-analysis - Use harmonized data in mixOmics
  • differential-expression/batch-correction - RNA-seq batch correction
  • proteomics/proteomics-qc - Proteomics-specific QC
<!-- AUTHOR_SIGNATURE: 9a7f3c2e-MD-BABU-MIA-2026-MSSM-SECURE -->