openproblems-bio · seohyonkim · Jun 4, 2025 · Jun 6, 2025
diff --git a/src/methods/unsupervised_scmerge2/config.vsh.yaml b/src/methods/unsupervised_scmerge2/config.vsh.yaml
@@ -0,0 +1,85 @@
+# The API specifies which type of component this is.
+# It contains specifications for:
+#   - The input/output files
+#   - Common parameters
+#   - A unit test
+__merge__: ../../api/comp_method.yaml
+
+# A unique identifier for your component (required).
+# Can contain only lowercase letters or underscores.
+name: unsupervised_scmerge2
+# A relatively short label, used when rendering visualisations (required)
+label: unsupervised Scmerge2
+# A one sentence summary of how this method works (required). Used when 
+# rendering summary tables.
+summary: "scMerge2 is an algorithm that integrates multiple single-cell RNA-seq datasets by leveraging factor analysis of stably expressed genes and pseudoreplication."
+# A multi-line description of how this component works (required). Used
+# when rendering reference documentation.
+description: |
+  scMerge works by integrating multiple single-cell RNA-seq datasets while correcting for batch effects and preserving biological signals.
+  It first identifies a set of stably expressed genes (SEGs) that are assumed to remain consistent across datasets.
+  Then, it uses a factor analysis model on these SEGs to estimate and remove unwanted variation.
+  To improve accuracy, scMerge creates pseudo-replicates which serve as anchors for alignment.
+  Finally, it corrects the data using these estimates, producing a harmonized expression matrix suitable for downstream analysis..
+references:
+  doi: 
+    - 10.1073/pnas.1820006116
+#   bibtex:
+#     - |
+#       @article{foo,
+#         title={Foo},
+#         author={Bar},
+#         journal={Baz},
+#         year={2024}
+#       }
+links:
+  # URL to the documentation for this method (required).
+  documentation: https://sydneybiox.github.io/scMerge/articles/scMerge2.html
+  # URL to the code repository for this method (required).
+  repository: https://github.com/SydneyBioX/scMerge
+
+
+
+# Metadata for your component
+info:
+  # Which normalisation method this component prefers to use (required).
+  preferred_normalization: log_cpm
+
+# Component-specific parameters (optional)
+# arguments:
+#   - name: "--n_neighbors"
+#     type: "integer"
+#     default: 5
+#     description: Number of neighbors to use.
+
+# Resources required to run the component
+resources:
+  # The script of your component (required)
+  - type: r_script
+    path: script.R
+  # Additional resources your script needs (optional)
+  # - type: file
+  #   path: weights.pt
+
+engines:
+  # Specifications for the Docker image for this component.
+  - type: docker
+    image: openproblems/base_r:1.0.0
+    # Add custom dependencies here (optional). For more information, see
+    # https://viash.io/reference/config/engines/docker/#setup .
+    setup:
+      - type: apt
+        packages: cmake
+      - type: r
+        bioc: 
+        - scmerge
+        - org.Mm.eg.db
+        - org.Hs.eg.db
+
+runners:
+  # This platform allows running the component natively
+  - type: executable
+  # Allows turning the component into a Nextflow module / pipeline.
+  - type: nextflow
+    directives:
+      label: [midtime,midmem,midcpu]
diff --git a/src/methods/unsupervised_scmerge2/script.R b/src/methods/unsupervised_scmerge2/script.R
@@ -0,0 +1,98 @@
+cat(">> Load dependencies\n")
+requireNamespace("anndata", quietly = TRUE)
+library(scMerge)
+library(org.Hs.eg.db)
+library(org.Mm.eg.db)
+
+## VIASH START
+par <- list(
+  input = "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad",
+  output = "output.h5ad"
+)
+meta <- list(
+  name = "unsupervised_scmerge2"
+)
+## VIASH END
+
+cat("Reading input files\n")
+adata <- anndata::read_h5ad(par$input)
+adata$obs["batch"] <- sub("\\+", "plus", adata$obs[["batch"]]) # Replace "+"" characters in batch names
+
+anndataToScMerge2 <- function(adata, seg_list, layer = "normalized", verbose = FALSE) {
+  exprsMat_all <- t(as.matrix(adata$layers[[layer]]))
+  batch_all <- as.character(adata$obs$batch)
+
+  valid_cells <- !is.na(batch_all)
+  exprsMat <- exprsMat_all[, valid_cells, drop = FALSE]
+  batch <- batch_all[valid_cells]
+
+  # Check overlap with human/mouse scSEG lists
+  gene_ids <- rownames(exprsMat)
+  species <- NULL
+  best_match <- 0
+
+  for (organism in names(seg_list)) {
+    scseg_name <- paste0(organism, "_scSEG")
+    seg_genes <- seg_list[[organism]][[scseg_name]]
+    overlap <- length(intersect(gene_ids, seg_genes))
+
+    if (overlap > best_match) {
+      best_match <- overlap
+      species <- organism
+    }
+  }
+
+  if (is.null(species) || best_match == 0) {
+    stop("No match found between gene IDs in exprsMat and scSEG lists for human or mouse. ",
+         "Please ensure you're using Ensembl IDs for human or mouse, or provide a custom SEG list.")
+  }
+
+  message("Detected species: ", species, " (matched ", best_match, " genes)")
+
+  ctl <- seg_list[[species]][[paste0(species, "_scSEG")]]
+
+  scMerge2_res <- scMerge2(
+    exprsMat = exprsMat,
+    batch = batch,
+    ctl = ctl,
+    verbose = verbose
+  )
+
+  return(scMerge2_res)
+}
+
+data("segList_ensemblGeneID")
+
+cat("Run scMerge2\n")
+
+scMerge2_res <- anndataToScMerge2(
+  adata = adata,
+  seg_list = segList_ensemblGeneID,
+  layer = "normalized",
+  verbose = TRUE
+)
+
+
+cat("Store output\n")
+corrected_mat <- scMerge2_res$newY
+
+embedding <- prcomp(t(corrected_mat))$x[, 1:10]
+
+rownames(embedding) <- colnames(corrected_mat)
+
+output <- anndata::AnnData(
+  X = NULL,
+  obs = adata$obs[, c()],
+  var = NULL,
+  obsm = list(
+    X_emb = embedding[rownames(adata), , drop = FALSE]  # match input cells
+  ),
+  uns = list(
+    dataset_id = adata$uns[["dataset_id"]],
+    normalization_id = adata$uns[["normalization_id"]],
+    method_id = meta$name 
+  ),
+  shape = adata$shape
+)
+cat("Write output AnnData to file\n")
+output$write_h5ad(par[["output"]], compression = "gzip")