diff --git a/src/methods/unsupervised_scmerge2/config.vsh.yaml b/src/methods/unsupervised_scmerge2/config.vsh.yaml new file mode 100644 index 00000000..bd732295 --- /dev/null +++ b/src/methods/unsupervised_scmerge2/config.vsh.yaml @@ -0,0 +1,85 @@ +# The API specifies which type of component this is. +# It contains specifications for: +# - The input/output files +# - Common parameters +# - A unit test +__merge__: ../../api/comp_method.yaml + +# A unique identifier for your component (required). +# Can contain only lowercase letters or underscores. +name: unsupervised_scmerge2 +# A relatively short label, used when rendering visualisations (required) +label: unsupervised Scmerge2 +# A one sentence summary of how this method works (required). Used when +# rendering summary tables. +summary: "scMerge2 is an algorithm that integrates multiple single-cell RNA-seq datasets by leveraging factor analysis of stably expressed genes and pseudoreplication." +# A multi-line description of how this component works (required). Used +# when rendering reference documentation. +description: | + scMerge works by integrating multiple single-cell RNA-seq datasets while correcting for batch effects and preserving biological signals. + It first identifies a set of stably expressed genes (SEGs) that are assumed to remain consistent across datasets. + Then, it uses a factor analysis model on these SEGs to estimate and remove unwanted variation. + To improve accuracy, scMerge creates pseudo-replicates which serve as anchors for alignment. + Finally, it corrects the data using these estimates, producing a harmonized expression matrix suitable for downstream analysis.. +references: + doi: + - 10.1073/pnas.1820006116 +# bibtex: +# - | +# @article{foo, +# title={Foo}, +# author={Bar}, +# journal={Baz}, +# year={2024} +# } +links: + # URL to the documentation for this method (required). + documentation: https://sydneybiox.github.io/scMerge/articles/scMerge2.html + # URL to the code repository for this method (required). + repository: https://github.com/SydneyBioX/scMerge + + + +# Metadata for your component +info: + # Which normalisation method this component prefers to use (required). + preferred_normalization: log_cpm + +# Component-specific parameters (optional) +# arguments: +# - name: "--n_neighbors" +# type: "integer" +# default: 5 +# description: Number of neighbors to use. + +# Resources required to run the component +resources: + # The script of your component (required) + - type: r_script + path: script.R + # Additional resources your script needs (optional) + # - type: file + # path: weights.pt + +engines: + # Specifications for the Docker image for this component. + - type: docker + image: openproblems/base_r:1.0.0 + # Add custom dependencies here (optional). For more information, see + # https://viash.io/reference/config/engines/docker/#setup . + setup: + - type: apt + packages: cmake + - type: r + bioc: + - scmerge + - org.Mm.eg.db + - org.Hs.eg.db + +runners: + # This platform allows running the component natively + - type: executable + # Allows turning the component into a Nextflow module / pipeline. + - type: nextflow + directives: + label: [midtime,midmem,midcpu] diff --git a/src/methods/unsupervised_scmerge2/script.R b/src/methods/unsupervised_scmerge2/script.R new file mode 100644 index 00000000..3c766444 --- /dev/null +++ b/src/methods/unsupervised_scmerge2/script.R @@ -0,0 +1,98 @@ +cat(">> Load dependencies\n") +requireNamespace("anndata", quietly = TRUE) +library(scMerge) +library(org.Hs.eg.db) +library(org.Mm.eg.db) + +## VIASH START +par <- list( + input = "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", + output = "output.h5ad" +) +meta <- list( + name = "unsupervised_scmerge2" +) +## VIASH END + +cat("Reading input files\n") +adata <- anndata::read_h5ad(par$input) +adata$obs["batch"] <- sub("\\+", "plus", adata$obs[["batch"]]) # Replace "+"" characters in batch names + +anndataToScMerge2 <- function(adata, seg_list, layer = "normalized", verbose = FALSE) { + exprsMat_all <- t(as.matrix(adata$layers[[layer]])) + batch_all <- as.character(adata$obs$batch) + + valid_cells <- !is.na(batch_all) + exprsMat <- exprsMat_all[, valid_cells, drop = FALSE] + batch <- batch_all[valid_cells] + + # Check overlap with human/mouse scSEG lists + gene_ids <- rownames(exprsMat) + species <- NULL + best_match <- 0 + + for (organism in names(seg_list)) { + scseg_name <- paste0(organism, "_scSEG") + seg_genes <- seg_list[[organism]][[scseg_name]] + overlap <- length(intersect(gene_ids, seg_genes)) + + if (overlap > best_match) { + best_match <- overlap + species <- organism + } + } + + if (is.null(species) || best_match == 0) { + stop("No match found between gene IDs in exprsMat and scSEG lists for human or mouse. ", + "Please ensure you're using Ensembl IDs for human or mouse, or provide a custom SEG list.") + } + + message("Detected species: ", species, " (matched ", best_match, " genes)") + + ctl <- seg_list[[species]][[paste0(species, "_scSEG")]] + + scMerge2_res <- scMerge2( + exprsMat = exprsMat, + batch = batch, + ctl = ctl, + verbose = verbose + ) + + return(scMerge2_res) +} + +data("segList_ensemblGeneID") + +cat("Run scMerge2\n") + +scMerge2_res <- anndataToScMerge2( + adata = adata, + seg_list = segList_ensemblGeneID, + layer = "normalized", + verbose = TRUE +) + + +cat("Store output\n") +corrected_mat <- scMerge2_res$newY + +embedding <- prcomp(t(corrected_mat))$x[, 1:10] + +rownames(embedding) <- colnames(corrected_mat) + +output <- anndata::AnnData( + X = NULL, + obs = adata$obs[, c()], + var = NULL, + obsm = list( + X_emb = embedding[rownames(adata), , drop = FALSE] # match input cells + ), + uns = list( + dataset_id = adata$uns[["dataset_id"]], + normalization_id = adata$uns[["normalization_id"]], + method_id = meta$name + ), + shape = adata$shape +) +cat("Write output AnnData to file\n") +output$write_h5ad(par[["output"]], compression = "gzip")