From f47eff90560a78402ebbc3ce00c857051fe2cfa1 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 2 Oct 2024 11:31:29 +0200 Subject: [PATCH 01/23] Add cxg_immune_cell_atlas as a test resource --- _viash.yaml | 11 +++++---- scripts/create_resources/test_resources.sh | 26 ++++++++++++++++++++++ 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/_viash.yaml b/_viash.yaml index 8a0d18ea..d7ef3700 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -31,21 +31,24 @@ description: | references: doi: - # Luecken, M.D., Büttner, M., Chaichoompu, K. et al. - # Benchmarking atlas-level data integration in single-cell genomics. Nat Methods 19, 41–50 (2022). + # Luecken, M.D., Büttner, M., Chaichoompu, K. et al. + # Benchmarking atlas-level data integration in single-cell genomics. Nat Methods 19, 41–50 (2022). - 10.1038/s41592-021-01336-8 - + info: image: thumbnail.svg test_resources: - type: s3 path: s3://openproblems-data/resources_test/common/cxg_mouse_pancreas_atlas/ dest: resources_test/common/cxg_mouse_pancreas_atlas + - type: s3 + path: s3://openproblems-data/resources_test/common/cxg_immune_cell_atlas/ + dest: resources_test/common/cxg_immune_cell_atlas - type: s3 path: s3://openproblems-data/resources_test/task_batch_integration/ dest: resources_test/task_batch_integration -authors: +authors: - name: Michaela Mueller roles: [ maintainer, author ] info: diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index 92694692..f5369253 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -19,11 +19,20 @@ viash run src/data_processors/process_dataset/config.vsh.yaml -- \ --output_dataset "$DATASET_DIR/cxg_mouse_pancreas_atlas/dataset.h5ad" \ --output_solution "$DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad" +viash run src/data_processors/process_dataset/config.vsh.yaml -- \ + --input "$RAW_DATA/cxg_immune_cell_atlas/dataset.h5ad" \ + --output_dataset "$DATASET_DIR/cxg_immune_cell_atlas/dataset.h5ad" \ + --output_solution "$DATASET_DIR/cxg_immune_cell_atlas/solution.h5ad" + # run one method viash run src/methods/combat/config.vsh.yaml -- \ --input $DATASET_DIR/cxg_mouse_pancreas_atlas/dataset.h5ad \ --output $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated.h5ad +viash run src/methods/combat/config.vsh.yaml -- \ + --input $DATASET_DIR/cxg_immune_cell_atlas/dataset.h5ad \ + --output $DATASET_DIR/cxg_immune_cell_atlas/integrated.h5ad + # run transformer viash run src/data_processors/transform/config.vsh.yaml -- \ --input_integrated $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated.h5ad \ @@ -31,12 +40,23 @@ viash run src/data_processors/transform/config.vsh.yaml -- \ --expected_method_types feature \ --output $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated_full.h5ad +viash run src/data_processors/transform/config.vsh.yaml -- \ + --input_integrated $DATASET_DIR/cxg_immune_cell_atlas/integrated.h5ad \ + --input_dataset $DATASET_DIR/cxg_immune_cell_atlas/dataset.h5ad \ + --expected_method_types feature \ + --output $DATASET_DIR/cxg_immune_cell_atlas/integrated_full.h5ad + # run one metric viash run src/metrics/graph_connectivity/config.vsh.yaml -- \ --input_integrated $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated_full.h5ad \ --input_solution $DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad \ --output $DATASET_DIR/cxg_mouse_pancreas_atlas/score.h5ad +viash run src/metrics/graph_connectivity/config.vsh.yaml -- \ + --input_integrated $DATASET_DIR/cxg_immune_cell_atlas/integrated_full.h5ad \ 
+ --input_solution $DATASET_DIR/cxg_immune_cell_atlas/solution.h5ad \ + --output $DATASET_DIR/cxg_immune_cell_atlas/score.h5ad + # write the state file cat > $DATASET_DIR/state.yaml << HERE id: cxg_mouse_pancreas_atlas @@ -45,6 +65,12 @@ output_solution: !file solution.h5ad output_integrated: !file integrated.h5ad output_integrated_full: !file integrated_full.h5ad output_score: !file score.h5ad +id: cxg_immune_cell_atlas +output_dataset: !file dataset_mod1.h5ad +output_solution: !file solution_mod1.h5ad +output_integrated: !file integrated_mod1.h5ad +output_integrated_full: !file integrated_full_mod1.h5ad +output_score: !file score_mod1.h5ad HERE # only run this if you have access to the openproblems-data bucket From 175a3f82d38e58e3bc7a0e82c744b32e6fad8ab8 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 2 Oct 2024 11:32:02 +0200 Subject: [PATCH 02/23] Add SCimilarity component --- src/methods/scimilarity/config.vsh.yaml | 41 ++++++++++++ src/methods/scimilarity/script.py | 84 +++++++++++++++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 src/methods/scimilarity/config.vsh.yaml create mode 100644 src/methods/scimilarity/script.py diff --git a/src/methods/scimilarity/config.vsh.yaml b/src/methods/scimilarity/config.vsh.yaml new file mode 100644 index 00000000..88258dc6 --- /dev/null +++ b/src/methods/scimilarity/config.vsh.yaml @@ -0,0 +1,41 @@ +__merge__: /src/api/comp_method.yaml +name: scimilarity +label: SCimilarity +summary: SCimilarity provides unifying representation of single cell expression profiles +description: | + SCimilarity is a unifying representation of single cell expression profiles that quantifies similarity between expression states and generalizes to represent new studies without additional training +references: + doi: 10.1101/2023.07.18.549537 +links: + repository: https://github.com/Genentech/scimilarity + documentation: https://genentech.github.io/scimilarity/index.html +info: + method_types: [embedding] + preferred_normalization: counts +arguments: + - name: --model + type: file + description: Path to the directory containing SCimilarity models + required: true +resources: + - type: python_script + path: script.py + - path: /src/utils/read_anndata_partial.py +test_resources: + - type: python_script + path: /common/component_tests/check_config.py + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /resources_test/task_batch_integration/cxg_immune_cell_atlas + dest: resources_test/task_batch_integration/cxg_immune_cell_atlas +engines: + - type: docker + image: openproblems/base_pytorch_nvidia:1.0.0 + setup: + - type: python + github: Genentech/scimilarity +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] diff --git a/src/methods/scimilarity/script.py b/src/methods/scimilarity/script.py new file mode 100644 index 00000000..050700e6 --- /dev/null +++ b/src/methods/scimilarity/script.py @@ -0,0 +1,84 @@ +import sys +import anndata as ad +import scimilarity + +## VIASH START +par = { + "input": "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad", + "output": "output.h5ad", + "model": "model_v1.1", +} +meta = { + "name": "scvi", +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + +print("Read input", flush=True) +adata = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns") + +if adata.uns["dataset_organism"] != "homo_sapiens": + raise ValueError( + 
f"SCimilarity can only be used with human data " + f"(dataset_organism == \"{adata.uns['dataset_organism']}\")" + ) + +print("Load SCimilarity model", flush=True) +scimilarity_embedding = scimilarity.cell_embedding.CellEmbedding( + model_path=par["model"] +) +print("SCimilarity version:", scimilarity.__version__) + +print("Create input data", flush=True) +# Some of the functions modify the adata so make sure we have a copy +input = ad.AnnData(X=adata.X.copy(), layers={"counts": adata.X.copy()}) +# Set input.var_names to gene symbols +input.var_names = adata.var["feature_name"] + +print("Align datasets", flush=True) + +# Check the number of genes in the dataset and reduce the overlap threshold if +# necessary (mostly for subsampled test datasets) +gene_overlap_threshold = 5000 +if 0.8 * input.n_vars < gene_overlap_threshold: + from warnings import warn + + warn( + f"The number of genes in the dataset ({input.n_vars}) " + f"is less than or close to {gene_overlap_threshold}. " + f"Setting gene_overlap_threshold to 0.8 * n_var ({int(0.8 * input.n_vars)})." + ) + gene_overlap_threshold = int(0.8 * input.n_vars) + +input = scimilarity.utils.align_dataset( + input, + scimilarity_embedding.gene_order, + gene_overlap_threshold=gene_overlap_threshold, +) +input = scimilarity.utils.consolidate_duplicate_symbols(input) + +print("Normalizing dataset", flush=True) +input = scimilarity.utils.lognorm_counts(input) + +print("Get cell embeddings", flush=True) +cell_embeddings = scimilarity_embedding.get_embeddings(input.X) + +print("Store outputs", flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + obsm={ + "X_emb": cell_embeddings, + }, + uns={ + "dataset_id": adata.uns["dataset_id"], + "normalization_id": adata.uns["normalization_id"], + "method_id": meta["name"], + }, +) +print(output) + +print("Write output to file", flush=True) +output.write_h5ad(par["output"], compression="gzip") From a9931e197bee0b33ceca038a9b4c46e2b71be4c4 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 2 Oct 2024 11:32:53 +0200 Subject: [PATCH 03/23] Add SCimiliarity to benchmark workflow --- src/workflows/run_benchmark/main.nf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index ff77ad8d..a68002c6 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -26,6 +26,7 @@ methods = [ scalex, scanorama, scanvi, + scimilarity, scvi ] @@ -55,7 +56,7 @@ workflow run_wf { ****************************/ dataset_ch = input_ch // store join id - | map{ id, state -> + | map{ id, state -> [id, state + ["_meta": [join_id: id]]] } @@ -153,7 +154,7 @@ workflow run_wf { }, // use 'fromState' to fetch the arguments the component requires from the overall state fromState: [ - input_solution: "input_solution", + input_solution: "input_solution", input_integrated: "method_output_cleaned" ], // use 'toState' to publish that component's outputs to the overall state From 84394c37a4c0cad696a5c338dc0e393db454ec56 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 8 Oct 2024 14:49:29 +0200 Subject: [PATCH 04/23] Update script to extract model --- src/methods/scimilarity/config.vsh.yaml | 2 +- src/methods/scimilarity/script.py | 29 ++++++++++++++++++++++++- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/methods/scimilarity/config.vsh.yaml b/src/methods/scimilarity/config.vsh.yaml index 88258dc6..9c429a6e 100644 --- a/src/methods/scimilarity/config.vsh.yaml +++ 
b/src/methods/scimilarity/config.vsh.yaml @@ -15,7 +15,7 @@ info: arguments: - name: --model type: file - description: Path to the directory containing SCimilarity models + description: Path to the directory containing SCimilarity models or a .zip/.tar.gz archive required: true resources: - type: python_script diff --git a/src/methods/scimilarity/script.py b/src/methods/scimilarity/script.py index 050700e6..471464d5 100644 --- a/src/methods/scimilarity/script.py +++ b/src/methods/scimilarity/script.py @@ -1,6 +1,10 @@ import sys import anndata as ad import scimilarity +import os +import zipfile +import tempfile +import tarfile ## VIASH START par = { @@ -25,9 +29,28 @@ f"(dataset_organism == \"{adata.uns['dataset_organism']}\")" ) +if os.path.isdir(par["model"]): + model_temp = None + model_dir = par["model"] +else: + model_temp = tempfile.TemporaryDirectory() + model_dir = model_temp.name + + if zipfile.is_zipfile(par["model"]): + print("Extract SCimilarity model from .zip", flush=True) + with zipfile.ZipFile(par["model"], 'r') as zip_file: + zip_file.extractall(model_dir) + elif tarfile.is_tarfile(par["model"]) and par["model"].endswith('.tar.gz'): + print("Extract SCimilarity model from .tar.gz", flush=True) + with tarfile.open(par["model"], 'r:gz') as tar_file: + tar_file.extractall(model_dir) + model_dir = os.path.join(model_dir, os.listdir(model_dir)[0]) + else: + raise ValueError(f"The 'model' argument should be a directory a .zip file or a .tar.gz file") + print("Load SCimilarity model", flush=True) scimilarity_embedding = scimilarity.cell_embedding.CellEmbedding( - model_path=par["model"] + model_path=model_dir ) print("SCimilarity version:", scimilarity.__version__) @@ -82,3 +105,7 @@ print("Write output to file", flush=True) output.write_h5ad(par["output"], compression="gzip") + +if model_temp is not None: + print("Cleanup model directory", flush=True) + model_temp.cleanup() From 5e0038c8477e074fff87d2e9cc452e34831ed757 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 8 Oct 2024 14:55:05 +0200 Subject: [PATCH 05/23] Add SCimilarity model path to benchmark workflow --- src/workflows/run_benchmark/main.nf | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index a68002c6..e6743d18 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -26,7 +26,9 @@ methods = [ scalex, scanorama, scanvi, - scimilarity, + scimilarity.run( + args: [model: file("https://zenodo.org/records/10685499/files/model_v1.1.tar.gz")] + ), scvi ] From c927cb1aaa2e707b6fb9527f71ac2ac638cc559f Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 8 Oct 2024 15:01:30 +0200 Subject: [PATCH 06/23] Add base_method API to disable tests for SCimilarity --- src/api/comp_method.yaml | 20 +------------------- src/methods/scimilarity/config.vsh.yaml | 2 +- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml index dda52ce0..c8480836 100644 --- a/src/api/comp_method.yaml +++ b/src/api/comp_method.yaml @@ -1,23 +1,5 @@ -namespace: methods -info: - type: method - type_info: - label: Method - summary: A method for the batch integration task. - description: | - A batch integration method which integrates multiple datasets. 
-arguments: - - name: --input - __merge__: file_dataset.yaml - direction: input - required: true - - name: --output - __merge__: file_integrated.yaml - direction: output - required: true +__merge__: base_method.yaml test_resources: - - type: python_script - path: /common/component_tests/check_config.py - type: python_script path: /common/component_tests/run_and_check_output.py - path: /resources_test/task_batch_integration/cxg_mouse_pancreas_atlas diff --git a/src/methods/scimilarity/config.vsh.yaml b/src/methods/scimilarity/config.vsh.yaml index 9c429a6e..ae66ff58 100644 --- a/src/methods/scimilarity/config.vsh.yaml +++ b/src/methods/scimilarity/config.vsh.yaml @@ -1,4 +1,4 @@ -__merge__: /src/api/comp_method.yaml +__merge__: /src/api/base_method.yaml name: scimilarity label: SCimilarity summary: SCimilarity provides unifying representation of single cell expression profiles From 5c74f375e6dc11082e6965918831665645dad963 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 8 Oct 2024 15:10:35 +0200 Subject: [PATCH 07/23] Replace cxg_mouse_pancreas_atlas with cxg_immune_cell_atlas --- README.md | 11 ++++---- _viash.yaml | 3 -- scripts/create_resources/test_resources.sh | 28 +------------------ src/api/base_method.yaml | 20 +++++++++++++ src/api/comp_control_method.yaml | 4 +-- src/api/comp_method.yaml | 4 +-- src/api/comp_metric.yaml | 4 +-- src/api/comp_process_dataset.yaml | 6 ++-- src/api/comp_transformer.yaml | 8 +++--- src/api/file_common_dataset.yaml | 2 +- src/api/file_dataset.yaml | 2 +- src/api/file_integrated.yaml | 2 +- src/api/file_integrated_full.yaml | 4 +-- src/api/file_solution.yaml | 2 +- .../embed_cell_types/script.py | 6 ++-- .../embed_cell_types_jittered/script.py | 6 ++-- src/control_methods/no_integration/script.py | 2 +- .../no_integration_batch/script.py | 6 ++-- .../shuffle_integration/script.py | 4 +-- .../shuffle_integration_by_batch/script.py | 4 +-- .../script.py | 4 +-- src/data_processors/transform/script.py | 6 ++-- src/methods/batchelor_fastmnn/script.R | 2 +- src/methods/batchelor_mnn_correct/script.R | 2 +- src/methods/bbknn/script.py | 2 +- src/methods/combat/script.py | 2 +- src/methods/harmony/script.R | 2 +- src/methods/harmonypy/script.py | 2 +- src/methods/liger/script.R | 2 +- src/methods/mnnpy/script.py | 2 +- src/methods/pyliger/script.py | 4 +-- src/methods/scalex/script.py | 2 +- src/methods/scanorama/script.py | 2 +- src/methods/scanvi/script.py | 2 +- src/methods/scimilarity/script.py | 2 +- src/methods/scvi/script.py | 2 +- src/metrics/asw_batch/script.py | 2 +- src/metrics/asw_label/script.py | 2 +- src/metrics/cell_cycle_conservation/script.py | 2 +- src/metrics/clustering_overlap/script.py | 4 +-- src/metrics/graph_connectivity/script.py | 2 +- src/metrics/hvg_overlap/script.py | 4 +-- src/metrics/isolated_label_asw/script.py | 4 +-- src/metrics/isolated_label_f1/script.py | 4 +-- src/metrics/kbet/script.py | 4 +-- src/metrics/lisi/script.py | 2 +- src/metrics/pcr/script.py | 4 +-- 47 files changed, 96 insertions(+), 106 deletions(-) create mode 100644 src/api/base_method.yaml diff --git a/README.md b/README.md index 3a460e2b..50c16e34 100644 --- a/README.md +++ b/README.md @@ -91,8 +91,7 @@ flowchart TB A subset of the common dataset. -Example file: -`resources_test/common/cxg_mouse_pancreas_atlas/dataset.h5ad` +Example file: `resources_test/common/cxg_immune_cell_atlas/dataset.h5ad` Format: @@ -158,7 +157,7 @@ Arguments: Unintegrated AnnData HDF5 file. 
Example file: -`resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad` +`resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad` Format: @@ -202,7 +201,7 @@ Data structure: Uncensored dataset containing the true labels. Example file: -`resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/solution.h5ad` +`resources_test/task_batch_integration/cxg_immune_cell_atlas/solution.h5ad` Format: @@ -317,7 +316,7 @@ Arguments: An integrated AnnData dataset. Example file: -`resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated.h5ad` +`resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated.h5ad` Description: @@ -362,7 +361,7 @@ Data structure: An integrated AnnData dataset with additional outputs. Example file: -`resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad` +`resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad` Description: diff --git a/_viash.yaml b/_viash.yaml index d7ef3700..1598a220 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -38,9 +38,6 @@ references: info: image: thumbnail.svg test_resources: - - type: s3 - path: s3://openproblems-data/resources_test/common/cxg_mouse_pancreas_atlas/ - dest: resources_test/common/cxg_mouse_pancreas_atlas - type: s3 path: s3://openproblems-data/resources_test/common/cxg_immune_cell_atlas/ dest: resources_test/common/cxg_immune_cell_atlas diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index f5369253..8f458ccb 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -14,32 +14,17 @@ DATASET_DIR=resources_test/task_batch_integration mkdir -p $DATASET_DIR # process dataset -viash run src/data_processors/process_dataset/config.vsh.yaml -- \ - --input "$RAW_DATA/cxg_mouse_pancreas_atlas/dataset.h5ad" \ - --output_dataset "$DATASET_DIR/cxg_mouse_pancreas_atlas/dataset.h5ad" \ - --output_solution "$DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad" - viash run src/data_processors/process_dataset/config.vsh.yaml -- \ --input "$RAW_DATA/cxg_immune_cell_atlas/dataset.h5ad" \ --output_dataset "$DATASET_DIR/cxg_immune_cell_atlas/dataset.h5ad" \ --output_solution "$DATASET_DIR/cxg_immune_cell_atlas/solution.h5ad" # run one method -viash run src/methods/combat/config.vsh.yaml -- \ - --input $DATASET_DIR/cxg_mouse_pancreas_atlas/dataset.h5ad \ - --output $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated.h5ad - viash run src/methods/combat/config.vsh.yaml -- \ --input $DATASET_DIR/cxg_immune_cell_atlas/dataset.h5ad \ --output $DATASET_DIR/cxg_immune_cell_atlas/integrated.h5ad # run transformer -viash run src/data_processors/transform/config.vsh.yaml -- \ - --input_integrated $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated.h5ad \ - --input_dataset $DATASET_DIR/cxg_mouse_pancreas_atlas/dataset.h5ad \ - --expected_method_types feature \ - --output $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated_full.h5ad - viash run src/data_processors/transform/config.vsh.yaml -- \ --input_integrated $DATASET_DIR/cxg_immune_cell_atlas/integrated.h5ad \ --input_dataset $DATASET_DIR/cxg_immune_cell_atlas/dataset.h5ad \ @@ -47,24 +32,13 @@ viash run src/data_processors/transform/config.vsh.yaml -- \ --output $DATASET_DIR/cxg_immune_cell_atlas/integrated_full.h5ad # run one metric -viash run src/metrics/graph_connectivity/config.vsh.yaml -- \ - --input_integrated $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated_full.h5ad \ - --input_solution 
$DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad \ - --output $DATASET_DIR/cxg_mouse_pancreas_atlas/score.h5ad - viash run src/metrics/graph_connectivity/config.vsh.yaml -- \ --input_integrated $DATASET_DIR/cxg_immune_cell_atlas/integrated_full.h5ad \ --input_solution $DATASET_DIR/cxg_immune_cell_atlas/solution.h5ad \ --output $DATASET_DIR/cxg_immune_cell_atlas/score.h5ad # write the state file -cat > $DATASET_DIR/state.yaml << HERE -id: cxg_mouse_pancreas_atlas -output_dataset: !file dataset.h5ad -output_solution: !file solution.h5ad -output_integrated: !file integrated.h5ad -output_integrated_full: !file integrated_full.h5ad -output_score: !file score.h5ad +cat > $DATASET_DIR/cxg_immune_cell_atlas/state.yaml << HERE id: cxg_immune_cell_atlas output_dataset: !file dataset_mod1.h5ad output_solution: !file solution_mod1.h5ad diff --git a/src/api/base_method.yaml b/src/api/base_method.yaml new file mode 100644 index 00000000..ed3d5938 --- /dev/null +++ b/src/api/base_method.yaml @@ -0,0 +1,20 @@ +namespace: methods +info: + type: method + type_info: + label: Method + summary: A method for the batch integration task. + description: | + A batch integration method which integrates multiple datasets. +arguments: + - name: --input + __merge__: file_dataset.yaml + direction: input + required: true + - name: --output + __merge__: file_integrated.yaml + direction: output + required: true +test_resources: + - type: python_script + path: /common/component_tests/check_config.py diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml index 0ca176f6..b8e1ebd3 100644 --- a/src/api/comp_control_method.yaml +++ b/src/api/comp_control_method.yaml @@ -24,5 +24,5 @@ test_resources: path: /common/component_tests/check_config.py - type: python_script path: /common/component_tests/run_and_check_output.py - - path: /resources_test/task_batch_integration/cxg_mouse_pancreas_atlas - dest: resources_test/task_batch_integration/cxg_mouse_pancreas_atlas + - path: /resources_test/task_batch_integration/cxg_immune_cell_atlas + dest: resources_test/task_batch_integration/cxg_immune_cell_atlas diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml index c8480836..571c9565 100644 --- a/src/api/comp_method.yaml +++ b/src/api/comp_method.yaml @@ -2,5 +2,5 @@ __merge__: base_method.yaml test_resources: - type: python_script path: /common/component_tests/run_and_check_output.py - - path: /resources_test/task_batch_integration/cxg_mouse_pancreas_atlas - dest: resources_test/task_batch_integration/cxg_mouse_pancreas_atlas + - path: /resources_test/task_batch_integration/cxg_immune_cell_atlas + dest: resources_test/task_batch_integration/cxg_immune_cell_atlas diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml index 73eee377..bc57056a 100644 --- a/src/api/comp_metric.yaml +++ b/src/api/comp_metric.yaml @@ -24,5 +24,5 @@ test_resources: path: /common/component_tests/check_config.py - type: python_script path: /common/component_tests/run_and_check_output.py - - path: /resources_test/task_batch_integration/cxg_mouse_pancreas_atlas - dest: resources_test/task_batch_integration/cxg_mouse_pancreas_atlas + - path: /resources_test/task_batch_integration/cxg_immune_cell_atlas + dest: resources_test/task_batch_integration/cxg_immune_cell_atlas diff --git a/src/api/comp_process_dataset.yaml b/src/api/comp_process_dataset.yaml index b2b449aa..067a5c3d 100644 --- a/src/api/comp_process_dataset.yaml +++ b/src/api/comp_process_dataset.yaml @@ -25,7 +25,7 @@ arguments: default: 2000 required: 
false test_resources: - - path: /resources_test/common/cxg_mouse_pancreas_atlas/ - dest: resources_test/common/cxg_mouse_pancreas_atlas/ + - path: /resources_test/common/cxg_immune_cell_atlas/ + dest: resources_test/common/cxg_immune_cell_atlas/ - type: python_script - path: /common/component_tests/run_and_check_output.py \ No newline at end of file + path: /common/component_tests/run_and_check_output.py diff --git a/src/api/comp_transformer.yaml b/src/api/comp_transformer.yaml index eb347298..b68a9c37 100644 --- a/src/api/comp_transformer.yaml +++ b/src/api/comp_transformer.yaml @@ -6,7 +6,7 @@ info: summary: Check the output and transform to create additional output types description: | This component will: - + - Assert whether the input dataset and integrated dataset have the same shape. - Reorder the integrated dataset to match the input dataset if needed. - Transform the corrected feature output to an embedding. @@ -26,7 +26,7 @@ arguments: required: true multiple: true description: | - The expected output types of the batch integration method. + The expected output types of the batch integration method. choices: [ feature, embedding, graph ] - name: --output __merge__: file_integrated_full.yaml @@ -35,5 +35,5 @@ arguments: test_resources: - type: python_script path: /common/component_tests/run_and_check_output.py - - path: /resources_test/task_batch_integration/cxg_mouse_pancreas_atlas - dest: resources_test/task_batch_integration/cxg_mouse_pancreas_atlas + - path: /resources_test/task_batch_integration/cxg_immune_cell_atlas + dest: resources_test/task_batch_integration/cxg_immune_cell_atlas diff --git a/src/api/file_common_dataset.yaml b/src/api/file_common_dataset.yaml index 1399f0b2..171fdeb6 100644 --- a/src/api/file_common_dataset.yaml +++ b/src/api/file_common_dataset.yaml @@ -2,7 +2,7 @@ # `src/datasets/api/file_common_dataset.yaml`. However, some fields # such as obs.cell_type and obs.batch are now required type: file -example: "resources_test/common/cxg_mouse_pancreas_atlas/dataset.h5ad" +example: "resources_test/common/cxg_immune_cell_atlas/dataset.h5ad" label: "Common Dataset" summary: A subset of the common dataset. info: diff --git a/src/api/file_dataset.yaml b/src/api/file_dataset.yaml index 8f60192b..a76ae203 100644 --- a/src/api/file_dataset.yaml +++ b/src/api/file_dataset.yaml @@ -1,5 +1,5 @@ type: file -example: "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad" +example: "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad" label: "Dataset" summary: Unintegrated AnnData HDF5 file. info: diff --git a/src/api/file_integrated.yaml b/src/api/file_integrated.yaml index abd6df29..7920fcd0 100644 --- a/src/api/file_integrated.yaml +++ b/src/api/file_integrated.yaml @@ -1,5 +1,5 @@ type: file -example: "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated.h5ad" +example: "resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated.h5ad" label: Integration summary: An integrated AnnData dataset. 
description: | diff --git a/src/api/file_integrated_full.yaml b/src/api/file_integrated_full.yaml index 4d02f596..cdedb854 100644 --- a/src/api/file_integrated_full.yaml +++ b/src/api/file_integrated_full.yaml @@ -1,5 +1,5 @@ type: file -example: "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad" +example: "resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad" label: Transformed integration summary: An integrated AnnData dataset with additional outputs. description: | @@ -8,7 +8,7 @@ description: | - Feature: the corrected_counts layer - Embedding: the X_emb obsm - Graph: the connectivities and distances obsp - + The Graph should always be present, but the Feature and Embedding are optional. info: format: diff --git a/src/api/file_solution.yaml b/src/api/file_solution.yaml index 35e0c7ea..562bfa22 100644 --- a/src/api/file_solution.yaml +++ b/src/api/file_solution.yaml @@ -1,5 +1,5 @@ type: file -example: "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/solution.h5ad" +example: "resources_test/task_batch_integration/cxg_immune_cell_atlas/solution.h5ad" label: "Solution" summary: Uncensored dataset containing the true labels. info: diff --git a/src/control_methods/embed_cell_types/script.py b/src/control_methods/embed_cell_types/script.py index 5482d301..f6f1961b 100644 --- a/src/control_methods/embed_cell_types/script.py +++ b/src/control_methods/embed_cell_types/script.py @@ -2,11 +2,11 @@ ## VIASH START par = { - 'input_dataset': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', - 'input_solution': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/solution.h5ad', + 'input_dataset': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', + 'input_solution': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/solution.h5ad', 'output': 'output.h5ad', } -meta = { +meta = { 'functionality': 'foo', 'config': 'bar' } diff --git a/src/control_methods/embed_cell_types_jittered/script.py b/src/control_methods/embed_cell_types_jittered/script.py index 9ad3e743..06180464 100644 --- a/src/control_methods/embed_cell_types_jittered/script.py +++ b/src/control_methods/embed_cell_types_jittered/script.py @@ -4,13 +4,13 @@ ## VIASH START par = { - 'input_dataset': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', - 'input_solution': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/solution.h5ad', + 'input_dataset': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', + 'input_solution': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/solution.h5ad', 'output': 'output.h5ad', 'jitter': 0.01, } -meta = { +meta = { 'functionality': 'foo', 'config': 'bar' } diff --git a/src/control_methods/no_integration/script.py b/src/control_methods/no_integration/script.py index 0c1581be..df7b280d 100644 --- a/src/control_methods/no_integration/script.py +++ b/src/control_methods/no_integration/script.py @@ -2,7 +2,7 @@ ## VIASH START par = { - 'input_dataset': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input_dataset': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', } ## VIASH END diff --git a/src/control_methods/no_integration_batch/script.py b/src/control_methods/no_integration_batch/script.py index 8324acf9..1f62763c 100644 --- a/src/control_methods/no_integration_batch/script.py +++ 
b/src/control_methods/no_integration_batch/script.py @@ -5,11 +5,11 @@ ## VIASH START par = { - 'input_dataset': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input_dataset': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', } -meta = { +meta = { 'functionality': 'foo', 'config': 'bar' } @@ -46,4 +46,4 @@ print("Store outputs", flush=True) adata.uns['method_id'] = meta['name'] -adata.write_h5ad(par['output'], compression='gzip') \ No newline at end of file +adata.write_h5ad(par['output'], compression='gzip') diff --git a/src/control_methods/shuffle_integration/script.py b/src/control_methods/shuffle_integration/script.py index 91a542af..e1f29318 100644 --- a/src/control_methods/shuffle_integration/script.py +++ b/src/control_methods/shuffle_integration/script.py @@ -3,10 +3,10 @@ ## VIASH START par = { - 'input_dataset': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input_dataset': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', } -meta = { +meta = { "resources_dir": "src/tasks/batch_integration/control_methods/" } ## VIASH END diff --git a/src/control_methods/shuffle_integration_by_batch/script.py b/src/control_methods/shuffle_integration_by_batch/script.py index c7d35171..a9b63edc 100644 --- a/src/control_methods/shuffle_integration_by_batch/script.py +++ b/src/control_methods/shuffle_integration_by_batch/script.py @@ -3,10 +3,10 @@ ## VIASH START par = { - 'input_dataset': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input_dataset': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', } -meta = { +meta = { "resources_dir": "src/tasks/batch_integration/control_methods/" } ## VIASH END diff --git a/src/control_methods/shuffle_integration_by_cell_type/script.py b/src/control_methods/shuffle_integration_by_cell_type/script.py index 762bd07b..0df2ba46 100644 --- a/src/control_methods/shuffle_integration_by_cell_type/script.py +++ b/src/control_methods/shuffle_integration_by_cell_type/script.py @@ -3,10 +3,10 @@ ## VIASH START par = { - 'input_dataset': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input_dataset': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', } -meta = { +meta = { "resources_dir": "src/tasks/batch_integration/control_methods/" } ## VIASH END diff --git a/src/data_processors/transform/script.py b/src/data_processors/transform/script.py index dc01584a..226edca8 100644 --- a/src/data_processors/transform/script.py +++ b/src/data_processors/transform/script.py @@ -3,8 +3,8 @@ ## VIASH START par = { - "input_integrated": "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated.h5ad", - "input_dataset": "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad", + "input_integrated": "resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated.h5ad", + "input_dataset": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", "expected_method_types": ["feature"], "ouput": "output.h5ad" } @@ -28,7 +28,7 @@ if "corrected_counts" in integrated.layers.keys(): assert integrated.shape[1] == dataset.shape[1], "Number of genes do not match" - + if not integrated.var.index.equals(dataset.var.index): assert integrated.var.index.sort_values().equals(dataset.var.index.sort_values()), 
"Gene names do not match" print("Reordering genes", flush=True) diff --git a/src/methods/batchelor_fastmnn/script.R b/src/methods/batchelor_fastmnn/script.R index 76791bea..879aad68 100644 --- a/src/methods/batchelor_fastmnn/script.R +++ b/src/methods/batchelor_fastmnn/script.R @@ -8,7 +8,7 @@ suppressPackageStartupMessages({ ## VIASH START par <- list( - input = 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + input = 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', output = 'output.h5ad' ) meta <- list( diff --git a/src/methods/batchelor_mnn_correct/script.R b/src/methods/batchelor_mnn_correct/script.R index cadbcc82..4a8802af 100644 --- a/src/methods/batchelor_mnn_correct/script.R +++ b/src/methods/batchelor_mnn_correct/script.R @@ -7,7 +7,7 @@ suppressPackageStartupMessages({ }) ## VIASH START par <- list( - input = 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + input = 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', output = 'output.h5ad' ) meta <- list( diff --git a/src/methods/bbknn/script.py b/src/methods/bbknn/script.py index 86c807ed..9c121ccb 100644 --- a/src/methods/bbknn/script.py +++ b/src/methods/bbknn/script.py @@ -5,7 +5,7 @@ ## VIASH START par = { - 'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', 'annoy_n_trees': 10, 'neighbors_within_batch': 3, diff --git a/src/methods/combat/script.py b/src/methods/combat/script.py index 155c1621..ab251363 100644 --- a/src/methods/combat/script.py +++ b/src/methods/combat/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', } meta = { diff --git a/src/methods/harmony/script.R b/src/methods/harmony/script.R index e5cb2c5b..595e3f19 100644 --- a/src/methods/harmony/script.R +++ b/src/methods/harmony/script.R @@ -5,7 +5,7 @@ requireNamespace("harmony", quietly = TRUE) ## VIASH START par <- list( - input = 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + input = 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', output = 'output.h5ad' ) meta <- list( diff --git a/src/methods/harmonypy/script.py b/src/methods/harmonypy/script.py index 79b32537..ec851953 100644 --- a/src/methods/harmonypy/script.py +++ b/src/methods/harmonypy/script.py @@ -5,7 +5,7 @@ ## VIASH START par = { - "input": "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad", + "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", "output": "output.h5ad" } meta = { diff --git a/src/methods/liger/script.R b/src/methods/liger/script.R index 62dec598..e5b7e451 100644 --- a/src/methods/liger/script.R +++ b/src/methods/liger/script.R @@ -4,7 +4,7 @@ requireNamespace("rliger", quietly = TRUE) ## VIASH START par <- list( - input = "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad", + input = "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", output = "output.h5ad" ) meta <- list( diff --git a/src/methods/mnnpy/script.py b/src/methods/mnnpy/script.py index a9dfd8a8..7100da10 100644 --- a/src/methods/mnnpy/script.py +++ b/src/methods/mnnpy/script.py @@ -3,7 +3,7 @@ 
## VIASH START par = { - 'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', } meta = { diff --git a/src/methods/pyliger/script.py b/src/methods/pyliger/script.py index 603b6d04..c6bd5f0e 100644 --- a/src/methods/pyliger/script.py +++ b/src/methods/pyliger/script.py @@ -5,7 +5,7 @@ ## VIASH START par = { - 'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad' } meta = { @@ -31,7 +31,7 @@ adata_per_batch = [] for batch in adata.obs['batch'].unique(): adb = adata[adata.obs['batch'] == batch].copy() - + # save row sum and sum of squares for further use norm_sum = np.ravel(np.sum(adb.layers["norm_data"], axis=0)) norm_sum_sq = np.ravel(np.sum(adb.layers["norm_data"].power(2), axis=0)) diff --git a/src/methods/scalex/script.py b/src/methods/scalex/script.py index 887a989d..7d09f02f 100644 --- a/src/methods/scalex/script.py +++ b/src/methods/scalex/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', } meta = { diff --git a/src/methods/scanorama/script.py b/src/methods/scanorama/script.py index 8f99418c..2ddb91df 100644 --- a/src/methods/scanorama/script.py +++ b/src/methods/scanorama/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', } meta = { diff --git a/src/methods/scanvi/script.py b/src/methods/scanvi/script.py index 882d7ff6..5a17d2e9 100644 --- a/src/methods/scanvi/script.py +++ b/src/methods/scanvi/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', 'n_hvg': 2000, 'n_latent': 30, diff --git a/src/methods/scimilarity/script.py b/src/methods/scimilarity/script.py index 471464d5..7117a9d6 100644 --- a/src/methods/scimilarity/script.py +++ b/src/methods/scimilarity/script.py @@ -8,7 +8,7 @@ ## VIASH START par = { - "input": "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad", + "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", "output": "output.h5ad", "model": "model_v1.1", } diff --git a/src/methods/scvi/script.py b/src/methods/scvi/script.py index b6836b49..20f1cf32 100644 --- a/src/methods/scvi/script.py +++ b/src/methods/scvi/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', 'n_hvg': 2000, 'n_latent': 30, diff --git a/src/metrics/asw_batch/script.py b/src/metrics/asw_batch/script.py index d6dafcfe..4a7269da 100644 --- a/src/metrics/asw_batch/script.py +++ b/src/metrics/asw_batch/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'input_integrated': 
'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad', } meta = { diff --git a/src/metrics/asw_label/script.py b/src/metrics/asw_label/script.py index 499a06f9..e307aaac 100644 --- a/src/metrics/asw_label/script.py +++ b/src/metrics/asw_label/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad', } diff --git a/src/metrics/cell_cycle_conservation/script.py b/src/metrics/cell_cycle_conservation/script.py index 9ad38422..b254f4f8 100644 --- a/src/metrics/cell_cycle_conservation/script.py +++ b/src/metrics/cell_cycle_conservation/script.py @@ -5,7 +5,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad' } diff --git a/src/metrics/clustering_overlap/script.py b/src/metrics/clustering_overlap/script.py index 30fe1704..2254acb0 100644 --- a/src/metrics/clustering_overlap/script.py +++ b/src/metrics/clustering_overlap/script.py @@ -6,7 +6,7 @@ ## VIASH START par = { - 'adata_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'adata_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad', } @@ -50,4 +50,4 @@ ) print("Write data to file", flush=True) -output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file +output.write_h5ad(par["output"], compression="gzip") diff --git a/src/metrics/graph_connectivity/script.py b/src/metrics/graph_connectivity/script.py index 0c92a35a..6148884e 100644 --- a/src/metrics/graph_connectivity/script.py +++ b/src/metrics/graph_connectivity/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad', } meta = { diff --git a/src/metrics/hvg_overlap/script.py b/src/metrics/hvg_overlap/script.py index 8ecda9bc..b902fe08 100644 --- a/src/metrics/hvg_overlap/script.py +++ b/src/metrics/hvg_overlap/script.py @@ -4,8 +4,8 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', - 'input_solution': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/solution.h5ad', + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', + 'input_solution': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/solution.h5ad', 'output': 'output.h5ad', } meta = { diff --git a/src/metrics/isolated_label_asw/script.py b/src/metrics/isolated_label_asw/script.py index 39d23568..602e8d16 100644 --- a/src/metrics/isolated_label_asw/script.py +++ b/src/metrics/isolated_label_asw/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad', } @@ -46,4 +46,4 @@ ) print('Write data to 
file', flush=True) -output.write_h5ad(par['output'], compression='gzip') \ No newline at end of file +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/metrics/isolated_label_f1/script.py b/src/metrics/isolated_label_f1/script.py index a6529adb..2737f244 100644 --- a/src/metrics/isolated_label_f1/script.py +++ b/src/metrics/isolated_label_f1/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad', } @@ -45,4 +45,4 @@ ) print('Write data to file', flush=True) -output.write_h5ad(par['output'], compression='gzip') \ No newline at end of file +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/metrics/kbet/script.py b/src/metrics/kbet/script.py index 6c74c261..89bd799e 100644 --- a/src/metrics/kbet/script.py +++ b/src/metrics/kbet/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad', } @@ -46,4 +46,4 @@ ) print('Write data to file', flush=True) -output.write_h5ad(par['output'], compression='gzip') \ No newline at end of file +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/metrics/lisi/script.py b/src/metrics/lisi/script.py index b50f6e62..c0c564cd 100644 --- a/src/metrics/lisi/script.py +++ b/src/metrics/lisi/script.py @@ -5,7 +5,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad', } meta = { diff --git a/src/metrics/pcr/script.py b/src/metrics/pcr/script.py index 265ad430..0ae18ddb 100644 --- a/src/metrics/pcr/script.py +++ b/src/metrics/pcr/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad', } @@ -59,4 +59,4 @@ print('Write data to file', flush=True) -output.write_h5ad(par['output'], compression='gzip') \ No newline at end of file +output.write_h5ad(par['output'], compression='gzip') From 7b6fea33c1cf9970693c5ca8b57158ae77dd1b41 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 8 Oct 2024 15:16:11 +0200 Subject: [PATCH 08/23] Style SCimiliarity script --- src/methods/scimilarity/script.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/methods/scimilarity/script.py b/src/methods/scimilarity/script.py index 7117a9d6..2da1790e 100644 --- a/src/methods/scimilarity/script.py +++ b/src/methods/scimilarity/script.py @@ -1,11 +1,12 @@ -import sys -import anndata as ad -import scimilarity import os -import zipfile +import sys import tempfile +import zipfile import tarfile +import anndata as ad +import scimilarity + ## VIASH START par = { "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", @@ -38,20 +39,20 @@ if zipfile.is_zipfile(par["model"]): print("Extract SCimilarity model from .zip", flush=True) - with zipfile.ZipFile(par["model"], 'r') as zip_file: 
+ with zipfile.ZipFile(par["model"], "r") as zip_file: zip_file.extractall(model_dir) - elif tarfile.is_tarfile(par["model"]) and par["model"].endswith('.tar.gz'): + elif tarfile.is_tarfile(par["model"]) and par["model"].endswith(".tar.gz"): print("Extract SCimilarity model from .tar.gz", flush=True) - with tarfile.open(par["model"], 'r:gz') as tar_file: + with tarfile.open(par["model"], "r:gz") as tar_file: tar_file.extractall(model_dir) model_dir = os.path.join(model_dir, os.listdir(model_dir)[0]) else: - raise ValueError(f"The 'model' argument should be a directory a .zip file or a .tar.gz file") + raise ValueError( + f"The 'model' argument should be a directory a .zip file or a .tar.gz file" + ) print("Load SCimilarity model", flush=True) -scimilarity_embedding = scimilarity.cell_embedding.CellEmbedding( - model_path=model_dir -) +scimilarity_embedding = scimilarity.cell_embedding.CellEmbedding(model_path=model_dir) print("SCimilarity version:", scimilarity.__version__) print("Create input data", flush=True) From cca571570786dd2a55424b1cfb521a8c8504b5fe Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 8 Oct 2024 15:49:58 +0200 Subject: [PATCH 09/23] Remove test resources from SCimiliarity config --- src/methods/scimilarity/config.vsh.yaml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/methods/scimilarity/config.vsh.yaml b/src/methods/scimilarity/config.vsh.yaml index ae66ff58..02b6527c 100644 --- a/src/methods/scimilarity/config.vsh.yaml +++ b/src/methods/scimilarity/config.vsh.yaml @@ -21,13 +21,6 @@ resources: - type: python_script path: script.py - path: /src/utils/read_anndata_partial.py -test_resources: - - type: python_script - path: /common/component_tests/check_config.py - - type: python_script - path: /common/component_tests/run_and_check_output.py - - path: /resources_test/task_batch_integration/cxg_immune_cell_atlas - dest: resources_test/task_batch_integration/cxg_immune_cell_atlas engines: - type: docker image: openproblems/base_pytorch_nvidia:1.0.0 From 30e8b14b64230753abfede2e17e0866b06b0f125 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Fri, 11 Oct 2024 14:25:00 +0200 Subject: [PATCH 10/23] Fix file names in test resources state.yaml --- scripts/create_resources/test_resources.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index 8f458ccb..49d2fd93 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -40,10 +40,10 @@ viash run src/metrics/graph_connectivity/config.vsh.yaml -- \ # write the state file cat > $DATASET_DIR/cxg_immune_cell_atlas/state.yaml << HERE id: cxg_immune_cell_atlas -output_dataset: !file dataset_mod1.h5ad -output_solution: !file solution_mod1.h5ad -output_integrated: !file integrated_mod1.h5ad -output_integrated_full: !file integrated_full_mod1.h5ad +output_dataset: !file dataset.h5ad +output_solution: !file solution.h5ad +output_integrated: !file integrated.h5ad +output_integrated_full: !file integrated_full.h5ad output_score: !file score_mod1.h5ad HERE From b2188b5f9011c94a243ec758cf3b32ceb055d556 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Fri, 11 Oct 2024 14:26:22 +0200 Subject: [PATCH 11/23] Add scimilarity as dependency to benchmark workflow --- src/workflows/run_benchmark/config.vsh.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index f70f9b43..3ed43a1e 
100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -80,6 +80,7 @@ dependencies: - name: methods/scalex - name: methods/scanorama - name: methods/scanvi + - name: methods/scimilarity - name: methods/scvi # metrics - name: metrics/asw_batch From ca95e446a49c458c97179396846b7f30a22047c7 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 16 Oct 2024 15:30:24 +0200 Subject: [PATCH 12/23] Update compute environment --- scripts/create_resources/resources.sh | 2 +- scripts/run_benchmark/run_full_seqeracloud.sh | 2 +- scripts/run_benchmark/run_test_seqeracloud.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/create_resources/resources.sh b/scripts/create_resources/resources.sh index 58ac28a1..66b4eefb 100755 --- a/scripts/create_resources/resources.sh +++ b/scripts/create_resources/resources.sh @@ -19,7 +19,7 @@ tw launch https://github.com/openproblems-bio/task_batch_integration.git \ --pull-latest \ --main-script target/nextflow/workflows/process_datasets/main.nf \ --workspace 53907369739130 \ - --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --compute-env 6UWsS5iw7TI37saKo2wcMi \ --params-file /tmp/params.yaml \ --entry-name auto \ --config common/nextflow_helpers/labels_tw.config \ diff --git a/scripts/run_benchmark/run_full_seqeracloud.sh b/scripts/run_benchmark/run_full_seqeracloud.sh index 8f4bc92a..1e980239 100755 --- a/scripts/run_benchmark/run_full_seqeracloud.sh +++ b/scripts/run_benchmark/run_full_seqeracloud.sh @@ -25,7 +25,7 @@ tw launch https://github.com/openproblems-bio/task_batch_integration.git \ --pull-latest \ --main-script target/nextflow/workflows/run_benchmark/main.nf \ --workspace 53907369739130 \ - --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --compute-env 6UWsS5iw7TI37saKo2wcMi \ --params-file /tmp/params.yaml \ --entry-name auto \ --config common/nextflow_helpers/labels_tw.config \ diff --git a/scripts/run_benchmark/run_test_seqeracloud.sh b/scripts/run_benchmark/run_test_seqeracloud.sh index 64056313..3645ad0f 100755 --- a/scripts/run_benchmark/run_test_seqeracloud.sh +++ b/scripts/run_benchmark/run_test_seqeracloud.sh @@ -21,7 +21,7 @@ tw launch https://github.com/openproblems-bio/task_batch_integration.git \ --pull-latest \ --main-script target/nextflow/workflows/run_benchmark/main.nf \ --workspace 53907369739130 \ - --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --compute-env 6UWsS5iw7TI37saKo2wcMi \ --params-file /tmp/params.yaml \ --entry-name auto \ --config common/nextflow_helpers/labels_tw.config \ From ce57335b75899a40e4fbc433330f671c32f280df Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Mon, 21 Oct 2024 16:38:36 +0200 Subject: [PATCH 13/23] Update model file path --- src/workflows/run_benchmark/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index e6743d18..2eff6d8d 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -27,7 +27,7 @@ methods = [ scanorama, scanvi, scimilarity.run( - args: [model: file("https://zenodo.org/records/10685499/files/model_v1.1.tar.gz")] + args: [model: file("s3://openproblems-work/cache/scimilarity-model_v1.1.tar.gz")] ), scvi ] From c60a7daafa8422042c0328eaee027d223dd5f091 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 22 Oct 2024 08:42:14 +0200 Subject: [PATCH 14/23] Create geneformer files --- src/methods/geneformer/config.vsh.yaml | 44 ++++++++++++++++++++++++++ src/methods/geneformer/script.py | 31 
++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 src/methods/geneformer/config.vsh.yaml create mode 100644 src/methods/geneformer/script.py diff --git a/src/methods/geneformer/config.vsh.yaml b/src/methods/geneformer/config.vsh.yaml new file mode 100644 index 00000000..f38592ac --- /dev/null +++ b/src/methods/geneformer/config.vsh.yaml @@ -0,0 +1,44 @@ +__merge__: ../../api/comp_method.yaml + +name: geneformer +label: Geneformer +summary: Geneformer is a foundation transformer model pretrained on a large-scale corpus of single cell transcriptomes +description: | + Geneformer is a foundation transformer model pretrained on a large-scale + corpus of single cell transcriptomes to enable context-aware predictions in + network biology. For this task, Geneformer is used to create a batch-corrected + cell embedding. +references: + doi: + - 10.1038/s41586-023-06139-9 + - 10.1101/2024.08.16.608180 +links: + documentation: https://geneformer.readthedocs.io/en/latest/index.html + repository: https://huggingface.co/ctheodoris/Geneformer + +info: + preferred_normalization: counts + preferred_types: [embedding] + +arguments: + - name: "--model" + type: "file" + description: Path to a Geneformer model file + +resources: + - type: python_script + path: script.py + +engines: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pip: + - git+https://huggingface.co/ctheodoris/Geneformer.git + +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/methods/geneformer/script.py b/src/methods/geneformer/script.py new file mode 100644 index 00000000..168865cd --- /dev/null +++ b/src/methods/geneformer/script.py @@ -0,0 +1,31 @@ +import anndata as ad + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. +par = { + "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", + 'output': 'output.h5ad' +} +meta = { + 'name': 'my_python_method' +} +## VIASH END + +print('Reading input files', flush=True) +input = ad.read_h5ad(par['input']) + +print('Preprocess data', flush=True) +# ... preprocessing ... + +print('Train model', flush=True) +# ... train model ... + +print('Generate predictions', flush=True) +# ... generate predictions ... 
+ +print("Write output AnnData to file", flush=True) +output = ad.AnnData( + +) +output.write_h5ad(par['output'], compression='gzip') From 99a7078961df5180a872661b33e08e3c9c46bc45 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 22 Oct 2024 13:56:14 +0200 Subject: [PATCH 15/23] Set SCimilarity name in Python script --- src/methods/scimilarity/script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/methods/scimilarity/script.py b/src/methods/scimilarity/script.py index 2da1790e..761d59a5 100644 --- a/src/methods/scimilarity/script.py +++ b/src/methods/scimilarity/script.py @@ -14,7 +14,7 @@ "model": "model_v1.1", } meta = { - "name": "scvi", + "name": "scimilarity", } ## VIASH END From f4b98e137149a2426563620cb07d3f57e5a5aad1 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 22 Oct 2024 13:57:00 +0200 Subject: [PATCH 16/23] Adjust container settings Depend on base method config because of input model file --- src/methods/geneformer/config.vsh.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/methods/geneformer/config.vsh.yaml b/src/methods/geneformer/config.vsh.yaml index f38592ac..66287f01 100644 --- a/src/methods/geneformer/config.vsh.yaml +++ b/src/methods/geneformer/config.vsh.yaml @@ -1,4 +1,4 @@ -__merge__: ../../api/comp_method.yaml +__merge__: /src/api/base_method.yaml name: geneformer label: Geneformer @@ -31,14 +31,15 @@ resources: engines: - type: docker - image: openproblems/base_python:1.0.0 + image: openproblems/base_pytorch_nvidia:1.0.0 setup: - type: python pip: + - pyarrow<15.0.0a0,>=14.0.1 - git+https://huggingface.co/ctheodoris/Geneformer.git runners: - type: executable - type: nextflow directives: - label: [midtime, midmem, midcpu] + label: [midtime, midmem, midcpu, gpu] From fbaf8b17e36feea5fc68fbebe871ed702585839b Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 22 Oct 2024 14:46:07 +0200 Subject: [PATCH 17/23] Download dictionary files in script --- src/methods/geneformer/script.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/src/methods/geneformer/script.py b/src/methods/geneformer/script.py index 168865cd..8b20c6cb 100644 --- a/src/methods/geneformer/script.py +++ b/src/methods/geneformer/script.py @@ -1,20 +1,47 @@ import anndata as ad +from geneformer import TranscriptomeTokenizer +from tempfile import TemporaryDirectory +import os +import requests ## VIASH START # Note: this section is auto-generated by viash at runtime. To edit it, make changes # in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. 
par = { - "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", + 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad' } meta = { - 'name': 'my_python_method' + 'name': 'geneformer' } ## VIASH END print('Reading input files', flush=True) input = ad.read_h5ad(par['input']) +n_processors = os.cpu_count() + +# Mapping files for the 30M model +base_dictionary_url = "https://huggingface.co/ctheodoris/Geneformer/blob/main/geneformer/gene_dictionaries_30m" +dictionary_files = { + "ensembl_mapping" : "ensembl_mapping_dict_gc30M.pkl", + "gene_median" : "gene_median_dictionary_gc30M.pkl", + "gene_name_id" : "gene_name_id_dict_gc30M.pkl", + "token" : "token_dictionary_gc30M.pkl" +} +dictionary_dir = TemporaryDirectory() +for file in dictionary_files.values(): + url = os.path.join(base_dictionary_url, file) + response = requests.get(url) + with open(os.path.join(dictionary_dir.name, file), 'wb') as f: + f.write(response.content) + +# Set parameters for the 30M model +model_input_size = 2048 +special_token = False +tokenizer = TranscriptomeTokenizer(nproc = n_processors, model_input_size = model_input_size, special_token = special_token) +# tokenizer.tokenize_data(data_directory, output_directory, output_prefix, file_format = "anndata") + print('Preprocess data', flush=True) # ... preprocessing ... From b1d152014fa68ba985150f009c12ca16850c7de7 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 23 Oct 2024 13:54:05 +0200 Subject: [PATCH 18/23] Prepare and tokenize data, attempt to embed --- src/methods/geneformer/config.vsh.yaml | 2 + src/methods/geneformer/script.py | 133 ++++++++++++++++++------- 2 files changed, 101 insertions(+), 34 deletions(-) diff --git a/src/methods/geneformer/config.vsh.yaml b/src/methods/geneformer/config.vsh.yaml index 66287f01..07c89788 100644 --- a/src/methods/geneformer/config.vsh.yaml +++ b/src/methods/geneformer/config.vsh.yaml @@ -28,6 +28,7 @@ arguments: resources: - type: python_script path: script.py + - path: /src/utils/read_anndata_partial.py engines: - type: docker @@ -36,6 +37,7 @@ engines: - type: python pip: - pyarrow<15.0.0a0,>=14.0.1 + - huggingface_hub - git+https://huggingface.co/ctheodoris/Geneformer.git runners: diff --git a/src/methods/geneformer/script.py b/src/methods/geneformer/script.py index 8b20c6cb..7c9e78bb 100644 --- a/src/methods/geneformer/script.py +++ b/src/methods/geneformer/script.py @@ -1,58 +1,123 @@ import anndata as ad -from geneformer import TranscriptomeTokenizer -from tempfile import TemporaryDirectory +from geneformer import TranscriptomeTokenizer, EmbExtractor import os -import requests +import sys +from tempfile import TemporaryDirectory +from huggingface_hub import hf_hub_download +import numpy as np ## VIASH START # Note: this section is auto-generated by viash at runtime. To edit it, make changes # in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. 
par = { - 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', - 'output': 'output.h5ad' -} -meta = { - 'name': 'geneformer' + "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", + "output": "output.h5ad", } +meta = {"name": "geneformer"} ## VIASH END -print('Reading input files', flush=True) -input = ad.read_h5ad(par['input']) - n_processors = os.cpu_count() +print("Reading input", flush=True) +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + +adata = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns") + +if adata.uns["dataset_organism"] != "homo_sapiens": + raise ValueError( + f"Geneformer can only be used with human data " + f"(dataset_organism == \"{adata.uns['dataset_organism']}\")" + ) + +is_ensembl = all(var_name.startswith("ENSG") for var_name in adata.var_names) +if not is_ensembl: + raise ValueError(f"Geneformer requires adata.var_names to contain ENSEMBL gene ids") + +print("Creating working directory", flush=True) +work_dir = TemporaryDirectory() +input_dir = os.path.join(work_dir.name, "input") +os.makedirs(input_dir) +tokenized_dir = os.path.join(work_dir.name, "tokenized") +os.makedirs(tokenized_dir) +embedding_dir = os.path.join(work_dir.name, "embedding") +os.makedirs(embedding_dir) +print(f"Working directory: {work_dir.name}", flush=True) + +print("Preparing data", flush=True) +adata.var["ensembl_id"] = adata.var_names +adata.obs["n_counts"] = np.ravel(adata.X.sum(axis=1)) +adata.write_h5ad(os.path.join(input_dir, "input.h5ad")) +print(adata) + +print("Getting dictionary files", flush=True) # Mapping files for the 30M model -base_dictionary_url = "https://huggingface.co/ctheodoris/Geneformer/blob/main/geneformer/gene_dictionaries_30m" dictionary_files = { - "ensembl_mapping" : "ensembl_mapping_dict_gc30M.pkl", - "gene_median" : "gene_median_dictionary_gc30M.pkl", - "gene_name_id" : "gene_name_id_dict_gc30M.pkl", - "token" : "token_dictionary_gc30M.pkl" + "ensembl_mapping": hf_hub_download( + repo_id="ctheodoris/Geneformer", + subfolder="geneformer/gene_dictionaries_30m", + filename="ensembl_mapping_dict_gc30M.pkl", + ), + "gene_median": hf_hub_download( + repo_id="ctheodoris/Geneformer", + subfolder="geneformer/gene_dictionaries_30m", + filename="gene_median_dictionary_gc30M.pkl", + ), + "gene_name_id": hf_hub_download( + repo_id="ctheodoris/Geneformer", + subfolder="geneformer/gene_dictionaries_30m", + filename="gene_name_id_dict_gc30M.pkl", + ), + "token": hf_hub_download( + repo_id="ctheodoris/Geneformer", + subfolder="geneformer/gene_dictionaries_30m", + filename="token_dictionary_gc30M.pkl", + ), } -dictionary_dir = TemporaryDirectory() -for file in dictionary_files.values(): - url = os.path.join(base_dictionary_url, file) - response = requests.get(url) - with open(os.path.join(dictionary_dir.name, file), 'wb') as f: - f.write(response.content) +print("Tokenizing data", flush=True) # Set parameters for the 30M model model_input_size = 2048 special_token = False -tokenizer = TranscriptomeTokenizer(nproc = n_processors, model_input_size = model_input_size, special_token = special_token) -# tokenizer.tokenize_data(data_directory, output_directory, output_prefix, file_format = "anndata") +tokenizer = TranscriptomeTokenizer( + nproc=n_processors, + model_input_size=model_input_size, + special_token=special_token, + gene_median_file=dictionary_files["gene_median"], + token_dictionary_file=dictionary_files["token"], + 
gene_mapping_file=dictionary_files["ensembl_mapping"], +) -print('Preprocess data', flush=True) -# ... preprocessing ... +tokenizer.tokenize_data(input_dir, tokenized_dir, "tokenized", file_format="h5ad") + +print("Getting model files", flush=True) +model_files = { + "model": hf_hub_download( + repo_id="ctheodoris/Geneformer", + subfolder="gf-6L-30M-i2048", + filename="model.safetensors", + ), + "config": hf_hub_download( + repo_id="ctheodoris/Geneformer", + subfolder="gf-6L-30M-i2048", + filename="config.json", + ), +} +model_dir = os.path.dirname(model_files["model"]) -print('Train model', flush=True) -# ... train model ... +print("Extracting embeddings", flush=True) +embedder = EmbExtractor( + emb_mode="cell", max_ncells=None, token_dictionary_file=dictionary_files["token"] +) +embedder.extract_embs( + model_dir, + os.path.join(tokenized_dir, "tokenized.dataset"), + embedding_dir, + "embedding", +) -print('Generate predictions', flush=True) -# ... generate predictions ... +# TODO: Get embedding from output directory, store and save output print("Write output AnnData to file", flush=True) -output = ad.AnnData( - -) -output.write_h5ad(par['output'], compression='gzip') +output = ad.AnnData() +output.write_h5ad(par["output"], compression="gzip") From e3abd305fcff89c1a4e3cdbc12b3406fca490df5 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Mon, 28 Oct 2024 15:40:08 +0000 Subject: [PATCH 19/23] Store and output embedding --- src/methods/geneformer/script.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/methods/geneformer/script.py b/src/methods/geneformer/script.py index 7c9e78bb..70491de3 100644 --- a/src/methods/geneformer/script.py +++ b/src/methods/geneformer/script.py @@ -5,6 +5,7 @@ from tempfile import TemporaryDirectory from huggingface_hub import hf_hub_download import numpy as np +import pandas as pd ## VIASH START # Note: this section is auto-generated by viash at runtime. 
To edit it, make changes @@ -115,9 +116,21 @@ embedding_dir, "embedding", ) +embedding = pd.read_csv(os.path.join(embedding_dir, "embedding.csv")).to_numpy() -# TODO: Get embedding from output directory, store and save output +print("Store outputs", flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + obsm={ + "X_emb": embedding, + }, + uns={ + "dataset_id": adata.uns["dataset_id"], + "normalization_id": adata.uns["normalization_id"], + "method_id": meta["name"], + }, +) print("Write output AnnData to file", flush=True) -output = ad.AnnData() output.write_h5ad(par["output"], compression="gzip") From 1e00a52f853a11456a8a8ecec986a667d732edb7 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 29 Oct 2024 08:34:26 +0000 Subject: [PATCH 20/23] Add Geneformer to benchmark workflow --- src/methods/geneformer/config.vsh.yaml | 2 +- src/workflows/run_benchmark/config.vsh.yaml | 1 + src/workflows/run_benchmark/main.nf | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/methods/geneformer/config.vsh.yaml b/src/methods/geneformer/config.vsh.yaml index 07c89788..dc1b04eb 100644 --- a/src/methods/geneformer/config.vsh.yaml +++ b/src/methods/geneformer/config.vsh.yaml @@ -18,7 +18,7 @@ links: info: preferred_normalization: counts - preferred_types: [embedding] + method_types: [embedding] arguments: - name: "--model" diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index 51e482ab..d3cc2b55 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -85,6 +85,7 @@ dependencies: - name: methods/batchelor_mnn_correct - name: methods/bbknn - name: methods/combat + - name: methods/geneformer - name: methods/harmony - name: methods/harmonypy - name: methods/liger diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index 69322a1a..89564bd5 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -20,6 +20,7 @@ methods = [ batchelor_mnn_correct, bbknn, combat, + geneformer, harmony, harmonypy, liger, From 113a892ea5339a33f10a68b1d6427f9f2ff0f815 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 29 Oct 2024 09:36:11 +0000 Subject: [PATCH 21/23] Add argument to select model version to use --- src/methods/geneformer/config.vsh.yaml | 17 ++++- src/methods/geneformer/script.py | 93 +++++++++++++++----------- 2 files changed, 69 insertions(+), 41 deletions(-) diff --git a/src/methods/geneformer/config.vsh.yaml b/src/methods/geneformer/config.vsh.yaml index dc1b04eb..832614a4 100644 --- a/src/methods/geneformer/config.vsh.yaml +++ b/src/methods/geneformer/config.vsh.yaml @@ -1,4 +1,4 @@ -__merge__: /src/api/base_method.yaml +__merge__: /src/api/comp_method.yaml name: geneformer label: Geneformer @@ -19,11 +19,22 @@ links: info: preferred_normalization: counts method_types: [embedding] + variants: + geneformer_12L_95M_i4096: + model: "gf-12L-95M-i4096" + geneformer_6L_30M_i2048: + model: "gf-6L-30M-i2048" + geneformer_12L_30M_i2048: + model: "gf-12L-30M-i2048" + geneformer_20L_95M_i4096: + model: "gf-20L-95M-i4096" arguments: - name: "--model" - type: "file" - description: Path to a Geneformer model file + type: "string" + description: String representing the Geneformer model to use + choices: ["gf-6L-30M-i2048", "gf-12L-30M-i2048", "gf-12L-95M-i4096", "gf-20L-95M-i4096"] + default: "gf-12L-95M-i4096" resources: - type: python_script diff --git a/src/methods/geneformer/script.py 
b/src/methods/geneformer/script.py index 70491de3..6c9a373e 100644 --- a/src/methods/geneformer/script.py +++ b/src/methods/geneformer/script.py @@ -13,13 +13,14 @@ par = { "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", "output": "output.h5ad", + "model": "gf-12L-95M-i4096" } meta = {"name": "geneformer"} ## VIASH END n_processors = os.cpu_count() -print("Reading input", flush=True) +print(">>> Reading input...", flush=True) sys.path.append(meta["resources_dir"]) from read_anndata_partial import read_anndata @@ -28,85 +29,99 @@ if adata.uns["dataset_organism"] != "homo_sapiens": raise ValueError( f"Geneformer can only be used with human data " - f"(dataset_organism == \"{adata.uns['dataset_organism']}\")" + f"(dataset_organism == '{adata.uns['dataset_organism']}')" ) is_ensembl = all(var_name.startswith("ENSG") for var_name in adata.var_names) if not is_ensembl: raise ValueError(f"Geneformer requires adata.var_names to contain ENSEMBL gene ids") -print("Creating working directory", flush=True) -work_dir = TemporaryDirectory() -input_dir = os.path.join(work_dir.name, "input") -os.makedirs(input_dir) -tokenized_dir = os.path.join(work_dir.name, "tokenized") -os.makedirs(tokenized_dir) -embedding_dir = os.path.join(work_dir.name, "embedding") -os.makedirs(embedding_dir) -print(f"Working directory: {work_dir.name}", flush=True) +print(f">>> Getting settings for model '{par['model']}'...", flush=True) +model_split = par["model"].split("-") +model_details = { + "layers": model_split[1], + "dataset": model_split[2], + "input_size": int(model_split[3][1:]) +} +print(model_details, flush = True) -print("Preparing data", flush=True) -adata.var["ensembl_id"] = adata.var_names -adata.obs["n_counts"] = np.ravel(adata.X.sum(axis=1)) -adata.write_h5ad(os.path.join(input_dir, "input.h5ad")) -print(adata) +print(">>> Getting model dictionary files...", flush=True) +if model_details["dataset"] == "95M": + dictionaries_subfolder = "geneformer" +elif model_details["dataset"] == "30M": + dictionaries_subfolder = "geneformer/gene_dictionaries_30m" +else: + raise ValueError(f"Invalid model dataset: {model_details['dataset']}") +print(f"Dictionaries subfolder: '{dictionaries_subfolder}'") -print("Getting dictionary files", flush=True) -# Mapping files for the 30M model dictionary_files = { "ensembl_mapping": hf_hub_download( repo_id="ctheodoris/Geneformer", - subfolder="geneformer/gene_dictionaries_30m", - filename="ensembl_mapping_dict_gc30M.pkl", + subfolder=dictionaries_subfolder, + filename=f"ensembl_mapping_dict_gc{model_details['dataset']}.pkl", ), "gene_median": hf_hub_download( repo_id="ctheodoris/Geneformer", - subfolder="geneformer/gene_dictionaries_30m", - filename="gene_median_dictionary_gc30M.pkl", + subfolder=dictionaries_subfolder, + filename=f"gene_median_dictionary_gc{model_details['dataset']}.pkl", ), "gene_name_id": hf_hub_download( repo_id="ctheodoris/Geneformer", - subfolder="geneformer/gene_dictionaries_30m", - filename="gene_name_id_dict_gc30M.pkl", + subfolder=dictionaries_subfolder, + filename=f"gene_name_id_dict_gc{model_details['dataset']}.pkl", ), "token": hf_hub_download( repo_id="ctheodoris/Geneformer", - subfolder="geneformer/gene_dictionaries_30m", - filename="token_dictionary_gc30M.pkl", + subfolder=dictionaries_subfolder, + filename=f"token_dictionary_gc{model_details['dataset']}.pkl", ), } -print("Tokenizing data", flush=True) -# Set parameters for the 30M model -model_input_size = 2048 -special_token = False +print(">>> Creating working 
directory...", flush=True) +work_dir = TemporaryDirectory() +input_dir = os.path.join(work_dir.name, "input") +os.makedirs(input_dir) +tokenized_dir = os.path.join(work_dir.name, "tokenized") +os.makedirs(tokenized_dir) +embedding_dir = os.path.join(work_dir.name, "embedding") +os.makedirs(embedding_dir) +print(f"Working directory: '{work_dir.name}'", flush=True) + +print(">>> Preparing data...", flush=True) +adata.var["ensembl_id"] = adata.var_names +adata.obs["n_counts"] = np.ravel(adata.X.sum(axis=1)) +adata.write_h5ad(os.path.join(input_dir, "input.h5ad")) +print(adata) + +print(">>> Tokenizing data...", flush=True) +special_token = model_details['dataset'] == "95M" +print(f"Input size: {model_details['input_size']}, Special token: {special_token}") tokenizer = TranscriptomeTokenizer( nproc=n_processors, - model_input_size=model_input_size, + model_input_size=model_details["input_size"], special_token=special_token, gene_median_file=dictionary_files["gene_median"], token_dictionary_file=dictionary_files["token"], gene_mapping_file=dictionary_files["ensembl_mapping"], ) - tokenizer.tokenize_data(input_dir, tokenized_dir, "tokenized", file_format="h5ad") -print("Getting model files", flush=True) +print(f">>> Getting model files for model '{par['model']}'...", flush=True) model_files = { "model": hf_hub_download( repo_id="ctheodoris/Geneformer", - subfolder="gf-6L-30M-i2048", + subfolder=par["model"], filename="model.safetensors", ), "config": hf_hub_download( repo_id="ctheodoris/Geneformer", - subfolder="gf-6L-30M-i2048", + subfolder=par["model"], filename="config.json", ), } model_dir = os.path.dirname(model_files["model"]) -print("Extracting embeddings", flush=True) +print(">>> Extracting embeddings...", flush=True) embedder = EmbExtractor( emb_mode="cell", max_ncells=None, token_dictionary_file=dictionary_files["token"] ) @@ -118,7 +133,7 @@ ) embedding = pd.read_csv(os.path.join(embedding_dir, "embedding.csv")).to_numpy() -print("Store outputs", flush=True) +print(">>> Storing outputs...", flush=True) output = ad.AnnData( obs=adata.obs[[]], var=adata.var[[]], @@ -131,6 +146,8 @@ "method_id": meta["name"], }, ) +print(output) -print("Write output AnnData to file", flush=True) +print(">>> Writing output AnnData to file...", flush=True) output.write_h5ad(par["output"], compression="gzip") +print(">>> Done!") From f26eb248a81bbedefdf39e8e198dc4d88c320b8c Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 29 Oct 2024 11:00:37 +0100 Subject: [PATCH 22/23] Style Geneformer script --- src/methods/geneformer/script.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/methods/geneformer/script.py b/src/methods/geneformer/script.py index 6c9a373e..eeab4332 100644 --- a/src/methods/geneformer/script.py +++ b/src/methods/geneformer/script.py @@ -1,11 +1,12 @@ -import anndata as ad -from geneformer import TranscriptomeTokenizer, EmbExtractor import os import sys from tempfile import TemporaryDirectory -from huggingface_hub import hf_hub_download + +import anndata as ad import numpy as np import pandas as pd +from geneformer import EmbExtractor, TranscriptomeTokenizer +from huggingface_hub import hf_hub_download ## VIASH START # Note: this section is auto-generated by viash at runtime. 
To edit it, make changes @@ -13,7 +14,7 @@ par = { "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", "output": "output.h5ad", - "model": "gf-12L-95M-i4096" + "model": "gf-12L-95M-i4096", } meta = {"name": "geneformer"} ## VIASH END @@ -41,9 +42,9 @@ model_details = { "layers": model_split[1], "dataset": model_split[2], - "input_size": int(model_split[3][1:]) + "input_size": int(model_split[3][1:]), } -print(model_details, flush = True) +print(model_details, flush=True) print(">>> Getting model dictionary files...", flush=True) if model_details["dataset"] == "95M": @@ -94,7 +95,7 @@ print(adata) print(">>> Tokenizing data...", flush=True) -special_token = model_details['dataset'] == "95M" +special_token = model_details["dataset"] == "95M" print(f"Input size: {model_details['input_size']}, Special token: {special_token}") tokenizer = TranscriptomeTokenizer( nproc=n_processors, From 98789b13ed933e28a57557b5a1ed7081390a5579 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 30 Oct 2024 09:40:33 +0100 Subject: [PATCH 23/23] Make Geneformer inherit from base_method for tests --- src/methods/geneformer/config.vsh.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/methods/geneformer/config.vsh.yaml b/src/methods/geneformer/config.vsh.yaml index 832614a4..d571a4ad 100644 --- a/src/methods/geneformer/config.vsh.yaml +++ b/src/methods/geneformer/config.vsh.yaml @@ -1,4 +1,4 @@ -__merge__: /src/api/comp_method.yaml +__merge__: /src/api/base_method.yaml name: geneformer label: Geneformer
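
For reference, the output written in [PATCH 19/23] follows the task's embedding convention: an AnnData carrying empty obs/var frames aligned to the input, the Geneformer cell embedding in obsm["X_emb"], and dataset_id, normalization_id and method_id recorded in uns. A minimal sanity check of that structure, assuming only anndata and a locally produced output.h5ad from a test run, could look like the sketch below; it is illustrative and not part of the component.

    import anndata as ad

    # Read the component output and confirm it has the fields the downstream
    # transform/metric steps expect (sketch only; assumes ./output.h5ad exists).
    adata_out = ad.read_h5ad("output.h5ad")

    assert "X_emb" in adata_out.obsm, "cell embedding missing from obsm"
    assert adata_out.obsm["X_emb"].shape[0] == adata_out.n_obs, "expected one embedding row per cell"
    for key in ("dataset_id", "normalization_id", "method_id"):
        assert key in adata_out.uns, f"missing uns key: {key}"
    print("Embedding shape:", adata_out.obsm["X_emb"].shape)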
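
The model-variant handling added in [PATCH 21/23] relies on the model name alone: a string such as "gf-12L-95M-i4096" encodes the layer count, the pretraining corpus (30M or 95M cells) and the context length, and the corpus decides whether the special CLS/EOS token and the geneformer/gene_dictionaries_30m files are used. The standalone sketch below mirrors that parsing logic; the helper name parse_geneformer_model is hypothetical and not part of the component.

    # Dependency-free sketch of the model-string parsing used in
    # src/methods/geneformer/script.py (hypothetical helper, for illustration only).
    def parse_geneformer_model(model: str) -> dict:
        """Split e.g. 'gf-12L-95M-i4096' into layer count, pretraining corpus and input size."""
        _, layers, dataset, input_size = model.split("-")
        return {
            "layers": layers,                   # e.g. '12L'
            "dataset": dataset,                 # '30M' or '95M' pretraining corpus
            "input_size": int(input_size[1:]),  # strip the leading 'i' -> 2048 or 4096
            # 95M models use the special CLS/EOS token and the default dictionary files;
            # 30M models read theirs from the gene_dictionaries_30m subfolder instead.
            "special_token": dataset == "95M",
        }

    for model in ("gf-6L-30M-i2048", "gf-12L-30M-i2048", "gf-12L-95M-i4096", "gf-20L-95M-i4096"):
        print(model, parse_geneformer_model(model))

Switching the --model argument from a file ([PATCH 14/23]) to a string with fixed choices keeps the component self-contained: the matching dictionaries and weights are fetched at runtime via hf_hub_download rather than being supplied as an input file.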