From f47eff90560a78402ebbc3ce00c857051fe2cfa1 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 2 Oct 2024 11:31:29 +0200 Subject: [PATCH 01/23] Add cxg_immune_cell_atlas as a test resource --- _viash.yaml | 11 +++++---- scripts/create_resources/test_resources.sh | 26 ++++++++++++++++++++++ 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/_viash.yaml b/_viash.yaml index 8a0d18ea..d7ef3700 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -31,21 +31,24 @@ description: | references: doi: - # Luecken, M.D., Büttner, M., Chaichoompu, K. et al. - # Benchmarking atlas-level data integration in single-cell genomics. Nat Methods 19, 41–50 (2022). + # Luecken, M.D., Büttner, M., Chaichoompu, K. et al. + # Benchmarking atlas-level data integration in single-cell genomics. Nat Methods 19, 41–50 (2022). - 10.1038/s41592-021-01336-8 - + info: image: thumbnail.svg test_resources: - type: s3 path: s3://openproblems-data/resources_test/common/cxg_mouse_pancreas_atlas/ dest: resources_test/common/cxg_mouse_pancreas_atlas + - type: s3 + path: s3://openproblems-data/resources_test/common/cxg_immune_cell_atlas/ + dest: resources_test/common/cxg_immune_cell_atlas - type: s3 path: s3://openproblems-data/resources_test/task_batch_integration/ dest: resources_test/task_batch_integration -authors: +authors: - name: Michaela Mueller roles: [ maintainer, author ] info: diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index 92694692..f5369253 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -19,11 +19,20 @@ viash run src/data_processors/process_dataset/config.vsh.yaml -- \ --output_dataset "$DATASET_DIR/cxg_mouse_pancreas_atlas/dataset.h5ad" \ --output_solution "$DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad" +viash run src/data_processors/process_dataset/config.vsh.yaml -- \ + --input "$RAW_DATA/cxg_immune_cell_atlas/dataset.h5ad" \ + --output_dataset "$DATASET_DIR/cxg_immune_cell_atlas/dataset.h5ad" \ + --output_solution "$DATASET_DIR/cxg_immune_cell_atlas/solution.h5ad" + # run one method viash run src/methods/combat/config.vsh.yaml -- \ --input $DATASET_DIR/cxg_mouse_pancreas_atlas/dataset.h5ad \ --output $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated.h5ad +viash run src/methods/combat/config.vsh.yaml -- \ + --input $DATASET_DIR/cxg_immune_cell_atlas/dataset.h5ad \ + --output $DATASET_DIR/cxg_immune_cell_atlas/integrated.h5ad + # run transformer viash run src/data_processors/transform/config.vsh.yaml -- \ --input_integrated $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated.h5ad \ @@ -31,12 +40,23 @@ viash run src/data_processors/transform/config.vsh.yaml -- \ --expected_method_types feature \ --output $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated_full.h5ad +viash run src/data_processors/transform/config.vsh.yaml -- \ + --input_integrated $DATASET_DIR/cxg_immune_cell_atlas/integrated.h5ad \ + --input_dataset $DATASET_DIR/cxg_immune_cell_atlas/dataset.h5ad \ + --expected_method_types feature \ + --output $DATASET_DIR/cxg_immune_cell_atlas/integrated_full.h5ad + # run one metric viash run src/metrics/graph_connectivity/config.vsh.yaml -- \ --input_integrated $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated_full.h5ad \ --input_solution $DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad \ --output $DATASET_DIR/cxg_mouse_pancreas_atlas/score.h5ad +viash run src/metrics/graph_connectivity/config.vsh.yaml -- \ + --input_integrated $DATASET_DIR/cxg_immune_cell_atlas/integrated_full.h5ad \ 
+ --input_solution $DATASET_DIR/cxg_immune_cell_atlas/solution.h5ad \ + --output $DATASET_DIR/cxg_immune_cell_atlas/score.h5ad + # write the state file cat > $DATASET_DIR/state.yaml << HERE id: cxg_mouse_pancreas_atlas @@ -45,6 +65,12 @@ output_solution: !file solution.h5ad output_integrated: !file integrated.h5ad output_integrated_full: !file integrated_full.h5ad output_score: !file score.h5ad +id: cxg_immune_cell_atlas +output_dataset: !file dataset_mod1.h5ad +output_solution: !file solution_mod1.h5ad +output_integrated: !file integrated_mod1.h5ad +output_integrated_full: !file integrated_full_mod1.h5ad +output_score: !file score_mod1.h5ad HERE # only run this if you have access to the openproblems-data bucket From 175a3f82d38e58e3bc7a0e82c744b32e6fad8ab8 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 2 Oct 2024 11:32:02 +0200 Subject: [PATCH 02/23] Add SCimilarity component --- src/methods/scimilarity/config.vsh.yaml | 41 ++++++++++++ src/methods/scimilarity/script.py | 84 +++++++++++++++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 src/methods/scimilarity/config.vsh.yaml create mode 100644 src/methods/scimilarity/script.py diff --git a/src/methods/scimilarity/config.vsh.yaml b/src/methods/scimilarity/config.vsh.yaml new file mode 100644 index 00000000..88258dc6 --- /dev/null +++ b/src/methods/scimilarity/config.vsh.yaml @@ -0,0 +1,41 @@ +__merge__: /src/api/comp_method.yaml +name: scimilarity +label: SCimilarity +summary: SCimilarity provides unifying representation of single cell expression profiles +description: | + SCimilarity is a unifying representation of single cell expression profiles that quantifies similarity between expression states and generalizes to represent new studies without additional training +references: + doi: 10.1101/2023.07.18.549537 +links: + repository: https://github.com/Genentech/scimilarity + documentation: https://genentech.github.io/scimilarity/index.html +info: + method_types: [embedding] + preferred_normalization: counts +arguments: + - name: --model + type: file + description: Path to the directory containing SCimilarity models + required: true +resources: + - type: python_script + path: script.py + - path: /src/utils/read_anndata_partial.py +test_resources: + - type: python_script + path: /common/component_tests/check_config.py + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /resources_test/task_batch_integration/cxg_immune_cell_atlas + dest: resources_test/task_batch_integration/cxg_immune_cell_atlas +engines: + - type: docker + image: openproblems/base_pytorch_nvidia:1.0.0 + setup: + - type: python + github: Genentech/scimilarity +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] diff --git a/src/methods/scimilarity/script.py b/src/methods/scimilarity/script.py new file mode 100644 index 00000000..050700e6 --- /dev/null +++ b/src/methods/scimilarity/script.py @@ -0,0 +1,84 @@ +import sys +import anndata as ad +import scimilarity + +## VIASH START +par = { + "input": "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad", + "output": "output.h5ad", + "model": "model_v1.1", +} +meta = { + "name": "scvi", +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + +print("Read input", flush=True) +adata = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns") + +if adata.uns["dataset_organism"] != "homo_sapiens": + raise ValueError( + 
f"SCimilarity can only be used with human data " + f"(dataset_organism == \"{adata.uns['dataset_organism']}\")" + ) + +print("Load SCimilarity model", flush=True) +scimilarity_embedding = scimilarity.cell_embedding.CellEmbedding( + model_path=par["model"] +) +print("SCimilarity version:", scimilarity.__version__) + +print("Create input data", flush=True) +# Some of the functions modify the adata so make sure we have a copy +input = ad.AnnData(X=adata.X.copy(), layers={"counts": adata.X.copy()}) +# Set input.var_names to gene symbols +input.var_names = adata.var["feature_name"] + +print("Align datasets", flush=True) + +# Check the number of genes in the dataset and reduce the overlap threshold if +# necessary (mostly for subsampled test datasets) +gene_overlap_threshold = 5000 +if 0.8 * input.n_vars < gene_overlap_threshold: + from warnings import warn + + warn( + f"The number of genes in the dataset ({input.n_vars}) " + f"is less than or close to {gene_overlap_threshold}. " + f"Setting gene_overlap_threshold to 0.8 * n_var ({int(0.8 * input.n_vars)})." + ) + gene_overlap_threshold = int(0.8 * input.n_vars) + +input = scimilarity.utils.align_dataset( + input, + scimilarity_embedding.gene_order, + gene_overlap_threshold=gene_overlap_threshold, +) +input = scimilarity.utils.consolidate_duplicate_symbols(input) + +print("Normalizing dataset", flush=True) +input = scimilarity.utils.lognorm_counts(input) + +print("Get cell embeddings", flush=True) +cell_embeddings = scimilarity_embedding.get_embeddings(input.X) + +print("Store outputs", flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + obsm={ + "X_emb": cell_embeddings, + }, + uns={ + "dataset_id": adata.uns["dataset_id"], + "normalization_id": adata.uns["normalization_id"], + "method_id": meta["name"], + }, +) +print(output) + +print("Write output to file", flush=True) +output.write_h5ad(par["output"], compression="gzip") From a9931e197bee0b33ceca038a9b4c46e2b71be4c4 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 2 Oct 2024 11:32:53 +0200 Subject: [PATCH 03/23] Add SCimiliarity to benchmark workflow --- src/workflows/run_benchmark/main.nf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index ff77ad8d..a68002c6 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -26,6 +26,7 @@ methods = [ scalex, scanorama, scanvi, + scimilarity, scvi ] @@ -55,7 +56,7 @@ workflow run_wf { ****************************/ dataset_ch = input_ch // store join id - | map{ id, state -> + | map{ id, state -> [id, state + ["_meta": [join_id: id]]] } @@ -153,7 +154,7 @@ workflow run_wf { }, // use 'fromState' to fetch the arguments the component requires from the overall state fromState: [ - input_solution: "input_solution", + input_solution: "input_solution", input_integrated: "method_output_cleaned" ], // use 'toState' to publish that component's outputs to the overall state From 84394c37a4c0cad696a5c338dc0e393db454ec56 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 8 Oct 2024 14:49:29 +0200 Subject: [PATCH 04/23] Update script to extract model --- src/methods/scimilarity/config.vsh.yaml | 2 +- src/methods/scimilarity/script.py | 29 ++++++++++++++++++++++++- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/methods/scimilarity/config.vsh.yaml b/src/methods/scimilarity/config.vsh.yaml index 88258dc6..9c429a6e 100644 --- a/src/methods/scimilarity/config.vsh.yaml +++ 
b/src/methods/scimilarity/config.vsh.yaml @@ -15,7 +15,7 @@ info: arguments: - name: --model type: file - description: Path to the directory containing SCimilarity models + description: Path to the directory containing SCimilarity models or a .zip/.tar.gz archive required: true resources: - type: python_script diff --git a/src/methods/scimilarity/script.py b/src/methods/scimilarity/script.py index 050700e6..471464d5 100644 --- a/src/methods/scimilarity/script.py +++ b/src/methods/scimilarity/script.py @@ -1,6 +1,10 @@ import sys import anndata as ad import scimilarity +import os +import zipfile +import tempfile +import tarfile ## VIASH START par = { @@ -25,9 +29,28 @@ f"(dataset_organism == \"{adata.uns['dataset_organism']}\")" ) +if os.path.isdir(par["model"]): + model_temp = None + model_dir = par["model"] +else: + model_temp = tempfile.TemporaryDirectory() + model_dir = model_temp.name + + if zipfile.is_zipfile(par["model"]): + print("Extract SCimilarity model from .zip", flush=True) + with zipfile.ZipFile(par["model"], 'r') as zip_file: + zip_file.extractall(model_dir) + elif tarfile.is_tarfile(par["model"]) and par["model"].endswith('.tar.gz'): + print("Extract SCimilarity model from .tar.gz", flush=True) + with tarfile.open(par["model"], 'r:gz') as tar_file: + tar_file.extractall(model_dir) + model_dir = os.path.join(model_dir, os.listdir(model_dir)[0]) + else: + raise ValueError(f"The 'model' argument should be a directory a .zip file or a .tar.gz file") + print("Load SCimilarity model", flush=True) scimilarity_embedding = scimilarity.cell_embedding.CellEmbedding( - model_path=par["model"] + model_path=model_dir ) print("SCimilarity version:", scimilarity.__version__) @@ -82,3 +105,7 @@ print("Write output to file", flush=True) output.write_h5ad(par["output"], compression="gzip") + +if model_temp is not None: + print("Cleanup model directory", flush=True) + model_temp.cleanup() From 5e0038c8477e074fff87d2e9cc452e34831ed757 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 8 Oct 2024 14:55:05 +0200 Subject: [PATCH 05/23] Add SCimilarity model path to benchmark workflow --- src/workflows/run_benchmark/main.nf | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index a68002c6..e6743d18 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -26,7 +26,9 @@ methods = [ scalex, scanorama, scanvi, - scimilarity, + scimilarity.run( + args: [model: file("https://zenodo.org/records/10685499/files/model_v1.1.tar.gz")] + ), scvi ] From c927cb1aaa2e707b6fb9527f71ac2ac638cc559f Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 8 Oct 2024 15:01:30 +0200 Subject: [PATCH 06/23] Add base_method API to disable tests for SCimilarity --- src/api/comp_method.yaml | 20 +------------------- src/methods/scimilarity/config.vsh.yaml | 2 +- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml index dda52ce0..c8480836 100644 --- a/src/api/comp_method.yaml +++ b/src/api/comp_method.yaml @@ -1,23 +1,5 @@ -namespace: methods -info: - type: method - type_info: - label: Method - summary: A method for the batch integration task. - description: | - A batch integration method which integrates multiple datasets. 
-arguments: - - name: --input - __merge__: file_dataset.yaml - direction: input - required: true - - name: --output - __merge__: file_integrated.yaml - direction: output - required: true +__merge__: base_method.yaml test_resources: - - type: python_script - path: /common/component_tests/check_config.py - type: python_script path: /common/component_tests/run_and_check_output.py - path: /resources_test/task_batch_integration/cxg_mouse_pancreas_atlas diff --git a/src/methods/scimilarity/config.vsh.yaml b/src/methods/scimilarity/config.vsh.yaml index 9c429a6e..ae66ff58 100644 --- a/src/methods/scimilarity/config.vsh.yaml +++ b/src/methods/scimilarity/config.vsh.yaml @@ -1,4 +1,4 @@ -__merge__: /src/api/comp_method.yaml +__merge__: /src/api/base_method.yaml name: scimilarity label: SCimilarity summary: SCimilarity provides unifying representation of single cell expression profiles From 5c74f375e6dc11082e6965918831665645dad963 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 8 Oct 2024 15:10:35 +0200 Subject: [PATCH 07/23] Replace cxg_mouse_pancreas_atlas with cxg_immune_cell_atlas --- README.md | 11 ++++---- _viash.yaml | 3 -- scripts/create_resources/test_resources.sh | 28 +------------------ src/api/base_method.yaml | 20 +++++++++++++ src/api/comp_control_method.yaml | 4 +-- src/api/comp_method.yaml | 4 +-- src/api/comp_metric.yaml | 4 +-- src/api/comp_process_dataset.yaml | 6 ++-- src/api/comp_transformer.yaml | 8 +++--- src/api/file_common_dataset.yaml | 2 +- src/api/file_dataset.yaml | 2 +- src/api/file_integrated.yaml | 2 +- src/api/file_integrated_full.yaml | 4 +-- src/api/file_solution.yaml | 2 +- .../embed_cell_types/script.py | 6 ++-- .../embed_cell_types_jittered/script.py | 6 ++-- src/control_methods/no_integration/script.py | 2 +- .../no_integration_batch/script.py | 6 ++-- .../shuffle_integration/script.py | 4 +-- .../shuffle_integration_by_batch/script.py | 4 +-- .../script.py | 4 +-- src/data_processors/transform/script.py | 6 ++-- src/methods/batchelor_fastmnn/script.R | 2 +- src/methods/batchelor_mnn_correct/script.R | 2 +- src/methods/bbknn/script.py | 2 +- src/methods/combat/script.py | 2 +- src/methods/harmony/script.R | 2 +- src/methods/harmonypy/script.py | 2 +- src/methods/liger/script.R | 2 +- src/methods/mnnpy/script.py | 2 +- src/methods/pyliger/script.py | 4 +-- src/methods/scalex/script.py | 2 +- src/methods/scanorama/script.py | 2 +- src/methods/scanvi/script.py | 2 +- src/methods/scimilarity/script.py | 2 +- src/methods/scvi/script.py | 2 +- src/metrics/asw_batch/script.py | 2 +- src/metrics/asw_label/script.py | 2 +- src/metrics/cell_cycle_conservation/script.py | 2 +- src/metrics/clustering_overlap/script.py | 4 +-- src/metrics/graph_connectivity/script.py | 2 +- src/metrics/hvg_overlap/script.py | 4 +-- src/metrics/isolated_label_asw/script.py | 4 +-- src/metrics/isolated_label_f1/script.py | 4 +-- src/metrics/kbet/script.py | 4 +-- src/metrics/lisi/script.py | 2 +- src/metrics/pcr/script.py | 4 +-- 47 files changed, 96 insertions(+), 106 deletions(-) create mode 100644 src/api/base_method.yaml diff --git a/README.md b/README.md index 3a460e2b..50c16e34 100644 --- a/README.md +++ b/README.md @@ -91,8 +91,7 @@ flowchart TB A subset of the common dataset. -Example file: -`resources_test/common/cxg_mouse_pancreas_atlas/dataset.h5ad` +Example file: `resources_test/common/cxg_immune_cell_atlas/dataset.h5ad` Format: @@ -158,7 +157,7 @@ Arguments: Unintegrated AnnData HDF5 file. 
Example file: -`resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad` +`resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad` Format: @@ -202,7 +201,7 @@ Data structure: Uncensored dataset containing the true labels. Example file: -`resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/solution.h5ad` +`resources_test/task_batch_integration/cxg_immune_cell_atlas/solution.h5ad` Format: @@ -317,7 +316,7 @@ Arguments: An integrated AnnData dataset. Example file: -`resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated.h5ad` +`resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated.h5ad` Description: @@ -362,7 +361,7 @@ Data structure: An integrated AnnData dataset with additional outputs. Example file: -`resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad` +`resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad` Description: diff --git a/_viash.yaml b/_viash.yaml index d7ef3700..1598a220 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -38,9 +38,6 @@ references: info: image: thumbnail.svg test_resources: - - type: s3 - path: s3://openproblems-data/resources_test/common/cxg_mouse_pancreas_atlas/ - dest: resources_test/common/cxg_mouse_pancreas_atlas - type: s3 path: s3://openproblems-data/resources_test/common/cxg_immune_cell_atlas/ dest: resources_test/common/cxg_immune_cell_atlas diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index f5369253..8f458ccb 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -14,32 +14,17 @@ DATASET_DIR=resources_test/task_batch_integration mkdir -p $DATASET_DIR # process dataset -viash run src/data_processors/process_dataset/config.vsh.yaml -- \ - --input "$RAW_DATA/cxg_mouse_pancreas_atlas/dataset.h5ad" \ - --output_dataset "$DATASET_DIR/cxg_mouse_pancreas_atlas/dataset.h5ad" \ - --output_solution "$DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad" - viash run src/data_processors/process_dataset/config.vsh.yaml -- \ --input "$RAW_DATA/cxg_immune_cell_atlas/dataset.h5ad" \ --output_dataset "$DATASET_DIR/cxg_immune_cell_atlas/dataset.h5ad" \ --output_solution "$DATASET_DIR/cxg_immune_cell_atlas/solution.h5ad" # run one method -viash run src/methods/combat/config.vsh.yaml -- \ - --input $DATASET_DIR/cxg_mouse_pancreas_atlas/dataset.h5ad \ - --output $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated.h5ad - viash run src/methods/combat/config.vsh.yaml -- \ --input $DATASET_DIR/cxg_immune_cell_atlas/dataset.h5ad \ --output $DATASET_DIR/cxg_immune_cell_atlas/integrated.h5ad # run transformer -viash run src/data_processors/transform/config.vsh.yaml -- \ - --input_integrated $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated.h5ad \ - --input_dataset $DATASET_DIR/cxg_mouse_pancreas_atlas/dataset.h5ad \ - --expected_method_types feature \ - --output $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated_full.h5ad - viash run src/data_processors/transform/config.vsh.yaml -- \ --input_integrated $DATASET_DIR/cxg_immune_cell_atlas/integrated.h5ad \ --input_dataset $DATASET_DIR/cxg_immune_cell_atlas/dataset.h5ad \ @@ -47,24 +32,13 @@ viash run src/data_processors/transform/config.vsh.yaml -- \ --output $DATASET_DIR/cxg_immune_cell_atlas/integrated_full.h5ad # run one metric -viash run src/metrics/graph_connectivity/config.vsh.yaml -- \ - --input_integrated $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated_full.h5ad \ - --input_solution 
$DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad \ - --output $DATASET_DIR/cxg_mouse_pancreas_atlas/score.h5ad - viash run src/metrics/graph_connectivity/config.vsh.yaml -- \ --input_integrated $DATASET_DIR/cxg_immune_cell_atlas/integrated_full.h5ad \ --input_solution $DATASET_DIR/cxg_immune_cell_atlas/solution.h5ad \ --output $DATASET_DIR/cxg_immune_cell_atlas/score.h5ad # write the state file -cat > $DATASET_DIR/state.yaml << HERE -id: cxg_mouse_pancreas_atlas -output_dataset: !file dataset.h5ad -output_solution: !file solution.h5ad -output_integrated: !file integrated.h5ad -output_integrated_full: !file integrated_full.h5ad -output_score: !file score.h5ad +cat > $DATASET_DIR/cxg_immune_cell_atlas/state.yaml << HERE id: cxg_immune_cell_atlas output_dataset: !file dataset_mod1.h5ad output_solution: !file solution_mod1.h5ad diff --git a/src/api/base_method.yaml b/src/api/base_method.yaml new file mode 100644 index 00000000..ed3d5938 --- /dev/null +++ b/src/api/base_method.yaml @@ -0,0 +1,20 @@ +namespace: methods +info: + type: method + type_info: + label: Method + summary: A method for the batch integration task. + description: | + A batch integration method which integrates multiple datasets. +arguments: + - name: --input + __merge__: file_dataset.yaml + direction: input + required: true + - name: --output + __merge__: file_integrated.yaml + direction: output + required: true +test_resources: + - type: python_script + path: /common/component_tests/check_config.py diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml index 0ca176f6..b8e1ebd3 100644 --- a/src/api/comp_control_method.yaml +++ b/src/api/comp_control_method.yaml @@ -24,5 +24,5 @@ test_resources: path: /common/component_tests/check_config.py - type: python_script path: /common/component_tests/run_and_check_output.py - - path: /resources_test/task_batch_integration/cxg_mouse_pancreas_atlas - dest: resources_test/task_batch_integration/cxg_mouse_pancreas_atlas + - path: /resources_test/task_batch_integration/cxg_immune_cell_atlas + dest: resources_test/task_batch_integration/cxg_immune_cell_atlas diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml index c8480836..571c9565 100644 --- a/src/api/comp_method.yaml +++ b/src/api/comp_method.yaml @@ -2,5 +2,5 @@ __merge__: base_method.yaml test_resources: - type: python_script path: /common/component_tests/run_and_check_output.py - - path: /resources_test/task_batch_integration/cxg_mouse_pancreas_atlas - dest: resources_test/task_batch_integration/cxg_mouse_pancreas_atlas + - path: /resources_test/task_batch_integration/cxg_immune_cell_atlas + dest: resources_test/task_batch_integration/cxg_immune_cell_atlas diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml index 73eee377..bc57056a 100644 --- a/src/api/comp_metric.yaml +++ b/src/api/comp_metric.yaml @@ -24,5 +24,5 @@ test_resources: path: /common/component_tests/check_config.py - type: python_script path: /common/component_tests/run_and_check_output.py - - path: /resources_test/task_batch_integration/cxg_mouse_pancreas_atlas - dest: resources_test/task_batch_integration/cxg_mouse_pancreas_atlas + - path: /resources_test/task_batch_integration/cxg_immune_cell_atlas + dest: resources_test/task_batch_integration/cxg_immune_cell_atlas diff --git a/src/api/comp_process_dataset.yaml b/src/api/comp_process_dataset.yaml index b2b449aa..067a5c3d 100644 --- a/src/api/comp_process_dataset.yaml +++ b/src/api/comp_process_dataset.yaml @@ -25,7 +25,7 @@ arguments: default: 2000 required: 
false test_resources: - - path: /resources_test/common/cxg_mouse_pancreas_atlas/ - dest: resources_test/common/cxg_mouse_pancreas_atlas/ + - path: /resources_test/common/cxg_immune_cell_atlas/ + dest: resources_test/common/cxg_immune_cell_atlas/ - type: python_script - path: /common/component_tests/run_and_check_output.py \ No newline at end of file + path: /common/component_tests/run_and_check_output.py diff --git a/src/api/comp_transformer.yaml b/src/api/comp_transformer.yaml index eb347298..b68a9c37 100644 --- a/src/api/comp_transformer.yaml +++ b/src/api/comp_transformer.yaml @@ -6,7 +6,7 @@ info: summary: Check the output and transform to create additional output types description: | This component will: - + - Assert whether the input dataset and integrated dataset have the same shape. - Reorder the integrated dataset to match the input dataset if needed. - Transform the corrected feature output to an embedding. @@ -26,7 +26,7 @@ arguments: required: true multiple: true description: | - The expected output types of the batch integration method. + The expected output types of the batch integration method. choices: [ feature, embedding, graph ] - name: --output __merge__: file_integrated_full.yaml @@ -35,5 +35,5 @@ arguments: test_resources: - type: python_script path: /common/component_tests/run_and_check_output.py - - path: /resources_test/task_batch_integration/cxg_mouse_pancreas_atlas - dest: resources_test/task_batch_integration/cxg_mouse_pancreas_atlas + - path: /resources_test/task_batch_integration/cxg_immune_cell_atlas + dest: resources_test/task_batch_integration/cxg_immune_cell_atlas diff --git a/src/api/file_common_dataset.yaml b/src/api/file_common_dataset.yaml index 1399f0b2..171fdeb6 100644 --- a/src/api/file_common_dataset.yaml +++ b/src/api/file_common_dataset.yaml @@ -2,7 +2,7 @@ # `src/datasets/api/file_common_dataset.yaml`. However, some fields # such as obs.cell_type and obs.batch are now required type: file -example: "resources_test/common/cxg_mouse_pancreas_atlas/dataset.h5ad" +example: "resources_test/common/cxg_immune_cell_atlas/dataset.h5ad" label: "Common Dataset" summary: A subset of the common dataset. info: diff --git a/src/api/file_dataset.yaml b/src/api/file_dataset.yaml index 8f60192b..a76ae203 100644 --- a/src/api/file_dataset.yaml +++ b/src/api/file_dataset.yaml @@ -1,5 +1,5 @@ type: file -example: "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad" +example: "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad" label: "Dataset" summary: Unintegrated AnnData HDF5 file. info: diff --git a/src/api/file_integrated.yaml b/src/api/file_integrated.yaml index abd6df29..7920fcd0 100644 --- a/src/api/file_integrated.yaml +++ b/src/api/file_integrated.yaml @@ -1,5 +1,5 @@ type: file -example: "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated.h5ad" +example: "resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated.h5ad" label: Integration summary: An integrated AnnData dataset. 
description: | diff --git a/src/api/file_integrated_full.yaml b/src/api/file_integrated_full.yaml index 4d02f596..cdedb854 100644 --- a/src/api/file_integrated_full.yaml +++ b/src/api/file_integrated_full.yaml @@ -1,5 +1,5 @@ type: file -example: "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad" +example: "resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad" label: Transformed integration summary: An integrated AnnData dataset with additional outputs. description: | @@ -8,7 +8,7 @@ description: | - Feature: the corrected_counts layer - Embedding: the X_emb obsm - Graph: the connectivities and distances obsp - + The Graph should always be present, but the Feature and Embedding are optional. info: format: diff --git a/src/api/file_solution.yaml b/src/api/file_solution.yaml index 35e0c7ea..562bfa22 100644 --- a/src/api/file_solution.yaml +++ b/src/api/file_solution.yaml @@ -1,5 +1,5 @@ type: file -example: "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/solution.h5ad" +example: "resources_test/task_batch_integration/cxg_immune_cell_atlas/solution.h5ad" label: "Solution" summary: Uncensored dataset containing the true labels. info: diff --git a/src/control_methods/embed_cell_types/script.py b/src/control_methods/embed_cell_types/script.py index 5482d301..f6f1961b 100644 --- a/src/control_methods/embed_cell_types/script.py +++ b/src/control_methods/embed_cell_types/script.py @@ -2,11 +2,11 @@ ## VIASH START par = { - 'input_dataset': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', - 'input_solution': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/solution.h5ad', + 'input_dataset': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', + 'input_solution': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/solution.h5ad', 'output': 'output.h5ad', } -meta = { +meta = { 'functionality': 'foo', 'config': 'bar' } diff --git a/src/control_methods/embed_cell_types_jittered/script.py b/src/control_methods/embed_cell_types_jittered/script.py index 9ad3e743..06180464 100644 --- a/src/control_methods/embed_cell_types_jittered/script.py +++ b/src/control_methods/embed_cell_types_jittered/script.py @@ -4,13 +4,13 @@ ## VIASH START par = { - 'input_dataset': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', - 'input_solution': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/solution.h5ad', + 'input_dataset': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', + 'input_solution': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/solution.h5ad', 'output': 'output.h5ad', 'jitter': 0.01, } -meta = { +meta = { 'functionality': 'foo', 'config': 'bar' } diff --git a/src/control_methods/no_integration/script.py b/src/control_methods/no_integration/script.py index 0c1581be..df7b280d 100644 --- a/src/control_methods/no_integration/script.py +++ b/src/control_methods/no_integration/script.py @@ -2,7 +2,7 @@ ## VIASH START par = { - 'input_dataset': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input_dataset': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', } ## VIASH END diff --git a/src/control_methods/no_integration_batch/script.py b/src/control_methods/no_integration_batch/script.py index 8324acf9..1f62763c 100644 --- a/src/control_methods/no_integration_batch/script.py +++ 
b/src/control_methods/no_integration_batch/script.py @@ -5,11 +5,11 @@ ## VIASH START par = { - 'input_dataset': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input_dataset': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', } -meta = { +meta = { 'functionality': 'foo', 'config': 'bar' } @@ -46,4 +46,4 @@ print("Store outputs", flush=True) adata.uns['method_id'] = meta['name'] -adata.write_h5ad(par['output'], compression='gzip') \ No newline at end of file +adata.write_h5ad(par['output'], compression='gzip') diff --git a/src/control_methods/shuffle_integration/script.py b/src/control_methods/shuffle_integration/script.py index 91a542af..e1f29318 100644 --- a/src/control_methods/shuffle_integration/script.py +++ b/src/control_methods/shuffle_integration/script.py @@ -3,10 +3,10 @@ ## VIASH START par = { - 'input_dataset': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input_dataset': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', } -meta = { +meta = { "resources_dir": "src/tasks/batch_integration/control_methods/" } ## VIASH END diff --git a/src/control_methods/shuffle_integration_by_batch/script.py b/src/control_methods/shuffle_integration_by_batch/script.py index c7d35171..a9b63edc 100644 --- a/src/control_methods/shuffle_integration_by_batch/script.py +++ b/src/control_methods/shuffle_integration_by_batch/script.py @@ -3,10 +3,10 @@ ## VIASH START par = { - 'input_dataset': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input_dataset': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', } -meta = { +meta = { "resources_dir": "src/tasks/batch_integration/control_methods/" } ## VIASH END diff --git a/src/control_methods/shuffle_integration_by_cell_type/script.py b/src/control_methods/shuffle_integration_by_cell_type/script.py index 762bd07b..0df2ba46 100644 --- a/src/control_methods/shuffle_integration_by_cell_type/script.py +++ b/src/control_methods/shuffle_integration_by_cell_type/script.py @@ -3,10 +3,10 @@ ## VIASH START par = { - 'input_dataset': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input_dataset': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', } -meta = { +meta = { "resources_dir": "src/tasks/batch_integration/control_methods/" } ## VIASH END diff --git a/src/data_processors/transform/script.py b/src/data_processors/transform/script.py index dc01584a..226edca8 100644 --- a/src/data_processors/transform/script.py +++ b/src/data_processors/transform/script.py @@ -3,8 +3,8 @@ ## VIASH START par = { - "input_integrated": "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated.h5ad", - "input_dataset": "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad", + "input_integrated": "resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated.h5ad", + "input_dataset": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", "expected_method_types": ["feature"], "ouput": "output.h5ad" } @@ -28,7 +28,7 @@ if "corrected_counts" in integrated.layers.keys(): assert integrated.shape[1] == dataset.shape[1], "Number of genes do not match" - + if not integrated.var.index.equals(dataset.var.index): assert integrated.var.index.sort_values().equals(dataset.var.index.sort_values()), 
"Gene names do not match" print("Reordering genes", flush=True) diff --git a/src/methods/batchelor_fastmnn/script.R b/src/methods/batchelor_fastmnn/script.R index 76791bea..879aad68 100644 --- a/src/methods/batchelor_fastmnn/script.R +++ b/src/methods/batchelor_fastmnn/script.R @@ -8,7 +8,7 @@ suppressPackageStartupMessages({ ## VIASH START par <- list( - input = 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + input = 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', output = 'output.h5ad' ) meta <- list( diff --git a/src/methods/batchelor_mnn_correct/script.R b/src/methods/batchelor_mnn_correct/script.R index cadbcc82..4a8802af 100644 --- a/src/methods/batchelor_mnn_correct/script.R +++ b/src/methods/batchelor_mnn_correct/script.R @@ -7,7 +7,7 @@ suppressPackageStartupMessages({ }) ## VIASH START par <- list( - input = 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + input = 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', output = 'output.h5ad' ) meta <- list( diff --git a/src/methods/bbknn/script.py b/src/methods/bbknn/script.py index 86c807ed..9c121ccb 100644 --- a/src/methods/bbknn/script.py +++ b/src/methods/bbknn/script.py @@ -5,7 +5,7 @@ ## VIASH START par = { - 'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', 'annoy_n_trees': 10, 'neighbors_within_batch': 3, diff --git a/src/methods/combat/script.py b/src/methods/combat/script.py index 155c1621..ab251363 100644 --- a/src/methods/combat/script.py +++ b/src/methods/combat/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', } meta = { diff --git a/src/methods/harmony/script.R b/src/methods/harmony/script.R index e5cb2c5b..595e3f19 100644 --- a/src/methods/harmony/script.R +++ b/src/methods/harmony/script.R @@ -5,7 +5,7 @@ requireNamespace("harmony", quietly = TRUE) ## VIASH START par <- list( - input = 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + input = 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', output = 'output.h5ad' ) meta <- list( diff --git a/src/methods/harmonypy/script.py b/src/methods/harmonypy/script.py index 79b32537..ec851953 100644 --- a/src/methods/harmonypy/script.py +++ b/src/methods/harmonypy/script.py @@ -5,7 +5,7 @@ ## VIASH START par = { - "input": "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad", + "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", "output": "output.h5ad" } meta = { diff --git a/src/methods/liger/script.R b/src/methods/liger/script.R index 62dec598..e5b7e451 100644 --- a/src/methods/liger/script.R +++ b/src/methods/liger/script.R @@ -4,7 +4,7 @@ requireNamespace("rliger", quietly = TRUE) ## VIASH START par <- list( - input = "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad", + input = "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", output = "output.h5ad" ) meta <- list( diff --git a/src/methods/mnnpy/script.py b/src/methods/mnnpy/script.py index a9dfd8a8..7100da10 100644 --- a/src/methods/mnnpy/script.py +++ b/src/methods/mnnpy/script.py @@ -3,7 +3,7 @@ 
## VIASH START par = { - 'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', } meta = { diff --git a/src/methods/pyliger/script.py b/src/methods/pyliger/script.py index 603b6d04..c6bd5f0e 100644 --- a/src/methods/pyliger/script.py +++ b/src/methods/pyliger/script.py @@ -5,7 +5,7 @@ ## VIASH START par = { - 'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad' } meta = { @@ -31,7 +31,7 @@ adata_per_batch = [] for batch in adata.obs['batch'].unique(): adb = adata[adata.obs['batch'] == batch].copy() - + # save row sum and sum of squares for further use norm_sum = np.ravel(np.sum(adb.layers["norm_data"], axis=0)) norm_sum_sq = np.ravel(np.sum(adb.layers["norm_data"].power(2), axis=0)) diff --git a/src/methods/scalex/script.py b/src/methods/scalex/script.py index 887a989d..7d09f02f 100644 --- a/src/methods/scalex/script.py +++ b/src/methods/scalex/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', } meta = { diff --git a/src/methods/scanorama/script.py b/src/methods/scanorama/script.py index 8f99418c..2ddb91df 100644 --- a/src/methods/scanorama/script.py +++ b/src/methods/scanorama/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', } meta = { diff --git a/src/methods/scanvi/script.py b/src/methods/scanvi/script.py index 882d7ff6..5a17d2e9 100644 --- a/src/methods/scanvi/script.py +++ b/src/methods/scanvi/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', 'n_hvg': 2000, 'n_latent': 30, diff --git a/src/methods/scimilarity/script.py b/src/methods/scimilarity/script.py index 471464d5..7117a9d6 100644 --- a/src/methods/scimilarity/script.py +++ b/src/methods/scimilarity/script.py @@ -8,7 +8,7 @@ ## VIASH START par = { - "input": "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad", + "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", "output": "output.h5ad", "model": "model_v1.1", } diff --git a/src/methods/scvi/script.py b/src/methods/scvi/script.py index b6836b49..20f1cf32 100644 --- a/src/methods/scvi/script.py +++ b/src/methods/scvi/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', 'n_hvg': 2000, 'n_latent': 30, diff --git a/src/metrics/asw_batch/script.py b/src/metrics/asw_batch/script.py index d6dafcfe..4a7269da 100644 --- a/src/metrics/asw_batch/script.py +++ b/src/metrics/asw_batch/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'input_integrated': 
'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad', } meta = { diff --git a/src/metrics/asw_label/script.py b/src/metrics/asw_label/script.py index 499a06f9..e307aaac 100644 --- a/src/metrics/asw_label/script.py +++ b/src/metrics/asw_label/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad', } diff --git a/src/metrics/cell_cycle_conservation/script.py b/src/metrics/cell_cycle_conservation/script.py index 9ad38422..b254f4f8 100644 --- a/src/metrics/cell_cycle_conservation/script.py +++ b/src/metrics/cell_cycle_conservation/script.py @@ -5,7 +5,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad' } diff --git a/src/metrics/clustering_overlap/script.py b/src/metrics/clustering_overlap/script.py index 30fe1704..2254acb0 100644 --- a/src/metrics/clustering_overlap/script.py +++ b/src/metrics/clustering_overlap/script.py @@ -6,7 +6,7 @@ ## VIASH START par = { - 'adata_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'adata_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad', } @@ -50,4 +50,4 @@ ) print("Write data to file", flush=True) -output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file +output.write_h5ad(par["output"], compression="gzip") diff --git a/src/metrics/graph_connectivity/script.py b/src/metrics/graph_connectivity/script.py index 0c92a35a..6148884e 100644 --- a/src/metrics/graph_connectivity/script.py +++ b/src/metrics/graph_connectivity/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad', } meta = { diff --git a/src/metrics/hvg_overlap/script.py b/src/metrics/hvg_overlap/script.py index 8ecda9bc..b902fe08 100644 --- a/src/metrics/hvg_overlap/script.py +++ b/src/metrics/hvg_overlap/script.py @@ -4,8 +4,8 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', - 'input_solution': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/solution.h5ad', + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', + 'input_solution': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/solution.h5ad', 'output': 'output.h5ad', } meta = { diff --git a/src/metrics/isolated_label_asw/script.py b/src/metrics/isolated_label_asw/script.py index 39d23568..602e8d16 100644 --- a/src/metrics/isolated_label_asw/script.py +++ b/src/metrics/isolated_label_asw/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad', } @@ -46,4 +46,4 @@ ) print('Write data to 
file', flush=True) -output.write_h5ad(par['output'], compression='gzip') \ No newline at end of file +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/metrics/isolated_label_f1/script.py b/src/metrics/isolated_label_f1/script.py index a6529adb..2737f244 100644 --- a/src/metrics/isolated_label_f1/script.py +++ b/src/metrics/isolated_label_f1/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad', } @@ -45,4 +45,4 @@ ) print('Write data to file', flush=True) -output.write_h5ad(par['output'], compression='gzip') \ No newline at end of file +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/metrics/kbet/script.py b/src/metrics/kbet/script.py index 6c74c261..89bd799e 100644 --- a/src/metrics/kbet/script.py +++ b/src/metrics/kbet/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad', } @@ -46,4 +46,4 @@ ) print('Write data to file', flush=True) -output.write_h5ad(par['output'], compression='gzip') \ No newline at end of file +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/metrics/lisi/script.py b/src/metrics/lisi/script.py index b50f6e62..c0c564cd 100644 --- a/src/metrics/lisi/script.py +++ b/src/metrics/lisi/script.py @@ -5,7 +5,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad', } meta = { diff --git a/src/metrics/pcr/script.py b/src/metrics/pcr/script.py index 265ad430..0ae18ddb 100644 --- a/src/metrics/pcr/script.py +++ b/src/metrics/pcr/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad', } @@ -59,4 +59,4 @@ print('Write data to file', flush=True) -output.write_h5ad(par['output'], compression='gzip') \ No newline at end of file +output.write_h5ad(par['output'], compression='gzip') From 7b6fea33c1cf9970693c5ca8b57158ae77dd1b41 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 8 Oct 2024 15:16:11 +0200 Subject: [PATCH 08/23] Style SCimiliarity script --- src/methods/scimilarity/script.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/methods/scimilarity/script.py b/src/methods/scimilarity/script.py index 7117a9d6..2da1790e 100644 --- a/src/methods/scimilarity/script.py +++ b/src/methods/scimilarity/script.py @@ -1,11 +1,12 @@ -import sys -import anndata as ad -import scimilarity import os -import zipfile +import sys import tempfile +import zipfile import tarfile +import anndata as ad +import scimilarity + ## VIASH START par = { "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", @@ -38,20 +39,20 @@ if zipfile.is_zipfile(par["model"]): print("Extract SCimilarity model from .zip", flush=True) - with zipfile.ZipFile(par["model"], 'r') as zip_file: 
+ with zipfile.ZipFile(par["model"], "r") as zip_file: zip_file.extractall(model_dir) - elif tarfile.is_tarfile(par["model"]) and par["model"].endswith('.tar.gz'): + elif tarfile.is_tarfile(par["model"]) and par["model"].endswith(".tar.gz"): print("Extract SCimilarity model from .tar.gz", flush=True) - with tarfile.open(par["model"], 'r:gz') as tar_file: + with tarfile.open(par["model"], "r:gz") as tar_file: tar_file.extractall(model_dir) model_dir = os.path.join(model_dir, os.listdir(model_dir)[0]) else: - raise ValueError(f"The 'model' argument should be a directory a .zip file or a .tar.gz file") + raise ValueError( + f"The 'model' argument should be a directory a .zip file or a .tar.gz file" + ) print("Load SCimilarity model", flush=True) -scimilarity_embedding = scimilarity.cell_embedding.CellEmbedding( - model_path=model_dir -) +scimilarity_embedding = scimilarity.cell_embedding.CellEmbedding(model_path=model_dir) print("SCimilarity version:", scimilarity.__version__) print("Create input data", flush=True) From cca571570786dd2a55424b1cfb521a8c8504b5fe Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 8 Oct 2024 15:49:58 +0200 Subject: [PATCH 09/23] Remove test resources from SCimiliarity config --- src/methods/scimilarity/config.vsh.yaml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/methods/scimilarity/config.vsh.yaml b/src/methods/scimilarity/config.vsh.yaml index ae66ff58..02b6527c 100644 --- a/src/methods/scimilarity/config.vsh.yaml +++ b/src/methods/scimilarity/config.vsh.yaml @@ -21,13 +21,6 @@ resources: - type: python_script path: script.py - path: /src/utils/read_anndata_partial.py -test_resources: - - type: python_script - path: /common/component_tests/check_config.py - - type: python_script - path: /common/component_tests/run_and_check_output.py - - path: /resources_test/task_batch_integration/cxg_immune_cell_atlas - dest: resources_test/task_batch_integration/cxg_immune_cell_atlas engines: - type: docker image: openproblems/base_pytorch_nvidia:1.0.0 From 30e8b14b64230753abfede2e17e0866b06b0f125 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Fri, 11 Oct 2024 14:25:00 +0200 Subject: [PATCH 10/23] Fix file names in test resources state.yaml --- scripts/create_resources/test_resources.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index 8f458ccb..49d2fd93 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -40,10 +40,10 @@ viash run src/metrics/graph_connectivity/config.vsh.yaml -- \ # write the state file cat > $DATASET_DIR/cxg_immune_cell_atlas/state.yaml << HERE id: cxg_immune_cell_atlas -output_dataset: !file dataset_mod1.h5ad -output_solution: !file solution_mod1.h5ad -output_integrated: !file integrated_mod1.h5ad -output_integrated_full: !file integrated_full_mod1.h5ad +output_dataset: !file dataset.h5ad +output_solution: !file solution.h5ad +output_integrated: !file integrated.h5ad +output_integrated_full: !file integrated_full.h5ad output_score: !file score_mod1.h5ad HERE From b2188b5f9011c94a243ec758cf3b32ceb055d556 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Fri, 11 Oct 2024 14:26:22 +0200 Subject: [PATCH 11/23] Add scimilarity as dependency to benchmark workflow --- src/workflows/run_benchmark/config.vsh.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index f70f9b43..3ed43a1e 
100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -80,6 +80,7 @@ dependencies: - name: methods/scalex - name: methods/scanorama - name: methods/scanvi + - name: methods/scimilarity - name: methods/scvi # metrics - name: metrics/asw_batch From ca95e446a49c458c97179396846b7f30a22047c7 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 16 Oct 2024 15:30:24 +0200 Subject: [PATCH 12/23] Update compute environment --- scripts/create_resources/resources.sh | 2 +- scripts/run_benchmark/run_full_seqeracloud.sh | 2 +- scripts/run_benchmark/run_test_seqeracloud.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/create_resources/resources.sh b/scripts/create_resources/resources.sh index 58ac28a1..66b4eefb 100755 --- a/scripts/create_resources/resources.sh +++ b/scripts/create_resources/resources.sh @@ -19,7 +19,7 @@ tw launch https://github.com/openproblems-bio/task_batch_integration.git \ --pull-latest \ --main-script target/nextflow/workflows/process_datasets/main.nf \ --workspace 53907369739130 \ - --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --compute-env 6UWsS5iw7TI37saKo2wcMi \ --params-file /tmp/params.yaml \ --entry-name auto \ --config common/nextflow_helpers/labels_tw.config \ diff --git a/scripts/run_benchmark/run_full_seqeracloud.sh b/scripts/run_benchmark/run_full_seqeracloud.sh index 8f4bc92a..1e980239 100755 --- a/scripts/run_benchmark/run_full_seqeracloud.sh +++ b/scripts/run_benchmark/run_full_seqeracloud.sh @@ -25,7 +25,7 @@ tw launch https://github.com/openproblems-bio/task_batch_integration.git \ --pull-latest \ --main-script target/nextflow/workflows/run_benchmark/main.nf \ --workspace 53907369739130 \ - --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --compute-env 6UWsS5iw7TI37saKo2wcMi \ --params-file /tmp/params.yaml \ --entry-name auto \ --config common/nextflow_helpers/labels_tw.config \ diff --git a/scripts/run_benchmark/run_test_seqeracloud.sh b/scripts/run_benchmark/run_test_seqeracloud.sh index 64056313..3645ad0f 100755 --- a/scripts/run_benchmark/run_test_seqeracloud.sh +++ b/scripts/run_benchmark/run_test_seqeracloud.sh @@ -21,7 +21,7 @@ tw launch https://github.com/openproblems-bio/task_batch_integration.git \ --pull-latest \ --main-script target/nextflow/workflows/run_benchmark/main.nf \ --workspace 53907369739130 \ - --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --compute-env 6UWsS5iw7TI37saKo2wcMi \ --params-file /tmp/params.yaml \ --entry-name auto \ --config common/nextflow_helpers/labels_tw.config \ From ce57335b75899a40e4fbc433330f671c32f280df Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Mon, 21 Oct 2024 16:38:36 +0200 Subject: [PATCH 13/23] Update model file path --- src/workflows/run_benchmark/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index e6743d18..2eff6d8d 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -27,7 +27,7 @@ methods = [ scanorama, scanvi, scimilarity.run( - args: [model: file("https://zenodo.org/records/10685499/files/model_v1.1.tar.gz")] + args: [model: file("s3://openproblems-work/cache/scimilarity-model_v1.1.tar.gz")] ), scvi ] From c60a7daafa8422042c0328eaee027d223dd5f091 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 22 Oct 2024 08:42:14 +0200 Subject: [PATCH 14/23] Create geneformer files --- src/methods/geneformer/config.vsh.yaml | 44 ++++++++++++++++++++++++++ src/methods/geneformer/script.py | 31 
++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 src/methods/geneformer/config.vsh.yaml create mode 100644 src/methods/geneformer/script.py diff --git a/src/methods/geneformer/config.vsh.yaml b/src/methods/geneformer/config.vsh.yaml new file mode 100644 index 00000000..f38592ac --- /dev/null +++ b/src/methods/geneformer/config.vsh.yaml @@ -0,0 +1,44 @@ +__merge__: ../../api/comp_method.yaml + +name: geneformer +label: Geneformer +summary: Geneformer is a foundation transformer model pretrained on a large-scale corpus of single cell transcriptomes +description: | + Geneformer is a foundation transformer model pretrained on a large-scale + corpus of single cell transcriptomes to enable context-aware predictions in + network biology. For this task, Geneformer is used to create a batch-corrected + cell embedding. +references: + doi: + - 10.1038/s41586-023-06139-9 + - 10.1101/2024.08.16.608180 +links: + documentation: https://geneformer.readthedocs.io/en/latest/index.html + repository: https://huggingface.co/ctheodoris/Geneformer + +info: + preferred_normalization: counts + preferred_types: [embedding] + +arguments: + - name: "--model" + type: "file" + description: Path to a Geneformer model file + +resources: + - type: python_script + path: script.py + +engines: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pip: + - git+https://huggingface.co/ctheodoris/Geneformer.git + +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/methods/geneformer/script.py b/src/methods/geneformer/script.py new file mode 100644 index 00000000..168865cd --- /dev/null +++ b/src/methods/geneformer/script.py @@ -0,0 +1,31 @@ +import anndata as ad + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. +par = { + "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", + 'output': 'output.h5ad' +} +meta = { + 'name': 'my_python_method' +} +## VIASH END + +print('Reading input files', flush=True) +input = ad.read_h5ad(par['input']) + +print('Preprocess data', flush=True) +# ... preprocessing ... + +print('Train model', flush=True) +# ... train model ... + +print('Generate predictions', flush=True) +# ... generate predictions ... 
+ +print("Write output AnnData to file", flush=True) +output = ad.AnnData( + +) +output.write_h5ad(par['output'], compression='gzip') From 99a7078961df5180a872661b33e08e3c9c46bc45 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 22 Oct 2024 13:56:14 +0200 Subject: [PATCH 15/23] Set SCimilarity name in Python script --- src/methods/scimilarity/script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/methods/scimilarity/script.py b/src/methods/scimilarity/script.py index 2da1790e..761d59a5 100644 --- a/src/methods/scimilarity/script.py +++ b/src/methods/scimilarity/script.py @@ -14,7 +14,7 @@ "model": "model_v1.1", } meta = { - "name": "scvi", + "name": "scimilarity", } ## VIASH END From f4b98e137149a2426563620cb07d3f57e5a5aad1 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 22 Oct 2024 13:57:00 +0200 Subject: [PATCH 16/23] Adjust container settings Depend on base method config because of input model file --- src/methods/geneformer/config.vsh.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/methods/geneformer/config.vsh.yaml b/src/methods/geneformer/config.vsh.yaml index f38592ac..66287f01 100644 --- a/src/methods/geneformer/config.vsh.yaml +++ b/src/methods/geneformer/config.vsh.yaml @@ -1,4 +1,4 @@ -__merge__: ../../api/comp_method.yaml +__merge__: /src/api/base_method.yaml name: geneformer label: Geneformer @@ -31,14 +31,15 @@ resources: engines: - type: docker - image: openproblems/base_python:1.0.0 + image: openproblems/base_pytorch_nvidia:1.0.0 setup: - type: python pip: + - pyarrow<15.0.0a0,>=14.0.1 - git+https://huggingface.co/ctheodoris/Geneformer.git runners: - type: executable - type: nextflow directives: - label: [midtime, midmem, midcpu] + label: [midtime, midmem, midcpu, gpu] From fbaf8b17e36feea5fc68fbebe871ed702585839b Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 22 Oct 2024 14:46:07 +0200 Subject: [PATCH 17/23] Download dictionary files in script --- src/methods/geneformer/script.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/src/methods/geneformer/script.py b/src/methods/geneformer/script.py index 168865cd..8b20c6cb 100644 --- a/src/methods/geneformer/script.py +++ b/src/methods/geneformer/script.py @@ -1,20 +1,47 @@ import anndata as ad +from geneformer import TranscriptomeTokenizer +from tempfile import TemporaryDirectory +import os +import requests ## VIASH START # Note: this section is auto-generated by viash at runtime. To edit it, make changes # in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. 
par = { - "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", + 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad' } meta = { - 'name': 'my_python_method' + 'name': 'geneformer' } ## VIASH END print('Reading input files', flush=True) input = ad.read_h5ad(par['input']) +n_processors = os.cpu_count() + +# Mapping files for the 30M model +base_dictionary_url = "https://huggingface.co/ctheodoris/Geneformer/blob/main/geneformer/gene_dictionaries_30m" +dictionary_files = { + "ensembl_mapping" : "ensembl_mapping_dict_gc30M.pkl", + "gene_median" : "gene_median_dictionary_gc30M.pkl", + "gene_name_id" : "gene_name_id_dict_gc30M.pkl", + "token" : "token_dictionary_gc30M.pkl" +} +dictionary_dir = TemporaryDirectory() +for file in dictionary_files.values(): + url = os.path.join(base_dictionary_url, file) + response = requests.get(url) + with open(os.path.join(dictionary_dir.name, file), 'wb') as f: + f.write(response.content) + +# Set parameters for the 30M model +model_input_size = 2048 +special_token = False +tokenizer = TranscriptomeTokenizer(nproc = n_processors, model_input_size = model_input_size, special_token = special_token) +# tokenizer.tokenize_data(data_directory, output_directory, output_prefix, file_format = "anndata") + print('Preprocess data', flush=True) # ... preprocessing ... From b1d152014fa68ba985150f009c12ca16850c7de7 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 23 Oct 2024 13:54:05 +0200 Subject: [PATCH 18/23] Prepare and tokenize data, attempt to embed --- src/methods/geneformer/config.vsh.yaml | 2 + src/methods/geneformer/script.py | 133 ++++++++++++++++++------- 2 files changed, 101 insertions(+), 34 deletions(-) diff --git a/src/methods/geneformer/config.vsh.yaml b/src/methods/geneformer/config.vsh.yaml index 66287f01..07c89788 100644 --- a/src/methods/geneformer/config.vsh.yaml +++ b/src/methods/geneformer/config.vsh.yaml @@ -28,6 +28,7 @@ arguments: resources: - type: python_script path: script.py + - path: /src/utils/read_anndata_partial.py engines: - type: docker @@ -36,6 +37,7 @@ engines: - type: python pip: - pyarrow<15.0.0a0,>=14.0.1 + - huggingface_hub - git+https://huggingface.co/ctheodoris/Geneformer.git runners: diff --git a/src/methods/geneformer/script.py b/src/methods/geneformer/script.py index 8b20c6cb..7c9e78bb 100644 --- a/src/methods/geneformer/script.py +++ b/src/methods/geneformer/script.py @@ -1,58 +1,123 @@ import anndata as ad -from geneformer import TranscriptomeTokenizer -from tempfile import TemporaryDirectory +from geneformer import TranscriptomeTokenizer, EmbExtractor import os -import requests +import sys +from tempfile import TemporaryDirectory +from huggingface_hub import hf_hub_download +import numpy as np ## VIASH START # Note: this section is auto-generated by viash at runtime. To edit it, make changes # in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. 
par = { - 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', - 'output': 'output.h5ad' -} -meta = { - 'name': 'geneformer' + "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", + "output": "output.h5ad", } +meta = {"name": "geneformer"} ## VIASH END -print('Reading input files', flush=True) -input = ad.read_h5ad(par['input']) - n_processors = os.cpu_count() +print("Reading input", flush=True) +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + +adata = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns") + +if adata.uns["dataset_organism"] != "homo_sapiens": + raise ValueError( + f"Geneformer can only be used with human data " + f"(dataset_organism == \"{adata.uns['dataset_organism']}\")" + ) + +is_ensembl = all(var_name.startswith("ENSG") for var_name in adata.var_names) +if not is_ensembl: + raise ValueError(f"Geneformer requires adata.var_names to contain ENSEMBL gene ids") + +print("Creating working directory", flush=True) +work_dir = TemporaryDirectory() +input_dir = os.path.join(work_dir.name, "input") +os.makedirs(input_dir) +tokenized_dir = os.path.join(work_dir.name, "tokenized") +os.makedirs(tokenized_dir) +embedding_dir = os.path.join(work_dir.name, "embedding") +os.makedirs(embedding_dir) +print(f"Working directory: {work_dir.name}", flush=True) + +print("Preparing data", flush=True) +adata.var["ensembl_id"] = adata.var_names +adata.obs["n_counts"] = np.ravel(adata.X.sum(axis=1)) +adata.write_h5ad(os.path.join(input_dir, "input.h5ad")) +print(adata) + +print("Getting dictionary files", flush=True) # Mapping files for the 30M model -base_dictionary_url = "https://huggingface.co/ctheodoris/Geneformer/blob/main/geneformer/gene_dictionaries_30m" dictionary_files = { - "ensembl_mapping" : "ensembl_mapping_dict_gc30M.pkl", - "gene_median" : "gene_median_dictionary_gc30M.pkl", - "gene_name_id" : "gene_name_id_dict_gc30M.pkl", - "token" : "token_dictionary_gc30M.pkl" + "ensembl_mapping": hf_hub_download( + repo_id="ctheodoris/Geneformer", + subfolder="geneformer/gene_dictionaries_30m", + filename="ensembl_mapping_dict_gc30M.pkl", + ), + "gene_median": hf_hub_download( + repo_id="ctheodoris/Geneformer", + subfolder="geneformer/gene_dictionaries_30m", + filename="gene_median_dictionary_gc30M.pkl", + ), + "gene_name_id": hf_hub_download( + repo_id="ctheodoris/Geneformer", + subfolder="geneformer/gene_dictionaries_30m", + filename="gene_name_id_dict_gc30M.pkl", + ), + "token": hf_hub_download( + repo_id="ctheodoris/Geneformer", + subfolder="geneformer/gene_dictionaries_30m", + filename="token_dictionary_gc30M.pkl", + ), } -dictionary_dir = TemporaryDirectory() -for file in dictionary_files.values(): - url = os.path.join(base_dictionary_url, file) - response = requests.get(url) - with open(os.path.join(dictionary_dir.name, file), 'wb') as f: - f.write(response.content) +print("Tokenizing data", flush=True) # Set parameters for the 30M model model_input_size = 2048 special_token = False -tokenizer = TranscriptomeTokenizer(nproc = n_processors, model_input_size = model_input_size, special_token = special_token) -# tokenizer.tokenize_data(data_directory, output_directory, output_prefix, file_format = "anndata") +tokenizer = TranscriptomeTokenizer( + nproc=n_processors, + model_input_size=model_input_size, + special_token=special_token, + gene_median_file=dictionary_files["gene_median"], + token_dictionary_file=dictionary_files["token"], + 
gene_mapping_file=dictionary_files["ensembl_mapping"], +) -print('Preprocess data', flush=True) -# ... preprocessing ... +tokenizer.tokenize_data(input_dir, tokenized_dir, "tokenized", file_format="h5ad") + +print("Getting model files", flush=True) +model_files = { + "model": hf_hub_download( + repo_id="ctheodoris/Geneformer", + subfolder="gf-6L-30M-i2048", + filename="model.safetensors", + ), + "config": hf_hub_download( + repo_id="ctheodoris/Geneformer", + subfolder="gf-6L-30M-i2048", + filename="config.json", + ), +} +model_dir = os.path.dirname(model_files["model"]) -print('Train model', flush=True) -# ... train model ... +print("Extracting embeddings", flush=True) +embedder = EmbExtractor( + emb_mode="cell", max_ncells=None, token_dictionary_file=dictionary_files["token"] +) +embedder.extract_embs( + model_dir, + os.path.join(tokenized_dir, "tokenized.dataset"), + embedding_dir, + "embedding", +) -print('Generate predictions', flush=True) -# ... generate predictions ... +# TODO: Get embedding from output directory, store and save output print("Write output AnnData to file", flush=True) -output = ad.AnnData( - -) -output.write_h5ad(par['output'], compression='gzip') +output = ad.AnnData() +output.write_h5ad(par["output"], compression="gzip") From e3abd305fcff89c1a4e3cdbc12b3406fca490df5 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Mon, 28 Oct 2024 15:40:08 +0000 Subject: [PATCH 19/23] Store and output embedding --- src/methods/geneformer/script.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/methods/geneformer/script.py b/src/methods/geneformer/script.py index 7c9e78bb..70491de3 100644 --- a/src/methods/geneformer/script.py +++ b/src/methods/geneformer/script.py @@ -5,6 +5,7 @@ from tempfile import TemporaryDirectory from huggingface_hub import hf_hub_download import numpy as np +import pandas as pd ## VIASH START # Note: this section is auto-generated by viash at runtime. 
To edit it, make changes @@ -115,9 +116,21 @@ embedding_dir, "embedding", ) +embedding = pd.read_csv(os.path.join(embedding_dir, "embedding.csv")).to_numpy() -# TODO: Get embedding from output directory, store and save output +print("Store outputs", flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + obsm={ + "X_emb": embedding, + }, + uns={ + "dataset_id": adata.uns["dataset_id"], + "normalization_id": adata.uns["normalization_id"], + "method_id": meta["name"], + }, +) print("Write output AnnData to file", flush=True) -output = ad.AnnData() output.write_h5ad(par["output"], compression="gzip") From 1e00a52f853a11456a8a8ecec986a667d732edb7 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 29 Oct 2024 08:34:26 +0000 Subject: [PATCH 20/23] Add Geneformer to benchmark workflow --- src/methods/geneformer/config.vsh.yaml | 2 +- src/workflows/run_benchmark/config.vsh.yaml | 1 + src/workflows/run_benchmark/main.nf | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/methods/geneformer/config.vsh.yaml b/src/methods/geneformer/config.vsh.yaml index 07c89788..dc1b04eb 100644 --- a/src/methods/geneformer/config.vsh.yaml +++ b/src/methods/geneformer/config.vsh.yaml @@ -18,7 +18,7 @@ links: info: preferred_normalization: counts - preferred_types: [embedding] + method_types: [embedding] arguments: - name: "--model" diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index 51e482ab..d3cc2b55 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -85,6 +85,7 @@ dependencies: - name: methods/batchelor_mnn_correct - name: methods/bbknn - name: methods/combat + - name: methods/geneformer - name: methods/harmony - name: methods/harmonypy - name: methods/liger diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index 69322a1a..89564bd5 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -20,6 +20,7 @@ methods = [ batchelor_mnn_correct, bbknn, combat, + geneformer, harmony, harmonypy, liger, From 113a892ea5339a33f10a68b1d6427f9f2ff0f815 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 29 Oct 2024 09:36:11 +0000 Subject: [PATCH 21/23] Add argument to select model version to use --- src/methods/geneformer/config.vsh.yaml | 17 ++++- src/methods/geneformer/script.py | 93 +++++++++++++++----------- 2 files changed, 69 insertions(+), 41 deletions(-) diff --git a/src/methods/geneformer/config.vsh.yaml b/src/methods/geneformer/config.vsh.yaml index dc1b04eb..832614a4 100644 --- a/src/methods/geneformer/config.vsh.yaml +++ b/src/methods/geneformer/config.vsh.yaml @@ -1,4 +1,4 @@ -__merge__: /src/api/base_method.yaml +__merge__: /src/api/comp_method.yaml name: geneformer label: Geneformer @@ -19,11 +19,22 @@ links: info: preferred_normalization: counts method_types: [embedding] + variants: + geneformer_12L_95M_i4096: + model: "gf-12L-95M-i4096" + geneformer_6L_30M_i2048: + model: "gf-6L-30M-i2048" + geneformer_12L_30M_i2048: + model: "gf-12L-30M-i2048" + geneformer_20L_95M_i4096: + model: "gf-20L-95M-i4096" arguments: - name: "--model" - type: "file" - description: Path to a Geneformer model file + type: "string" + description: String representing the Geneformer model to use + choices: ["gf-6L-30M-i2048", "gf-12L-30M-i2048", "gf-12L-95M-i4096", "gf-20L-95M-i4096"] + default: "gf-12L-95M-i4096" resources: - type: python_script diff --git a/src/methods/geneformer/script.py 
b/src/methods/geneformer/script.py index 70491de3..6c9a373e 100644 --- a/src/methods/geneformer/script.py +++ b/src/methods/geneformer/script.py @@ -13,13 +13,14 @@ par = { "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", "output": "output.h5ad", + "model": "gf-12L-95M-i4096" } meta = {"name": "geneformer"} ## VIASH END n_processors = os.cpu_count() -print("Reading input", flush=True) +print(">>> Reading input...", flush=True) sys.path.append(meta["resources_dir"]) from read_anndata_partial import read_anndata @@ -28,85 +29,99 @@ if adata.uns["dataset_organism"] != "homo_sapiens": raise ValueError( f"Geneformer can only be used with human data " - f"(dataset_organism == \"{adata.uns['dataset_organism']}\")" + f"(dataset_organism == '{adata.uns['dataset_organism']}')" ) is_ensembl = all(var_name.startswith("ENSG") for var_name in adata.var_names) if not is_ensembl: raise ValueError(f"Geneformer requires adata.var_names to contain ENSEMBL gene ids") -print("Creating working directory", flush=True) -work_dir = TemporaryDirectory() -input_dir = os.path.join(work_dir.name, "input") -os.makedirs(input_dir) -tokenized_dir = os.path.join(work_dir.name, "tokenized") -os.makedirs(tokenized_dir) -embedding_dir = os.path.join(work_dir.name, "embedding") -os.makedirs(embedding_dir) -print(f"Working directory: {work_dir.name}", flush=True) +print(f">>> Getting settings for model '{par['model']}'...", flush=True) +model_split = par["model"].split("-") +model_details = { + "layers": model_split[1], + "dataset": model_split[2], + "input_size": int(model_split[3][1:]) +} +print(model_details, flush = True) -print("Preparing data", flush=True) -adata.var["ensembl_id"] = adata.var_names -adata.obs["n_counts"] = np.ravel(adata.X.sum(axis=1)) -adata.write_h5ad(os.path.join(input_dir, "input.h5ad")) -print(adata) +print(">>> Getting model dictionary files...", flush=True) +if model_details["dataset"] == "95M": + dictionaries_subfolder = "geneformer" +elif model_details["dataset"] == "30M": + dictionaries_subfolder = "geneformer/gene_dictionaries_30m" +else: + raise ValueError(f"Invalid model dataset: {model_details['dataset']}") +print(f"Dictionaries subfolder: '{dictionaries_subfolder}'") -print("Getting dictionary files", flush=True) -# Mapping files for the 30M model dictionary_files = { "ensembl_mapping": hf_hub_download( repo_id="ctheodoris/Geneformer", - subfolder="geneformer/gene_dictionaries_30m", - filename="ensembl_mapping_dict_gc30M.pkl", + subfolder=dictionaries_subfolder, + filename=f"ensembl_mapping_dict_gc{model_details['dataset']}.pkl", ), "gene_median": hf_hub_download( repo_id="ctheodoris/Geneformer", - subfolder="geneformer/gene_dictionaries_30m", - filename="gene_median_dictionary_gc30M.pkl", + subfolder=dictionaries_subfolder, + filename=f"gene_median_dictionary_gc{model_details['dataset']}.pkl", ), "gene_name_id": hf_hub_download( repo_id="ctheodoris/Geneformer", - subfolder="geneformer/gene_dictionaries_30m", - filename="gene_name_id_dict_gc30M.pkl", + subfolder=dictionaries_subfolder, + filename=f"gene_name_id_dict_gc{model_details['dataset']}.pkl", ), "token": hf_hub_download( repo_id="ctheodoris/Geneformer", - subfolder="geneformer/gene_dictionaries_30m", - filename="token_dictionary_gc30M.pkl", + subfolder=dictionaries_subfolder, + filename=f"token_dictionary_gc{model_details['dataset']}.pkl", ), } -print("Tokenizing data", flush=True) -# Set parameters for the 30M model -model_input_size = 2048 -special_token = False +print(">>> Creating working 
directory...", flush=True) +work_dir = TemporaryDirectory() +input_dir = os.path.join(work_dir.name, "input") +os.makedirs(input_dir) +tokenized_dir = os.path.join(work_dir.name, "tokenized") +os.makedirs(tokenized_dir) +embedding_dir = os.path.join(work_dir.name, "embedding") +os.makedirs(embedding_dir) +print(f"Working directory: '{work_dir.name}'", flush=True) + +print(">>> Preparing data...", flush=True) +adata.var["ensembl_id"] = adata.var_names +adata.obs["n_counts"] = np.ravel(adata.X.sum(axis=1)) +adata.write_h5ad(os.path.join(input_dir, "input.h5ad")) +print(adata) + +print(">>> Tokenizing data...", flush=True) +special_token = model_details['dataset'] == "95M" +print(f"Input size: {model_details['input_size']}, Special token: {special_token}") tokenizer = TranscriptomeTokenizer( nproc=n_processors, - model_input_size=model_input_size, + model_input_size=model_details["input_size"], special_token=special_token, gene_median_file=dictionary_files["gene_median"], token_dictionary_file=dictionary_files["token"], gene_mapping_file=dictionary_files["ensembl_mapping"], ) - tokenizer.tokenize_data(input_dir, tokenized_dir, "tokenized", file_format="h5ad") -print("Getting model files", flush=True) +print(f">>> Getting model files for model '{par['model']}'...", flush=True) model_files = { "model": hf_hub_download( repo_id="ctheodoris/Geneformer", - subfolder="gf-6L-30M-i2048", + subfolder=par["model"], filename="model.safetensors", ), "config": hf_hub_download( repo_id="ctheodoris/Geneformer", - subfolder="gf-6L-30M-i2048", + subfolder=par["model"], filename="config.json", ), } model_dir = os.path.dirname(model_files["model"]) -print("Extracting embeddings", flush=True) +print(">>> Extracting embeddings...", flush=True) embedder = EmbExtractor( emb_mode="cell", max_ncells=None, token_dictionary_file=dictionary_files["token"] ) @@ -118,7 +133,7 @@ ) embedding = pd.read_csv(os.path.join(embedding_dir, "embedding.csv")).to_numpy() -print("Store outputs", flush=True) +print(">>> Storing outputs...", flush=True) output = ad.AnnData( obs=adata.obs[[]], var=adata.var[[]], @@ -131,6 +146,8 @@ "method_id": meta["name"], }, ) +print(output) -print("Write output AnnData to file", flush=True) +print(">>> Writing output AnnData to file...", flush=True) output.write_h5ad(par["output"], compression="gzip") +print(">>> Done!") From f26eb248a81bbedefdf39e8e198dc4d88c320b8c Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Tue, 29 Oct 2024 11:00:37 +0100 Subject: [PATCH 22/23] Style Geneformer script --- src/methods/geneformer/script.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/methods/geneformer/script.py b/src/methods/geneformer/script.py index 6c9a373e..eeab4332 100644 --- a/src/methods/geneformer/script.py +++ b/src/methods/geneformer/script.py @@ -1,11 +1,12 @@ -import anndata as ad -from geneformer import TranscriptomeTokenizer, EmbExtractor import os import sys from tempfile import TemporaryDirectory -from huggingface_hub import hf_hub_download + +import anndata as ad import numpy as np import pandas as pd +from geneformer import EmbExtractor, TranscriptomeTokenizer +from huggingface_hub import hf_hub_download ## VIASH START # Note: this section is auto-generated by viash at runtime. 
To edit it, make changes @@ -13,7 +14,7 @@ par = { "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", "output": "output.h5ad", - "model": "gf-12L-95M-i4096" + "model": "gf-12L-95M-i4096", } meta = {"name": "geneformer"} ## VIASH END @@ -41,9 +42,9 @@ model_details = { "layers": model_split[1], "dataset": model_split[2], - "input_size": int(model_split[3][1:]) + "input_size": int(model_split[3][1:]), } -print(model_details, flush = True) +print(model_details, flush=True) print(">>> Getting model dictionary files...", flush=True) if model_details["dataset"] == "95M": @@ -94,7 +95,7 @@ print(adata) print(">>> Tokenizing data...", flush=True) -special_token = model_details['dataset'] == "95M" +special_token = model_details["dataset"] == "95M" print(f"Input size: {model_details['input_size']}, Special token: {special_token}") tokenizer = TranscriptomeTokenizer( nproc=n_processors, From 98789b13ed933e28a57557b5a1ed7081390a5579 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 30 Oct 2024 09:40:33 +0100 Subject: [PATCH 23/23] Make Geneformer inherit from base_method for tests --- src/methods/geneformer/config.vsh.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/methods/geneformer/config.vsh.yaml b/src/methods/geneformer/config.vsh.yaml index 832614a4..d571a4ad 100644 --- a/src/methods/geneformer/config.vsh.yaml +++ b/src/methods/geneformer/config.vsh.yaml @@ -1,4 +1,4 @@ -__merge__: /src/api/comp_method.yaml +__merge__: /src/api/base_method.yaml name: geneformer label: Geneformer
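
For reference, the output written in [PATCH 19/23] follows the task's embedding convention: an AnnData carrying empty obs/var frames aligned to the input, the Geneformer cell embedding in obsm["X_emb"], and dataset_id, normalization_id and method_id recorded in uns. A minimal sanity check of that structure, assuming only anndata and a locally produced output.h5ad from a test run, could look like the sketch below; it is illustrative and not part of the component.

    import anndata as ad

    # Read the component output and confirm it has the fields the downstream
    # transform/metric steps expect (sketch only; assumes ./output.h5ad exists).
    adata_out = ad.read_h5ad("output.h5ad")

    assert "X_emb" in adata_out.obsm, "cell embedding missing from obsm"
    assert adata_out.obsm["X_emb"].shape[0] == adata_out.n_obs, "expected one embedding row per cell"
    for key in ("dataset_id", "normalization_id", "method_id"):
        assert key in adata_out.uns, f"missing uns key: {key}"
    print("Embedding shape:", adata_out.obsm["X_emb"].shape)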
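
The model-variant handling added in [PATCH 21/23] relies on the model name alone: a string such as "gf-12L-95M-i4096" encodes the layer count, the pretraining corpus (30M or 95M cells) and the context length, and the corpus decides whether the special CLS/EOS token and the geneformer/gene_dictionaries_30m files are used. The standalone sketch below mirrors that parsing logic; the helper name parse_geneformer_model is hypothetical and not part of the component.

    # Dependency-free sketch of the model-string parsing used in
    # src/methods/geneformer/script.py (hypothetical helper, for illustration only).
    def parse_geneformer_model(model: str) -> dict:
        """Split e.g. 'gf-12L-95M-i4096' into layer count, pretraining corpus and input size."""
        _, layers, dataset, input_size = model.split("-")
        return {
            "layers": layers,                   # e.g. '12L'
            "dataset": dataset,                 # '30M' or '95M' pretraining corpus
            "input_size": int(input_size[1:]),  # strip the leading 'i' -> 2048 or 4096
            # 95M models use the special CLS/EOS token and the default dictionary files;
            # 30M models read theirs from the gene_dictionaries_30m subfolder instead.
            "special_token": dataset == "95M",
        }

    for model in ("gf-6L-30M-i2048", "gf-12L-30M-i2048", "gf-12L-95M-i4096", "gf-20L-95M-i4096"):
        print(model, parse_geneformer_model(model))

Switching the --model argument from a file ([PATCH 14/23]) to a string with fixed choices keeps the component self-contained: the matching dictionaries and weights are fetched at runtime via hf_hub_download rather than being supplied as an input file.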