From 4f5ba9506db3e8f50ee5d1bcb65e09a374c1354b Mon Sep 17 00:00:00 2001
From: Luke Zappia <luke@data-intuitive.com>
Date: Wed, 30 Oct 2024 12:08:49 +0100
Subject: [PATCH 1/9] Create UCE component files

---
 src/methods/uce/config.vsh.yaml | 42 +++++++++++++++++++++++++++++++++
 src/methods/uce/script.py       | 31 ++++++++++++++++++++++++
 2 files changed, 73 insertions(+)
 create mode 100644 src/methods/uce/config.vsh.yaml
 create mode 100644 src/methods/uce/script.py

diff --git a/src/methods/uce/config.vsh.yaml b/src/methods/uce/config.vsh.yaml
new file mode 100644
index 00000000..fa159578
--- /dev/null
+++ b/src/methods/uce/config.vsh.yaml
@@ -0,0 +1,42 @@
+__merge__: ../../api/comp_method.yaml
+
+name: uce
+label: UCE
+summary: UCE offers a unified biological latent space that can represent any cell
+description: |
+  Universal Cell Embedding (UCE) is a single-cell foundation model that offers a
+  unified biological latent space that can represent any cell, regardless of
+  tissue or species.
+references:
+  doi:
+    - 10.1101/2023.11.28.568918
+links:
+  documentation: https://github.com/snap-stanford/UCE/blob/main/README.md
+  repository: https://github.com/snap-stanford/UCE
+
+info:
+  method_types: [embedding]
+  preferred_normalization: counts
+
+# Component-specific parameters (optional)
+# arguments:
+#   - name: "--n_neighbors"
+#     type: "integer"
+#     default: 5
+#     description: Number of neighbors to use.
+
+resources:
+  - type: python_script
+    path: script.py
+
+engines:
+  - type: docker
+    image: openproblems/base_pytorch_nvidia:1.0.0
+    setup:
+      - type: docker
+        run: "git clone https://github.com/snap-stanford/UCE.git"
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midtime,midmem,midcpu,gpu]
diff --git a/src/methods/uce/script.py b/src/methods/uce/script.py
new file mode 100644
index 00000000..92780f93
--- /dev/null
+++ b/src/methods/uce/script.py
@@ -0,0 +1,31 @@
+import anndata as ad
+
+## VIASH START
+# Note: this section is auto-generated by viash at runtime. To edit it, make changes
+# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
+par = {
+  'input': 'resources_test/.../input.h5ad',
+  'output': 'output.h5ad'
+}
+meta = {
+  'name': 'my_python_method'
+}
+## VIASH END
+
+print('Reading input files', flush=True)
+input = ad.read_h5ad(par['input'])
+
+print('Preprocess data', flush=True)
+# ... preprocessing ...
+
+print('Train model', flush=True)
+# ... train model ...
+
+print('Generate predictions', flush=True)
+# ... generate predictions ...
+
+print("Write output AnnData to file", flush=True)
+output = ad.AnnData(
+
+)
+output.write_h5ad(par['output'], compression='gzip')

From 68fa6192e33c22f9586c43a60bc3a4b60123fd5a Mon Sep 17 00:00:00 2001
From: Luke Zappia <luke@data-intuitive.com>
Date: Wed, 30 Oct 2024 15:44:53 +0100
Subject: [PATCH 2/9] Add UCE dataset preprocessing

---
 src/methods/uce/config.vsh.yaml |  19 +++---
 src/methods/uce/script.py       | 104 +++++++++++++++++++++++++++++---
 2 files changed, 106 insertions(+), 17 deletions(-)

diff --git a/src/methods/uce/config.vsh.yaml b/src/methods/uce/config.vsh.yaml
index fa159578..2301ef51 100644
--- a/src/methods/uce/config.vsh.yaml
+++ b/src/methods/uce/config.vsh.yaml
@@ -1,4 +1,4 @@
-__merge__: ../../api/comp_method.yaml
+__merge__: ../../api/base_method.yaml
 
 name: uce
 label: UCE
@@ -18,25 +18,28 @@ info:
   method_types: [embedding]
   preferred_normalization: counts
 
-# Component-specific parameters (optional)
-# arguments:
-#   - name: "--n_neighbors"
-#     type: "integer"
-#     default: 5
-#     description: Number of neighbors to use.
+arguments:
+  - name: --model
+    type: file
+    description: Path to the directory containing UCE model files or a .zip/.tar.gz archive
+    required: true
 
 resources:
   - type: python_script
     path: script.py
+  - path: /src/utils/read_anndata_partial.py
 
 engines:
   - type: docker
     image: openproblems/base_pytorch_nvidia:1.0.0
     setup:
+      - type: python
+        pypi:
+          - accelerate==0.24.0
       - type: docker
         run: "git clone https://github.com/snap-stanford/UCE.git"
 runners:
   - type: executable
   - type: nextflow
     directives:
-      label: [midtime,midmem,midcpu,gpu]
+      label: [midtime, midmem, midcpu, gpu]
diff --git a/src/methods/uce/script.py b/src/methods/uce/script.py
index 92780f93..9c7261b2 100644
--- a/src/methods/uce/script.py
+++ b/src/methods/uce/script.py
@@ -1,25 +1,104 @@
+import sys
+import tempfile
+import os
+import zipfile
+import tarfile
+import pandas as pd
+import numpy as np
+
+from accelerate import Accelerator
+
 import anndata as ad
 
+os.chdir("UCE")
+sys.path.append(".")
+from data_proc.data_utils import process_raw_anndata
+
 ## VIASH START
 # Note: this section is auto-generated by viash at runtime. To edit it, make changes
 # in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
 par = {
-  'input': 'resources_test/.../input.h5ad',
-  'output': 'output.h5ad'
+    "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad",
+    "output": "output.h5ad",
 }
 meta = {
-  'name': 'my_python_method'
+  'name': 'uce'
 }
 ## VIASH END
 
-print('Reading input files', flush=True)
-input = ad.read_h5ad(par['input'])
 
-print('Preprocess data', flush=True)
-# ... preprocessing ...
+print(">>> Reading input...", flush=True)
+sys.path.append(meta["resources_dir"])
+from read_anndata_partial import read_anndata
+
+adata = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns")
+
+print("\n>>> Creating working directory...", flush=True)
+work_dir = tempfile.TemporaryDirectory()
+print(f"Working directory: '{work_dir.name}'", flush=True)
 
-print('Train model', flush=True)
-# ... train model ...
+print("\n>>> Getting model files...", flush=True)
+if os.path.isdir(par["model"]):
+    model_temp = None
+    model_dir = par["model"]
+else:
+    model_temp = tempfile.TemporaryDirectory()
+    model_dir = model_temp.name
+
+    if zipfile.is_zipfile(par["model"]):
+        print("Extracting UCE model from .zip...", flush=True)
+        with zipfile.ZipFile(par["model"], "r") as zip_file:
+            zip_file.extractall(model_dir)
+    elif tarfile.is_tarfile(par["model"]) and par["model"].endswith(".tar.gz"):
+        print("Extracting model from .tar.gz...", flush=True)
+        with tarfile.open(par["model"], "r:gz") as tar_file:
+            tar_file.extractall(model_dir)
+            model_dir = os.path.join(model_dir, os.listdir(model_dir)[0])
+    else:
+        raise ValueError(
+            f"The 'model' argument should be a directory a .zip file or a .tar.gz file"
+        )
+
+print("Extracting protein embeddings...", flush=True)
+with tarfile.open(os.path.join(model_dir, "protein_embeddings.tar.gz"), "r:gz") as tar_file:
+    tar_file.extractall("./model_files")
+print(f"Model directory: '{model_dir}'", flush=True)
+
+model_args = {
+    "dir" : work_dir.name,
+    "skip" : True,
+    "filter" : False # Turn this off to get embedding for all cells
+}
+
+accelerator = Accelerator(model_args["dir"])
+
+print("\n>>> Preprocessing data...", flush=True)
+# Set var names to gene symbols
+adata.var_names = adata.var["feature_name"]
+adata.write_h5ad(os.path.join(model_args["dir"], "input.h5ad"))
+
+row = pd.Series()
+row.path = "input.h5ad"
+row.covar_col = np.nan
+if adata.uns["dataset_organism"] == "homo_sapiens":
+    row.species = "human"
+elif adata.uns["dataset_organism"] == "mus_musculus":
+    row.species = "mouse"
+else:
+    raise ValueError(f"Species '{adata.uns['dataset_organism']} not yet implemented")
+
+processed_adata, num_cells, num_genes = process_raw_anndata(
+    row = row,
+    h5_folder_path = model_args["dir"],
+    npz_folder_path = model_args["dir"],
+    scp = "",
+    skip = model_args["skip"],
+    additional_filter = model_args["filter"],
+    root = model_args["dir"]
+)
+
+# processor.generate_idxs()
+# processor.run_evaluation()
 
 print('Generate predictions', flush=True)
 # ... generate predictions ...
@@ -29,3 +108,10 @@
 
 )
 output.write_h5ad(par['output'], compression='gzip')
+
+print("\n>>> Cleaning up temporary directories...", flush=True)
+work_dir.cleanup()
+if model_temp is not None:
+    model_temp.cleanup()
+
+print("\n>>> Done!", flush=True)

From 02a395f3e10a1fad6eac58bf44f76f7a2bd72490 Mon Sep 17 00:00:00 2001
From: Luke Zappia <luke@data-intuitive.com>
Date: Wed, 30 Oct 2024 16:18:25 +0100
Subject: [PATCH 3/9] Generate UCE indexes

---
 src/methods/uce/script.py | 54 +++++++++++++++++++++++++++++++--------
 1 file changed, 43 insertions(+), 11 deletions(-)

diff --git a/src/methods/uce/script.py b/src/methods/uce/script.py
index 9c7261b2..628ac394 100644
--- a/src/methods/uce/script.py
+++ b/src/methods/uce/script.py
@@ -5,6 +5,8 @@
 import tarfile
 import pandas as pd
 import numpy as np
+import pickle
+import torch
 
 from accelerate import Accelerator
 
@@ -12,7 +14,7 @@
 
 os.chdir("UCE")
 sys.path.append(".")
-from data_proc.data_utils import process_raw_anndata
+from data_proc.data_utils import process_raw_anndata, get_species_to_pe, get_spec_chrom_csv, adata_path_to_prot_chrom_starts
 
 ## VIASH START
 # Note: this section is auto-generated by viash at runtime. To edit it, make changes
@@ -22,7 +24,7 @@
     "output": "output.h5ad",
 }
 meta = {
-  'name': 'uce'
+    'name': 'uce'
 }
 ## VIASH END
 
@@ -33,6 +35,13 @@
 
 adata = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns")
 
+if adata.uns["dataset_organism"] == "homo_sapiens":
+    species = "human"
+elif adata.uns["dataset_organism"] == "mus_musculus":
+    species = "mouse"
+else:
+    raise ValueError(f"Species '{adata.uns['dataset_organism']} not yet implemented")
+
 print("\n>>> Creating working directory...", flush=True)
 work_dir = tempfile.TemporaryDirectory()
 print(f"Working directory: '{work_dir.name}'", flush=True)
@@ -59,19 +68,31 @@
             f"The 'model' argument should be a directory a .zip file or a .tar.gz file"
         )
 
+print(f"Model directory: '{model_dir}'", flush=True)
+
 print("Extracting protein embeddings...", flush=True)
 with tarfile.open(os.path.join(model_dir, "protein_embeddings.tar.gz"), "r:gz") as tar_file:
     tar_file.extractall("./model_files")
-print(f"Model directory: '{model_dir}'", flush=True)
+protein_embeddings_dir = os.path.join("./model_files", "protein_embeddings")
+print(f"Protein embeddings directory: '{protein_embeddings_dir}'", flush=True)
 
+# The following sections implement methods in the UCE.evaluate.AnndataProcessor
+# class due to the object not being compatible with the Open Problems setup
 model_args = {
     "dir" : work_dir.name,
     "skip" : True,
-    "filter" : False # Turn this off to get embedding for all cells
+    "filter" : False, # Turn this off to get embedding for all cells
+    "name" : "input",
+    "offset_pkl_path" : os.path.join(model_dir, "species_offsets.pkl"),
+    "spec_chrom_csv_path" : os.path.join(model_dir, "species_chrom.csv"),
+    "pe_idx_path" : os.path.join(work_dir.name, "input_pe_row_idxs.pt"),
+    "chroms_path" : os.path.join(work_dir.name, "input_chroms.pkl"),
+    "starts_path" : os.path.join(work_dir.name, "input_starts.pkl"),
 }
 
 accelerator = Accelerator(model_args["dir"])
 
+# AnndataProcessor.preprocess_anndata()
 print("\n>>> Preprocessing data...", flush=True)
 # Set var names to gene symbols
 adata.var_names = adata.var["feature_name"]
@@ -80,12 +101,7 @@
 row = pd.Series()
 row.path = "input.h5ad"
 row.covar_col = np.nan
-if adata.uns["dataset_organism"] == "homo_sapiens":
-    row.species = "human"
-elif adata.uns["dataset_organism"] == "mus_musculus":
-    row.species = "mouse"
-else:
-    raise ValueError(f"Species '{adata.uns['dataset_organism']} not yet implemented")
+row.species = species
 
 processed_adata, num_cells, num_genes = process_raw_anndata(
     row = row,
@@ -97,7 +113,23 @@
     root = model_args["dir"]
 )
 
-# processor.generate_idxs()
+# AnndataProcessor.generate_idxs()
+print("\n>>> Generating indexes...", flush=True)
+species_to_pe = get_species_to_pe(protein_embeddings_dir)
+with open(model_args["offset_pkl_path"], "rb") as f:
+    species_to_offsets = pickle.load(f)
+gene_to_chrom_pos = get_spec_chrom_csv(model_args["spec_chrom_csv_path"])
+spec_pe_genes = list(species_to_pe[species].keys())
+offset = species_to_offsets[species]
+pe_row_idxs, dataset_chroms, dataset_pos = adata_path_to_prot_chrom_starts(
+    processed_adata, species, spec_pe_genes, gene_to_chrom_pos, offset
+)
+torch.save({model_args["name"]: pe_row_idxs}, model_args["pe_idx_path"])
+with open(model_args["chroms_path"], "wb+") as f:
+    pickle.dump({model_args["name"]: dataset_chroms}, f)
+with open(model_args["starts_path"], "wb+") as f:
+    pickle.dump({model_args["name"]: dataset_pos}, f)
+
 # processor.run_evaluation()
 
 print('Generate predictions', flush=True)

From 3332f0273a14060a1f6619bba8eac70d17b7133a Mon Sep 17 00:00:00 2001
From: Luke Zappia <luke@data-intuitive.com>
Date: Thu, 31 Oct 2024 08:37:32 +0100
Subject: [PATCH 4/9] Evaluate UCE model and output results

---
 src/methods/uce/script.py | 65 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 57 insertions(+), 8 deletions(-)

diff --git a/src/methods/uce/script.py b/src/methods/uce/script.py
index 628ac394..b70d521c 100644
--- a/src/methods/uce/script.py
+++ b/src/methods/uce/script.py
@@ -7,14 +7,17 @@
 import numpy as np
 import pickle
 import torch
+from argparse import Namespace
 
 from accelerate import Accelerator
 
 import anndata as ad
 
+# Code has hardcoded paths that only work correctly inside the UCE directory
 os.chdir("UCE")
 sys.path.append(".")
 from data_proc.data_utils import process_raw_anndata, get_species_to_pe, get_spec_chrom_csv, adata_path_to_prot_chrom_starts
+from evaluate import run_eval
 
 ## VIASH START
 # Note: this section is auto-generated by viash at runtime. To edit it, make changes
@@ -79,7 +82,7 @@
 # The following sections implement methods in the UCE.evaluate.AnndataProcessor
 # class due to the object not being compatible with the Open Problems setup
 model_args = {
-    "dir" : work_dir.name,
+    "dir" : work_dir.name + "/",
     "skip" : True,
     "filter" : False, # Turn this off to get embedding for all cells
     "name" : "input",
@@ -90,8 +93,6 @@
     "starts_path" : os.path.join(work_dir.name, "input_starts.pkl"),
 }
 
-accelerator = Accelerator(model_args["dir"])
-
 # AnndataProcessor.preprocess_anndata()
 print("\n>>> Preprocessing data...", flush=True)
 # Set var names to gene symbols
@@ -130,15 +131,63 @@
 with open(model_args["starts_path"], "wb+") as f:
     pickle.dump({model_args["name"]: dataset_pos}, f)
 
-# processor.run_evaluation()
+# AnndataProcessor.run_evaluation()
+print("\n>>> Evaluating model...", flush=True)
+model_parameters = Namespace(
+    token_dim = 5120,
+    d_hid = 5120,
+    nlayers = 4, # Small model = 4, full model = 33
+    output_dim = 1280,
+    multi_gpu= False,
+    token_file = os.path.join(model_dir, "all_tokens.torch"),
+    dir = model_args["dir"],
+    pad_length = 1536,
+    sample_size = 1024,
+    cls_token_idx = 3,
+    CHROM_TOKEN_OFFSET = 143574,
+    chrom_token_right_idx = 2,
+    chrom_token_left_idx = 1,
+    pad_token_idx = 0
+)
 
-print('Generate predictions', flush=True)
-# ... generate predictions ...
+if model_parameters.nlayers == 4:
+    model_parameters.model_loc = os.path.join(model_dir, "4layer_model.torch")
+    model_parameters.batch_size = 100
+else:
+    model_parameters.model_loc = os.path.join(model_dir, "33l_8ep_1024t_1280.torch")
+    model_parameters.batch_size = 25
+
+accelerator = Accelerator(project_dir=model_args["dir"])
+accelerator.wait_for_everyone()
+shapes_dict = {model_args["name"]: (num_cells, num_genes)}
+run_eval(
+    adata = processed_adata,
+    name = model_args["name"],
+    pe_idx_path = model_args["pe_idx_path"],
+    chroms_path = model_args["chroms_path"],
+    starts_path = model_args["starts_path"],
+    shapes_dict = shapes_dict,
+    accelerator = accelerator,
+    args = model_parameters
+)
 
-print("Write output AnnData to file", flush=True)
+print("\n>>> Storing output...", flush=True)
+embedded_adata = ad.read_h5ad(os.path.join(model_args["dir"], "input_uce_adata.h5ad"))
 output = ad.AnnData(
-
+    obs=adata.obs[[]],
+    var=adata.var[[]],
+    obsm={
+        "X_emb": embedded_adata.obsm["X_uce"],
+    },
+    uns={
+        "dataset_id": adata.uns["dataset_id"],
+        "normalization_id": adata.uns["normalization_id"],
+        "method_id": meta["name"],
+    },
 )
+print(output)
+
+print("\n>>> Writing output AnnData to file...", flush=True)
 output.write_h5ad(par['output'], compression='gzip')
 
 print("\n>>> Cleaning up temporary directories...", flush=True)

From 7716426d7dbe7cfe98c13634831441f26eec9089 Mon Sep 17 00:00:00 2001
From: Luke Zappia <luke@data-intuitive.com>
Date: Thu, 31 Oct 2024 09:57:21 +0100
Subject: [PATCH 5/9] Add UCE to benchmark workflow

---
 src/workflows/run_benchmark/config.vsh.yaml | 1 +
 src/workflows/run_benchmark/main.nf         | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml
index 51e482ab..222361d9 100644
--- a/src/workflows/run_benchmark/config.vsh.yaml
+++ b/src/workflows/run_benchmark/config.vsh.yaml
@@ -95,6 +95,7 @@ dependencies:
   - name: methods/scanvi
   - name: methods/scimilarity
   - name: methods/scvi
+  - name: methods/uce
   # metrics
   - name: metrics/asw_batch
   - name: metrics/asw_label
diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf
index 69322a1a..c7360cee 100644
--- a/src/workflows/run_benchmark/main.nf
+++ b/src/workflows/run_benchmark/main.nf
@@ -31,7 +31,10 @@ methods = [
   scimilarity.run(
     args: [model: file("s3://openproblems-work/cache/scimilarity-model_v1.1.tar.gz")]
   ),
-  scvi
+  scvi,
+  uce.run(
+    args: [model: file("s3://openproblems-work/cache/scimilarity-model_v1.1.tar.gz")]
+  ),
 ]
 
 // construct list of metrics

From acc283b39666f4c1449271866be504e365094b7f Mon Sep 17 00:00:00 2001
From: Luke Zappia <luke@data-intuitive.com>
Date: Thu, 31 Oct 2024 10:15:33 +0100
Subject: [PATCH 6/9] Fix UCE model path in benchmark workflow

---
 src/workflows/run_benchmark/main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf
index c7360cee..8ef69c6b 100644
--- a/src/workflows/run_benchmark/main.nf
+++ b/src/workflows/run_benchmark/main.nf
@@ -33,7 +33,7 @@ methods = [
   ),
   scvi,
   uce.run(
-    args: [model: file("s3://openproblems-work/cache/scimilarity-model_v1.1.tar.gz")]
+    args: [model: file("s3://openproblems-work/cache/uce-model-v5.zip")]
   ),
 ]
 

From cd14b3a4bfae5883a676d9966f4d313cf1177098 Mon Sep 17 00:00:00 2001
From: Luke Zappia <luke@data-intuitive.com>
Date: Thu, 31 Oct 2024 11:16:11 +0100
Subject: [PATCH 7/9] Copy UCE files to working directory for Nextflow

---
 src/methods/uce/script.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/methods/uce/script.py b/src/methods/uce/script.py
index b70d521c..658d54aa 100644
--- a/src/methods/uce/script.py
+++ b/src/methods/uce/script.py
@@ -14,7 +14,15 @@
 import anndata as ad
 
 # Code has hardcoded paths that only work correctly inside the UCE directory
-os.chdir("UCE")
+if os.path.isdir("UCE"):
+    # For executable we can work inside the UCE directory
+    os.chdir("UCE")
+else:
+    # For Nextflow we need to copy files to the Nextflow working directory
+    print(">>> Copying UCE files to local directory...", flush=True)
+    import shutil
+    shutil.copytree("/workspace/UCE", ".", dirs_exist_ok=True)
+
 sys.path.append(".")
 from data_proc.data_utils import process_raw_anndata, get_species_to_pe, get_spec_chrom_csv, adata_path_to_prot_chrom_starts
 from evaluate import run_eval

From ab5bb0751987a98f4c0432ebe6a008cea1dcc23c Mon Sep 17 00:00:00 2001
From: Luke Zappia <luke@data-intuitive.com>
Date: Thu, 31 Oct 2024 11:54:57 +0100
Subject: [PATCH 8/9] Exclude UCE in local benchmark scripts

Requires more memory than allowed by the local labels config.
Also switches the UCE script from the 4-layer model to the full 33-layer model.
---
 scripts/run_benchmark/run_full_local.sh | 1 +
 scripts/run_benchmark/run_test_local.sh | 1 +
 src/methods/uce/script.py               | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/run_benchmark/run_full_local.sh b/scripts/run_benchmark/run_full_local.sh
index 5c83ddb3..d823d79e 100755
--- a/scripts/run_benchmark/run_full_local.sh
+++ b/scripts/run_benchmark/run_full_local.sh
@@ -26,6 +26,7 @@ input_states: resources/datasets/**/state.yaml
 rename_keys: 'input_dataset:output_dataset;input_solution:output_solution'
 output_state: "state.yaml"
 publish_dir: "$publish_dir"
+settings: '{"methods_exclude": ["uce"]}'
 HERE
 
 # run the benchmark
diff --git a/scripts/run_benchmark/run_test_local.sh b/scripts/run_benchmark/run_test_local.sh
index d0fba746..2b72eeed 100755
--- a/scripts/run_benchmark/run_test_local.sh
+++ b/scripts/run_benchmark/run_test_local.sh
@@ -21,6 +21,7 @@ input_states: resources_test/task_batch_integration/**/state.yaml
 rename_keys: 'input_dataset:output_dataset;input_solution:output_solution'
 output_state: "state.yaml"
 publish_dir: "$publish_dir"
+settings: '{"methods_exclude": ["uce"]}'
 HERE
 
 nextflow run . \
diff --git a/src/methods/uce/script.py b/src/methods/uce/script.py
index 658d54aa..314504a9 100644
--- a/src/methods/uce/script.py
+++ b/src/methods/uce/script.py
@@ -144,7 +144,7 @@
 model_parameters = Namespace(
     token_dim = 5120,
     d_hid = 5120,
-    nlayers = 4, # Small model = 4, full model = 33
+    nlayers = 33, # Small model = 4, full model = 33
     output_dim = 1280,
     multi_gpu= False,
     token_file = os.path.join(model_dir, "all_tokens.torch"),

From af5c1a9c377232bfebf9c3d0d2ffdffda97066bf Mon Sep 17 00:00:00 2001
From: Luke Zappia <luke@data-intuitive.com>
Date: Thu, 31 Oct 2024 12:02:32 +0100
Subject: [PATCH 9/9] Style UCE script

---
 src/methods/uce/script.py | 113 ++++++++++++++++++++------------------
 1 file changed, 59 insertions(+), 54 deletions(-)

diff --git a/src/methods/uce/script.py b/src/methods/uce/script.py
index 314504a9..24108a94 100644
--- a/src/methods/uce/script.py
+++ b/src/methods/uce/script.py
@@ -1,17 +1,16 @@
+import os
+import pickle
 import sys
+import tarfile
 import tempfile
-import os
 import zipfile
-import tarfile
-import pandas as pd
-import numpy as np
-import pickle
-import torch
 from argparse import Namespace
 
-from accelerate import Accelerator
-
 import anndata as ad
+import numpy as np
+import pandas as pd
+import torch
+from accelerate import Accelerator
 
 # Code has hardcoded paths that only work correctly inside the UCE directory
 if os.path.isdir("UCE"):
@@ -21,10 +20,17 @@
     # For Nextflow we need to copy files to the Nextflow working directory
     print(">>> Copying UCE files to local directory...", flush=True)
     import shutil
+
     shutil.copytree("/workspace/UCE", ".", dirs_exist_ok=True)
 
+# Append current directory to import UCE functions
 sys.path.append(".")
-from data_proc.data_utils import process_raw_anndata, get_species_to_pe, get_spec_chrom_csv, adata_path_to_prot_chrom_starts
+from data_proc.data_utils import (
+    adata_path_to_prot_chrom_starts,
+    get_spec_chrom_csv,
+    get_species_to_pe,
+    process_raw_anndata,
+)
 from evaluate import run_eval
 
 ## VIASH START
@@ -34,12 +40,9 @@
     "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad",
     "output": "output.h5ad",
 }
-meta = {
-    'name': 'uce'
-}
+meta = {"name": "uce"}
 ## VIASH END
 
-
 print(">>> Reading input...", flush=True)
 sys.path.append(meta["resources_dir"])
 from read_anndata_partial import read_anndata
@@ -51,7 +54,7 @@
 elif adata.uns["dataset_organism"] == "mus_musculus":
     species = "mouse"
 else:
-    raise ValueError(f"Species '{adata.uns['dataset_organism']} not yet implemented")
+    raise ValueError(f"Species '{adata.uns['dataset_organism']}' not yet implemented")
 
 print("\n>>> Creating working directory...", flush=True)
 work_dir = tempfile.TemporaryDirectory()
@@ -82,7 +85,9 @@
 print(f"Model directory: '{model_dir}'", flush=True)
 
 print("Extracting protein embeddings...", flush=True)
-with tarfile.open(os.path.join(model_dir, "protein_embeddings.tar.gz"), "r:gz") as tar_file:
+with tarfile.open(
+    os.path.join(model_dir, "protein_embeddings.tar.gz"), "r:gz"
+) as tar_file:
     tar_file.extractall("./model_files")
 protein_embeddings_dir = os.path.join("./model_files", "protein_embeddings")
 print(f"Protein embeddings directory: '{protein_embeddings_dir}'", flush=True)
@@ -90,15 +95,15 @@
 # The following sections implement methods in the UCE.evaluate.AnndataProcessor
 # class due to the object not being compatible with the Open Problems setup
 model_args = {
-    "dir" : work_dir.name + "/",
-    "skip" : True,
-    "filter" : False, # Turn this off to get embedding for all cells
-    "name" : "input",
-    "offset_pkl_path" : os.path.join(model_dir, "species_offsets.pkl"),
-    "spec_chrom_csv_path" : os.path.join(model_dir, "species_chrom.csv"),
-    "pe_idx_path" : os.path.join(work_dir.name, "input_pe_row_idxs.pt"),
-    "chroms_path" : os.path.join(work_dir.name, "input_chroms.pkl"),
-    "starts_path" : os.path.join(work_dir.name, "input_starts.pkl"),
+    "dir": work_dir.name + "/",
+    "skip": True,
+    "filter": False,  # Keep additional filtering off (False) to get embeddings for all cells
+    "name": "input",
+    "offset_pkl_path": os.path.join(model_dir, "species_offsets.pkl"),
+    "spec_chrom_csv_path": os.path.join(model_dir, "species_chrom.csv"),
+    "pe_idx_path": os.path.join(work_dir.name, "input_pe_row_idxs.pt"),
+    "chroms_path": os.path.join(work_dir.name, "input_chroms.pkl"),
+    "starts_path": os.path.join(work_dir.name, "input_starts.pkl"),
 }
 
 # AnndataProcessor.preprocess_anndata()
@@ -113,13 +118,13 @@
 row.species = species
 
 processed_adata, num_cells, num_genes = process_raw_anndata(
-    row = row,
-    h5_folder_path = model_args["dir"],
-    npz_folder_path = model_args["dir"],
-    scp = "",
-    skip = model_args["skip"],
-    additional_filter = model_args["filter"],
-    root = model_args["dir"]
+    row=row,
+    h5_folder_path=model_args["dir"],
+    npz_folder_path=model_args["dir"],
+    scp="",
+    skip=model_args["skip"],
+    additional_filter=model_args["filter"],
+    root=model_args["dir"],
 )
 
 # AnndataProcessor.generate_idxs()
@@ -142,20 +147,20 @@
 # AnndataProcessor.run_evaluation()
 print("\n>>> Evaluating model...", flush=True)
 model_parameters = Namespace(
-    token_dim = 5120,
-    d_hid = 5120,
-    nlayers = 33, # Small model = 4, full model = 33
-    output_dim = 1280,
-    multi_gpu= False,
-    token_file = os.path.join(model_dir, "all_tokens.torch"),
-    dir = model_args["dir"],
-    pad_length = 1536,
-    sample_size = 1024,
-    cls_token_idx = 3,
-    CHROM_TOKEN_OFFSET = 143574,
-    chrom_token_right_idx = 2,
-    chrom_token_left_idx = 1,
-    pad_token_idx = 0
+    token_dim=5120,
+    d_hid=5120,
+    nlayers=33,  # Small model = 4, full model = 33
+    output_dim=1280,
+    multi_gpu=False,
+    token_file=os.path.join(model_dir, "all_tokens.torch"),
+    dir=model_args["dir"],
+    pad_length=1536,
+    sample_size=1024,
+    cls_token_idx=3,
+    CHROM_TOKEN_OFFSET=143574,
+    chrom_token_right_idx=2,
+    chrom_token_left_idx=1,
+    pad_token_idx=0,
 )
 
 if model_parameters.nlayers == 4:
@@ -169,14 +174,14 @@
 accelerator.wait_for_everyone()
 shapes_dict = {model_args["name"]: (num_cells, num_genes)}
 run_eval(
-    adata = processed_adata,
-    name = model_args["name"],
-    pe_idx_path = model_args["pe_idx_path"],
-    chroms_path = model_args["chroms_path"],
-    starts_path = model_args["starts_path"],
-    shapes_dict = shapes_dict,
-    accelerator = accelerator,
-    args = model_parameters
+    adata=processed_adata,
+    name=model_args["name"],
+    pe_idx_path=model_args["pe_idx_path"],
+    chroms_path=model_args["chroms_path"],
+    starts_path=model_args["starts_path"],
+    shapes_dict=shapes_dict,
+    accelerator=accelerator,
+    args=model_parameters,
 )
 
 print("\n>>> Storing output...", flush=True)
@@ -196,7 +201,7 @@
 print(output)
 
 print("\n>>> Writing output AnnData to file...", flush=True)
-output.write_h5ad(par['output'], compression='gzip')
+output.write_h5ad(par["output"], compression="gzip")
 
 print("\n>>> Cleaning up temporary directories...", flush=True)
 work_dir.cleanup()