From 4f5ba9506db3e8f50ee5d1bcb65e09a374c1354b Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 30 Oct 2024 12:08:49 +0100 Subject: [PATCH 1/9] Create UCE component files --- src/methods/uce/config.vsh.yaml | 42 +++++++++++++++++++++++++++++++++ src/methods/uce/script.py | 31 ++++++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 src/methods/uce/config.vsh.yaml create mode 100644 src/methods/uce/script.py diff --git a/src/methods/uce/config.vsh.yaml b/src/methods/uce/config.vsh.yaml new file mode 100644 index 00000000..fa159578 --- /dev/null +++ b/src/methods/uce/config.vsh.yaml @@ -0,0 +1,42 @@ +__merge__: ../../api/comp_method.yaml + +name: uce +label: UCE +summary: UCE offers a unified biological latent space that can represent any cell +description: | + Universal Cell Embedding (UCE) is a single-cell foundation model that offers a + unified biological latent space that can represent any cell, regardless of + tissue or species +references: + doi: + - 10.1101/2023.11.28.568918 +links: + documentation: https://github.com/snap-stanford/UCE/blob/main/README.md + repository: https://github.com/snap-stanford/UCE + +info: + method_types: [embedding] + preferred_normalization: counts + +# Component-specific parameters (optional) +# arguments: +# - name: "--n_neighbors" +# type: "integer" +# default: 5 +# description: Number of neighbors to use. + +resources: + - type: python_script + path: script.py + +engines: + - type: docker + image: openproblems/base_pytorch_nvidia:1.0.0 + setup: + - type: docker + run: "git clone https://github.com/snap-stanford/UCE.git" +runners: + - type: executable + - type: nextflow + directives: + label: [midtime,midmem,midcpu,gpu] diff --git a/src/methods/uce/script.py b/src/methods/uce/script.py new file mode 100644 index 00000000..92780f93 --- /dev/null +++ b/src/methods/uce/script.py @@ -0,0 +1,31 @@ +import anndata as ad + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. +par = { + 'input': 'resources_test/.../input.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'name': 'my_python_method' +} +## VIASH END + +print('Reading input files', flush=True) +input = ad.read_h5ad(par['input']) + +print('Preprocess data', flush=True) +# ... preprocessing ... + +print('Train model', flush=True) +# ... train model ... + +print('Generate predictions', flush=True) +# ... generate predictions ... + +print("Write output AnnData to file", flush=True) +output = ad.AnnData( + +) +output.write_h5ad(par['output'], compression='gzip') From 68fa6192e33c22f9586c43a60bc3a4b60123fd5a Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 30 Oct 2024 15:44:53 +0100 Subject: [PATCH 2/9] Add UCE dataset preprocessing --- src/methods/uce/config.vsh.yaml | 19 +++--- src/methods/uce/script.py | 104 +++++++++++++++++++++++++++++--- 2 files changed, 106 insertions(+), 17 deletions(-) diff --git a/src/methods/uce/config.vsh.yaml b/src/methods/uce/config.vsh.yaml index fa159578..2301ef51 100644 --- a/src/methods/uce/config.vsh.yaml +++ b/src/methods/uce/config.vsh.yaml @@ -1,4 +1,4 @@ -__merge__: ../../api/comp_method.yaml +__merge__: ../../api/base_method.yaml name: uce label: UCE @@ -18,25 +18,28 @@ info: method_types: [embedding] preferred_normalization: counts -# Component-specific parameters (optional) -# arguments: -# - name: "--n_neighbors" -# type: "integer" -# default: 5 -# description: Number of neighbors to use. +arguments: + - name: --model + type: file + description: Path to the directory containing UCE model files or a .zip/.tar.gz archive + required: true resources: - type: python_script path: script.py + - path: /src/utils/read_anndata_partial.py engines: - type: docker image: openproblems/base_pytorch_nvidia:1.0.0 setup: + - type: python + pypi: + - accelerate==0.24.0 - type: docker run: "git clone https://github.com/snap-stanford/UCE.git" runners: - type: executable - type: nextflow directives: - label: [midtime,midmem,midcpu,gpu] + label: [midtime, midmem, midcpu, gpu] diff --git a/src/methods/uce/script.py b/src/methods/uce/script.py index 92780f93..9c7261b2 100644 --- a/src/methods/uce/script.py +++ b/src/methods/uce/script.py @@ -1,25 +1,104 @@ +import sys +import tempfile +import os +import zipfile +import tarfile +import pandas as pd +import numpy as np + +from accelerate import Accelerator + import anndata as ad +os.chdir("UCE") +sys.path.append(".") +from data_proc.data_utils import process_raw_anndata + ## VIASH START # Note: this section is auto-generated by viash at runtime. To edit it, make changes # in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. par = { - 'input': 'resources_test/.../input.h5ad', - 'output': 'output.h5ad' + "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", + "output": "output.h5ad", } meta = { - 'name': 'my_python_method' + 'name': 'uce' } ## VIASH END -print('Reading input files', flush=True) -input = ad.read_h5ad(par['input']) -print('Preprocess data', flush=True) -# ... preprocessing ... +print(">>> Reading input...", flush=True) +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + +adata = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns") + +print("\n>>> Creating working directory...", flush=True) +work_dir = tempfile.TemporaryDirectory() +print(f"Working directory: '{work_dir.name}'", flush=True) -print('Train model', flush=True) -# ... train model ... +print("\n>>> Getting model files...", flush=True) +if os.path.isdir(par["model"]): + model_temp = None + model_dir = par["model"] +else: + model_temp = tempfile.TemporaryDirectory() + model_dir = model_temp.name + + if zipfile.is_zipfile(par["model"]): + print("Extracting UCE model from .zip...", flush=True) + with zipfile.ZipFile(par["model"], "r") as zip_file: + zip_file.extractall(model_dir) + elif tarfile.is_tarfile(par["model"]) and par["model"].endswith(".tar.gz"): + print("Extracting model from .tar.gz...", flush=True) + with tarfile.open(par["model"], "r:gz") as tar_file: + tar_file.extractall(model_dir) + model_dir = os.path.join(model_dir, os.listdir(model_dir)[0]) + else: + raise ValueError( + f"The 'model' argument should be a directory a .zip file or a .tar.gz file" + ) + +print("Extracting protein embeddings...", flush=True) +with tarfile.open(os.path.join(model_dir, "protein_embeddings.tar.gz"), "r:gz") as tar_file: + tar_file.extractall("./model_files") +print(f"Model directory: '{model_dir}'", flush=True) + +model_args = { + "dir" : work_dir.name, + "skip" : True, + "filter" : False # Turn this off to get embedding for all cells +} + +accelerator = Accelerator(model_args["dir"]) + +print("\n>>> Preprocessing data...", flush=True) +# Set var names to gene symbols +adata.var_names = adata.var["feature_name"] +adata.write_h5ad(os.path.join(model_args["dir"], "input.h5ad")) + +row = pd.Series() +row.path = "input.h5ad" +row.covar_col = np.nan +if adata.uns["dataset_organism"] == "homo_sapiens": + row.species = "human" +elif adata.uns["dataset_organism"] == "mus_musculus": + row.species = "mouse" +else: + raise ValueError(f"Species '{adata.uns['dataset_organism']} not yet implemented") + +processed_adata, num_cells, num_genes = process_raw_anndata( + row = row, + h5_folder_path = model_args["dir"], + npz_folder_path = model_args["dir"], + scp = "", + skip = model_args["skip"], + additional_filter = model_args["filter"], + root = model_args["dir"] +) + +# processor.generate_idxs() +# processor.run_evaluation() print('Generate predictions', flush=True) # ... generate predictions ... @@ -29,3 +108,10 @@ ) output.write_h5ad(par['output'], compression='gzip') + +print("\n>>> Cleaning up temporary directories...", flush=True) +work_dir.cleanup() +if model_temp is not None: + model_temp.cleanup() + +print("\n>>> Done!", flush=True) From 02a395f3e10a1fad6eac58bf44f76f7a2bd72490 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 30 Oct 2024 16:18:25 +0100 Subject: [PATCH 3/9] Generate UCE indexes --- src/methods/uce/script.py | 54 +++++++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 11 deletions(-) diff --git a/src/methods/uce/script.py b/src/methods/uce/script.py index 9c7261b2..628ac394 100644 --- a/src/methods/uce/script.py +++ b/src/methods/uce/script.py @@ -5,6 +5,8 @@ import tarfile import pandas as pd import numpy as np +import pickle +import torch from accelerate import Accelerator @@ -12,7 +14,7 @@ os.chdir("UCE") sys.path.append(".") -from data_proc.data_utils import process_raw_anndata +from data_proc.data_utils import process_raw_anndata, get_species_to_pe, get_spec_chrom_csv, adata_path_to_prot_chrom_starts ## VIASH START # Note: this section is auto-generated by viash at runtime. To edit it, make changes @@ -22,7 +24,7 @@ "output": "output.h5ad", } meta = { - 'name': 'uce' + 'name': 'uce' } ## VIASH END @@ -33,6 +35,13 @@ adata = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns") +if adata.uns["dataset_organism"] == "homo_sapiens": + species = "human" +elif adata.uns["dataset_organism"] == "mus_musculus": + species = "mouse" +else: + raise ValueError(f"Species '{adata.uns['dataset_organism']} not yet implemented") + print("\n>>> Creating working directory...", flush=True) work_dir = tempfile.TemporaryDirectory() print(f"Working directory: '{work_dir.name}'", flush=True) @@ -59,19 +68,31 @@ f"The 'model' argument should be a directory a .zip file or a .tar.gz file" ) +print(f"Model directory: '{model_dir}'", flush=True) + print("Extracting protein embeddings...", flush=True) with tarfile.open(os.path.join(model_dir, "protein_embeddings.tar.gz"), "r:gz") as tar_file: tar_file.extractall("./model_files") -print(f"Model directory: '{model_dir}'", flush=True) +protein_embeddings_dir = os.path.join("./model_files", "protein_embeddings") +print(f"Protein embeddings directory: '{protein_embeddings_dir}'", flush=True) +# The following sections implement methods in the UCE.evaluate.AnndataProcessor +# class due to the object not being compatible with the Open Problems setup model_args = { "dir" : work_dir.name, "skip" : True, - "filter" : False # Turn this off to get embedding for all cells + "filter" : False, # Turn this off to get embedding for all cells + "name" : "input", + "offset_pkl_path" : os.path.join(model_dir, "species_offsets.pkl"), + "spec_chrom_csv_path" : os.path.join(model_dir, "species_chrom.csv"), + "pe_idx_path" : os.path.join(work_dir.name, "input_pe_row_idxs.pt"), + "chroms_path" : os.path.join(work_dir.name, "input_chroms.pkl"), + "starts_path" : os.path.join(work_dir.name, "input_starts.pkl"), } accelerator = Accelerator(model_args["dir"]) +# AnndataProcessor.preprocess_anndata() print("\n>>> Preprocessing data...", flush=True) # Set var names to gene symbols adata.var_names = adata.var["feature_name"] @@ -80,12 +101,7 @@ row = pd.Series() row.path = "input.h5ad" row.covar_col = np.nan -if adata.uns["dataset_organism"] == "homo_sapiens": - row.species = "human" -elif adata.uns["dataset_organism"] == "mus_musculus": - row.species = "mouse" -else: - raise ValueError(f"Species '{adata.uns['dataset_organism']} not yet implemented") +row.species = species processed_adata, num_cells, num_genes = process_raw_anndata( row = row, @@ -97,7 +113,23 @@ root = model_args["dir"] ) -# processor.generate_idxs() +# AnndataProcessor.generate_idxs() +print("\n>>> Generating indexes...", flush=True) +species_to_pe = get_species_to_pe(protein_embeddings_dir) +with open(model_args["offset_pkl_path"], "rb") as f: + species_to_offsets = pickle.load(f) +gene_to_chrom_pos = get_spec_chrom_csv(model_args["spec_chrom_csv_path"]) +spec_pe_genes = list(species_to_pe[species].keys()) +offset = species_to_offsets[species] +pe_row_idxs, dataset_chroms, dataset_pos = adata_path_to_prot_chrom_starts( + processed_adata, species, spec_pe_genes, gene_to_chrom_pos, offset +) +torch.save({model_args["name"]: pe_row_idxs}, model_args["pe_idx_path"]) +with open(model_args["chroms_path"], "wb+") as f: + pickle.dump({model_args["name"]: dataset_chroms}, f) +with open(model_args["starts_path"], "wb+") as f: + pickle.dump({model_args["name"]: dataset_pos}, f) + # processor.run_evaluation() print('Generate predictions', flush=True) From 3332f0273a14060a1f6619bba8eac70d17b7133a Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Thu, 31 Oct 2024 08:37:32 +0100 Subject: [PATCH 4/9] Evaluate UCE model and output results --- src/methods/uce/script.py | 65 ++++++++++++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 8 deletions(-) diff --git a/src/methods/uce/script.py b/src/methods/uce/script.py index 628ac394..b70d521c 100644 --- a/src/methods/uce/script.py +++ b/src/methods/uce/script.py @@ -7,14 +7,17 @@ import numpy as np import pickle import torch +from argparse import Namespace from accelerate import Accelerator import anndata as ad +# Code has hardcoded paths that only work correctly inside the UCE directory os.chdir("UCE") sys.path.append(".") from data_proc.data_utils import process_raw_anndata, get_species_to_pe, get_spec_chrom_csv, adata_path_to_prot_chrom_starts +from evaluate import run_eval ## VIASH START # Note: this section is auto-generated by viash at runtime. To edit it, make changes @@ -79,7 +82,7 @@ # The following sections implement methods in the UCE.evaluate.AnndataProcessor # class due to the object not being compatible with the Open Problems setup model_args = { - "dir" : work_dir.name, + "dir" : work_dir.name + "/", "skip" : True, "filter" : False, # Turn this off to get embedding for all cells "name" : "input", @@ -90,8 +93,6 @@ "starts_path" : os.path.join(work_dir.name, "input_starts.pkl"), } -accelerator = Accelerator(model_args["dir"]) - # AnndataProcessor.preprocess_anndata() print("\n>>> Preprocessing data...", flush=True) # Set var names to gene symbols @@ -130,15 +131,63 @@ with open(model_args["starts_path"], "wb+") as f: pickle.dump({model_args["name"]: dataset_pos}, f) -# processor.run_evaluation() +# AnndataProcessor.run_evaluation() +print("\n>>> Evaluating model...", flush=True) +model_parameters = Namespace( + token_dim = 5120, + d_hid = 5120, + nlayers = 4, # Small model = 4, full model = 33 + output_dim = 1280, + multi_gpu= False, + token_file = os.path.join(model_dir, "all_tokens.torch"), + dir = model_args["dir"], + pad_length = 1536, + sample_size = 1024, + cls_token_idx = 3, + CHROM_TOKEN_OFFSET = 143574, + chrom_token_right_idx = 2, + chrom_token_left_idx = 1, + pad_token_idx = 0 +) -print('Generate predictions', flush=True) -# ... generate predictions ... +if model_parameters.nlayers == 4: + model_parameters.model_loc = os.path.join(model_dir, "4layer_model.torch") + model_parameters.batch_size = 100 +else: + model_parameters.model_loc = os.path.join(model_dir, "33l_8ep_1024t_1280.torch") + model_parameters.batch_size = 25 + +accelerator = Accelerator(project_dir=model_args["dir"]) +accelerator.wait_for_everyone() +shapes_dict = {model_args["name"]: (num_cells, num_genes)} +run_eval( + adata = processed_adata, + name = model_args["name"], + pe_idx_path = model_args["pe_idx_path"], + chroms_path = model_args["chroms_path"], + starts_path = model_args["starts_path"], + shapes_dict = shapes_dict, + accelerator = accelerator, + args = model_parameters +) -print("Write output AnnData to file", flush=True) +print("\n>>> Storing output...", flush=True) +embedded_adata = ad.read_h5ad(os.path.join(model_args["dir"], "input_uce_adata.h5ad")) output = ad.AnnData( - + obs=adata.obs[[]], + var=adata.var[[]], + obsm={ + "X_emb": embedded_adata.obsm["X_uce"], + }, + uns={ + "dataset_id": adata.uns["dataset_id"], + "normalization_id": adata.uns["normalization_id"], + "method_id": meta["name"], + }, ) +print(output) + +print("\n>>> Writing output AnnData to file...", flush=True) output.write_h5ad(par['output'], compression='gzip') print("\n>>> Cleaning up temporary directories...", flush=True) From 7716426d7dbe7cfe98c13634831441f26eec9089 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Thu, 31 Oct 2024 09:57:21 +0100 Subject: [PATCH 5/9] Add UCE to benchmark workflow --- src/workflows/run_benchmark/config.vsh.yaml | 1 + src/workflows/run_benchmark/main.nf | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index 51e482ab..222361d9 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -95,6 +95,7 @@ dependencies: - name: methods/scanvi - name: methods/scimilarity - name: methods/scvi + - name: methods/uce # metrics - name: metrics/asw_batch - name: metrics/asw_label diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index 69322a1a..c7360cee 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -31,7 +31,10 @@ methods = [ scimilarity.run( args: [model: file("s3://openproblems-work/cache/scimilarity-model_v1.1.tar.gz")] ), - scvi + scvi, + uce.run( + args: [model: file("s3://openproblems-work/cache/scimilarity-model_v1.1.tar.gz")] + ), ] // construct list of metrics From acc283b39666f4c1449271866be504e365094b7f Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Thu, 31 Oct 2024 10:15:33 +0100 Subject: [PATCH 6/9] Fix UCE model path in benchmark workflow --- src/workflows/run_benchmark/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index c7360cee..8ef69c6b 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -33,7 +33,7 @@ methods = [ ), scvi, uce.run( - args: [model: file("s3://openproblems-work/cache/scimilarity-model_v1.1.tar.gz")] + args: [model: file("s3://openproblems-work/cache/uce-model-v5.zip")] ), ] From cd14b3a4bfae5883a676d9966f4d313cf1177098 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Thu, 31 Oct 2024 11:16:11 +0100 Subject: [PATCH 7/9] Copy UCE files to working directory for Nextflow --- src/methods/uce/script.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/methods/uce/script.py b/src/methods/uce/script.py index b70d521c..658d54aa 100644 --- a/src/methods/uce/script.py +++ b/src/methods/uce/script.py @@ -14,7 +14,15 @@ import anndata as ad # Code has hardcoded paths that only work correctly inside the UCE directory -os.chdir("UCE") +if os.path.isdir("UCE"): + # For executable we can work inside the UCE directory + os.chdir("UCE") +else: + # For Nextflow we need to copy files to the Nextflow working directory + print(">>> Copying UCE files to local directory...", flush=True) + import shutil + shutil.copytree("/workspace/UCE", ".", dirs_exist_ok=True) + sys.path.append(".") from data_proc.data_utils import process_raw_anndata, get_species_to_pe, get_spec_chrom_csv, adata_path_to_prot_chrom_starts from evaluate import run_eval From ab5bb0751987a98f4c0432ebe6a008cea1dcc23c Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Thu, 31 Oct 2024 11:54:57 +0100 Subject: [PATCH 8/9] Exclude UCE in local benchmark scripts Requires more memory than allowed by the local labels config --- scripts/run_benchmark/run_full_local.sh | 1 + scripts/run_benchmark/run_test_local.sh | 1 + src/methods/uce/script.py | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/run_benchmark/run_full_local.sh b/scripts/run_benchmark/run_full_local.sh index 5c83ddb3..d823d79e 100755 --- a/scripts/run_benchmark/run_full_local.sh +++ b/scripts/run_benchmark/run_full_local.sh @@ -26,6 +26,7 @@ input_states: resources/datasets/**/state.yaml rename_keys: 'input_dataset:output_dataset;input_solution:output_solution' output_state: "state.yaml" publish_dir: "$publish_dir" +settings: '{"methods_exclude": ["uce"]}' HERE # run the benchmark diff --git a/scripts/run_benchmark/run_test_local.sh b/scripts/run_benchmark/run_test_local.sh index d0fba746..2b72eeed 100755 --- a/scripts/run_benchmark/run_test_local.sh +++ b/scripts/run_benchmark/run_test_local.sh @@ -21,6 +21,7 @@ input_states: resources_test/task_batch_integration/**/state.yaml rename_keys: 'input_dataset:output_dataset;input_solution:output_solution' output_state: "state.yaml" publish_dir: "$publish_dir" +settings: '{"methods_exclude": ["uce"]}' HERE nextflow run . \ diff --git a/src/methods/uce/script.py b/src/methods/uce/script.py index 658d54aa..314504a9 100644 --- a/src/methods/uce/script.py +++ b/src/methods/uce/script.py @@ -144,7 +144,7 @@ model_parameters = Namespace( token_dim = 5120, d_hid = 5120, - nlayers = 4, # Small model = 4, full model = 33 + nlayers = 33, # Small model = 4, full model = 33 output_dim = 1280, multi_gpu= False, token_file = os.path.join(model_dir, "all_tokens.torch"), From af5c1a9c377232bfebf9c3d0d2ffdffda97066bf Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Thu, 31 Oct 2024 12:02:32 +0100 Subject: [PATCH 9/9] Style UCE script --- src/methods/uce/script.py | 113 ++++++++++++++++++++------------------ 1 file changed, 59 insertions(+), 54 deletions(-) diff --git a/src/methods/uce/script.py b/src/methods/uce/script.py index 314504a9..24108a94 100644 --- a/src/methods/uce/script.py +++ b/src/methods/uce/script.py @@ -1,17 +1,16 @@ +import os +import pickle import sys +import tarfile import tempfile -import os import zipfile -import tarfile -import pandas as pd -import numpy as np -import pickle -import torch from argparse import Namespace -from accelerate import Accelerator - import anndata as ad +import numpy as np +import pandas as pd +import torch +from accelerate import Accelerator # Code has hardcoded paths that only work correctly inside the UCE directory if os.path.isdir("UCE"): @@ -21,10 +20,17 @@ # For Nextflow we need to copy files to the Nextflow working directory print(">>> Copying UCE files to local directory...", flush=True) import shutil + shutil.copytree("/workspace/UCE", ".", dirs_exist_ok=True) +# Append current directory to import UCE functions sys.path.append(".") -from data_proc.data_utils import process_raw_anndata, get_species_to_pe, get_spec_chrom_csv, adata_path_to_prot_chrom_starts +from data_proc.data_utils import ( + adata_path_to_prot_chrom_starts, + get_spec_chrom_csv, + get_species_to_pe, + process_raw_anndata, +) from evaluate import run_eval ## VIASH START @@ -34,12 +40,9 @@ "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", "output": "output.h5ad", } -meta = { - 'name': 'uce' -} +meta = {"name": "uce"} ## VIASH END - print(">>> Reading input...", flush=True) sys.path.append(meta["resources_dir"]) from read_anndata_partial import read_anndata @@ -51,7 +54,7 @@ elif adata.uns["dataset_organism"] == "mus_musculus": species = "mouse" else: - raise ValueError(f"Species '{adata.uns['dataset_organism']} not yet implemented") + raise ValueError(f"Species '{adata.uns['dataset_organism']}' not yet implemented") print("\n>>> Creating working directory...", flush=True) work_dir = tempfile.TemporaryDirectory() @@ -82,7 +85,9 @@ print(f"Model directory: '{model_dir}'", flush=True) print("Extracting protein embeddings...", flush=True) -with tarfile.open(os.path.join(model_dir, "protein_embeddings.tar.gz"), "r:gz") as tar_file: +with tarfile.open( + os.path.join(model_dir, "protein_embeddings.tar.gz"), "r:gz" +) as tar_file: tar_file.extractall("./model_files") protein_embeddings_dir = os.path.join("./model_files", "protein_embeddings") print(f"Protein embeddings directory: '{protein_embeddings_dir}'", flush=True) @@ -90,15 +95,15 @@ # The following sections implement methods in the UCE.evaluate.AnndataProcessor # class due to the object not being compatible with the Open Problems setup model_args = { - "dir" : work_dir.name + "/", - "skip" : True, - "filter" : False, # Turn this off to get embedding for all cells - "name" : "input", - "offset_pkl_path" : os.path.join(model_dir, "species_offsets.pkl"), - "spec_chrom_csv_path" : os.path.join(model_dir, "species_chrom.csv"), - "pe_idx_path" : os.path.join(work_dir.name, "input_pe_row_idxs.pt"), - "chroms_path" : os.path.join(work_dir.name, "input_chroms.pkl"), - "starts_path" : os.path.join(work_dir.name, "input_starts.pkl"), + "dir": work_dir.name + "/", + "skip": True, + "filter": False, # Turn this off to get embedding for all cells + "name": "input", + "offset_pkl_path": os.path.join(model_dir, "species_offsets.pkl"), + "spec_chrom_csv_path": os.path.join(model_dir, "species_chrom.csv"), + "pe_idx_path": os.path.join(work_dir.name, "input_pe_row_idxs.pt"), + "chroms_path": os.path.join(work_dir.name, "input_chroms.pkl"), + "starts_path": os.path.join(work_dir.name, "input_starts.pkl"), } # AnndataProcessor.preprocess_anndata() @@ -113,13 +118,13 @@ row.species = species processed_adata, num_cells, num_genes = process_raw_anndata( - row = row, - h5_folder_path = model_args["dir"], - npz_folder_path = model_args["dir"], - scp = "", - skip = model_args["skip"], - additional_filter = model_args["filter"], - root = model_args["dir"] + row=row, + h5_folder_path=model_args["dir"], + npz_folder_path=model_args["dir"], + scp="", + skip=model_args["skip"], + additional_filter=model_args["filter"], + root=model_args["dir"], ) # AnndataProcessor.generate_idxs() @@ -142,20 +147,20 @@ # AnndataProcessor.run_evaluation() print("\n>>> Evaluating model...", flush=True) model_parameters = Namespace( - token_dim = 5120, - d_hid = 5120, - nlayers = 33, # Small model = 4, full model = 33 - output_dim = 1280, - multi_gpu= False, - token_file = os.path.join(model_dir, "all_tokens.torch"), - dir = model_args["dir"], - pad_length = 1536, - sample_size = 1024, - cls_token_idx = 3, - CHROM_TOKEN_OFFSET = 143574, - chrom_token_right_idx = 2, - chrom_token_left_idx = 1, - pad_token_idx = 0 + token_dim=5120, + d_hid=5120, + nlayers=33, # Small model = 4, full model = 33 + output_dim=1280, + multi_gpu=False, + token_file=os.path.join(model_dir, "all_tokens.torch"), + dir=model_args["dir"], + pad_length=1536, + sample_size=1024, + cls_token_idx=3, + CHROM_TOKEN_OFFSET=143574, + chrom_token_right_idx=2, + chrom_token_left_idx=1, + pad_token_idx=0, ) if model_parameters.nlayers == 4: @@ -169,14 +174,14 @@ accelerator.wait_for_everyone() shapes_dict = {model_args["name"]: (num_cells, num_genes)} run_eval( - adata = processed_adata, - name = model_args["name"], - pe_idx_path = model_args["pe_idx_path"], - chroms_path = model_args["chroms_path"], - starts_path = model_args["starts_path"], - shapes_dict = shapes_dict, - accelerator = accelerator, - args = model_parameters + adata=processed_adata, + name=model_args["name"], + pe_idx_path=model_args["pe_idx_path"], + chroms_path=model_args["chroms_path"], + starts_path=model_args["starts_path"], + shapes_dict=shapes_dict, + accelerator=accelerator, + args=model_parameters, ) print("\n>>> Storing output...", flush=True) @@ -196,7 +201,7 @@ print(output) print("\n>>> Writing output AnnData to file...", flush=True) -output.write_h5ad(par['output'], compression='gzip') +output.write_h5ad(par["output"], compression="gzip") print("\n>>> Cleaning up temporary directories...", flush=True) work_dir.cleanup()