diff --git a/common b/common
index 62268aab..b60eda08 160000
--- a/common
+++ b/common
@@ -1 +1 @@
-Subproject commit 62268aab3ea7aeac754a9b134755b83d18e43975
+Subproject commit b60eda085e9cd505ec169fa30cc7e919e8563ad2
diff --git a/src/methods/geneformer/config.vsh.yaml b/src/methods/geneformer/config.vsh.yaml
index 6be7998c..a98ab008 100644
--- a/src/methods/geneformer/config.vsh.yaml
+++ b/src/methods/geneformer/config.vsh.yaml
@@ -21,20 +21,24 @@ info:
   method_types: [embedding]
   variants:
     geneformer_12L_95M_i4096:
-      model: "gf-12L-95M-i4096"
+      model: gf-12L-95M-i4096
     geneformer_6L_30M_i2048:
-      model: "gf-6L-30M-i2048"
+      model: gf-6L-30M-i2048
     geneformer_12L_30M_i2048:
-      model: "gf-12L-30M-i2048"
+      model: gf-12L-30M-i2048
     geneformer_20L_95M_i4096:
-      model: "gf-20L-95M-i4096"
+      model: gf-20L-95M-i4096
 
 arguments:
-  - name: "--model"
-    type: "string"
+  - name: --model
+    type: string
     description: String representing the Geneformer model to use
-    choices: ["gf-6L-30M-i2048", "gf-12L-30M-i2048", "gf-12L-95M-i4096", "gf-20L-95M-i4096"]
-    default: "gf-12L-95M-i4096"
+    choices:
+      - gf-6L-30M-i2048
+      - gf-12L-30M-i2048
+      - gf-12L-95M-i4096
+      - gf-20L-95M-i4096
+    default: gf-12L-95M-i4096
 
 resources:
   - type: python_script
@@ -48,9 +52,9 @@ engines:
     setup:
       - type: python
         pip:
-        - pyarrow<15.0.0a0,>=14.0.1
-        - huggingface_hub
-        - git+https://huggingface.co/ctheodoris/Geneformer.git
+          - pyarrow<15.0.0a0,>=14.0.1
+          - huggingface_hub
+          - git+https://huggingface.co/ctheodoris/Geneformer.git
 
 runners:
   - type: executable
diff --git a/src/methods/geneformer/script.py b/src/methods/geneformer/script.py
index 521b8f5c..902a7735 100644
--- a/src/methods/geneformer/script.py
+++ b/src/methods/geneformer/script.py
@@ -23,22 +23,24 @@
 print(">>> Reading input...", flush=True)
 
 sys.path.append(meta["resources_dir"])
-from read_anndata_partial import read_anndata
 from exit_codes import exit_non_applicable
+from read_anndata_partial import read_anndata
 
 adata = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns")
 
 if adata.uns["dataset_organism"] != "homo_sapiens":
     exit_non_applicable(
         f"Geneformer can only be used with human data "
-        f"(dataset_organism == \"{adata.uns['dataset_organism']}\")"
+        f'(dataset_organism == "{adata.uns["dataset_organism"]}")'
     )
 
 # Set adata.var_names to gene IDs
 adata.var_names = adata.var["feature_id"]
 is_ensembl = all(var_name.startswith("ENSG") for var_name in adata.var_names)
 if not is_ensembl:
-    raise ValueError(f"Geneformer requires adata.var_names to contain ENSEMBL gene ids")
+    exit_non_applicable(
+        "Geneformer requires adata.var_names to contain ENSEMBL gene ids"
+    )
 
 print(f">>> Getting settings for model '{par['model']}'...", flush=True)
 model_split = par["model"].split("-")
@@ -97,18 +99,42 @@
 adata.write_h5ad(os.path.join(input_dir, "input.h5ad"))
 print(adata)
 
+
+# Function to try parallel execution and fall back to a single processor if it fails
+def tryParallelFunction(fun, label):
+    try:
+        return fun(nproc=n_processors)
+    except RuntimeError as e:
+        # Retry with nproc=1 if error message contains "One of the subprocesses has abruptly died"
+        if "subprocess" in str(e) and "died" in str(e):
+            print(f"{label} failed. Error message: {e}", flush=True)
Error message: {e}", flush=True) + print("Retrying with nproc=1", flush=True) + fun(nproc=1) + else: + raise e + + print(">>> Tokenizing data...", flush=True) special_token = model_details["dataset"] == "95M" print(f"Input size: {model_details['input_size']}, Special token: {special_token}") -tokenizer = TranscriptomeTokenizer( - nproc=n_processors, - model_input_size=model_details["input_size"], - special_token=special_token, - gene_median_file=dictionary_files["gene_median"], - token_dictionary_file=dictionary_files["token"], - gene_mapping_file=dictionary_files["ensembl_mapping"], -) -tokenizer.tokenize_data(input_dir, tokenized_dir, "tokenized", file_format="h5ad") + + +def tokenize_data(nproc): + tokenizer = TranscriptomeTokenizer( + nproc=nproc, + model_input_size=model_details["input_size"], + special_token=special_token, + gene_median_file=dictionary_files["gene_median"], + token_dictionary_file=dictionary_files["token"], + gene_mapping_file=dictionary_files["ensembl_mapping"], + ) + + tokenizer.tokenize_data(input_dir, tokenized_dir, "tokenized", file_format="h5ad") + + return tokenizer + + +tokenizer = tryParallelFunction(tokenize_data, "Tokenizing data") print(f">>> Getting model files for model '{par['model']}'...", flush=True) model_files = { diff --git a/src/methods/scgpt_finetuned/config.vsh.yaml b/src/methods/scgpt_finetuned/config.vsh.yaml index 6b055337..1054e496 100644 --- a/src/methods/scgpt_finetuned/config.vsh.yaml +++ b/src/methods/scgpt_finetuned/config.vsh.yaml @@ -44,6 +44,7 @@ resources: path: script.py - path: /src/utils/read_anndata_partial.py - path: scgpt_functions.py + - path: /src/utils/exit_codes.py engines: - type: docker diff --git a/src/methods/scgpt_finetuned/script.py b/src/methods/scgpt_finetuned/script.py index 7bee08bf..7123bd33 100644 --- a/src/methods/scgpt_finetuned/script.py +++ b/src/methods/scgpt_finetuned/script.py @@ -30,6 +30,7 @@ sys.path.append(meta["resources_dir"]) from read_anndata_partial import read_anndata +from exit_codes import exit_non_applicable from scgpt_functions import evaluate, prepare_data, prepare_dataloader, train print(f"====== scGPT version {scgpt.__version__} ======", flush=True) @@ -39,7 +40,7 @@ adata = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns") if adata.uns["dataset_organism"] != "homo_sapiens": - raise ValueError( + exit_non_applicable( f"scGPT can only be used with human data " f"(dataset_organism == \"{adata.uns['dataset_organism']}\")" ) diff --git a/src/methods/scprint/config.vsh.yaml b/src/methods/scprint/config.vsh.yaml index b3ce9d4b..1f2cad83 100644 --- a/src/methods/scprint/config.vsh.yaml +++ b/src/methods/scprint/config.vsh.yaml @@ -1,4 +1,4 @@ -__merge__: /src/api/base_method.yaml +__merge__: /src/api/comp_method.yaml name: scprint label: scPRINT @@ -38,6 +38,11 @@ info: model_name: "medium" scprint_small: model_name: "small" + test_setup: + run: + model_name: small + batch_size: 16 + max_len: 100 arguments: - name: "--model_name" @@ -49,6 +54,14 @@ arguments: type: file description: Path to the scPRINT model. required: false + - name: --batch_size + type: integer + description: The size of the batches to be used in the DataLoader. + default: 64 + - name: --max_len + type: integer + description: The maximum length of the gene sequence. 
+    default: 4000
 
 resources:
   - type: python_script
@@ -79,4 +92,4 @@ runners:
   - type: executable
   - type: nextflow
     directives:
-      label: [hightime, midmem, midcpu, gpu]
+      label: [hightime, midmem, midcpu, gpu, midsharedmem]
diff --git a/src/methods/scprint/script.py b/src/methods/scprint/script.py
index e76d6f39..5f0c95e8 100644
--- a/src/methods/scprint/script.py
+++ b/src/methods/scprint/script.py
@@ -1,12 +1,13 @@
-import anndata as ad
-from scdataloader import Preprocessor
+import os
 import sys
-from huggingface_hub import hf_hub_download
-from scprint.tasks import Embedder
-from scprint import scPrint
+
+import anndata as ad
 import scprint
 import torch
-import os
+from huggingface_hub import hf_hub_download
+from scdataloader import Preprocessor
+from scprint import scPrint
+from scprint.tasks import Embedder
 
 ## VIASH START
 par = {
@@ -19,8 +20,8 @@
 ## VIASH END
 
 sys.path.append(meta["resources_dir"])
-from read_anndata_partial import read_anndata
 from exit_codes import exit_non_applicable
+from read_anndata_partial import read_anndata
 
 print(f"====== scPRINT version {scprint.__version__} ======", flush=True)
 
@@ -41,7 +42,7 @@
 
 print("\n>>> Preprocessing data...", flush=True)
 preprocessor = Preprocessor(
-    min_valid_genes_id=min(0.9 * adata.n_vars, 10000), # 90% of features up to 10,000
+    min_valid_genes_id=min(0.9 * adata.n_vars, 10000),  # 90% of features up to 10,000
     # Turn off cell filtering to return results for all cells
     filter_cell_by_counts=False,
     min_nnz_genes=False,
@@ -77,7 +78,8 @@
 print(f"Using {n_cores_available} worker cores")
 embedder = Embedder(
     how="random expr",
-    max_len=4000,
+    batch_size=par["batch_size"],
+    max_len=par["max_len"],
     add_zero_genes=0,
     num_workers=n_cores_available,
     doclass=False,
diff --git a/src/metrics/asw_label/config.vsh.yaml b/src/metrics/asw_label/config.vsh.yaml
index 85835857..0392e4e3 100644
--- a/src/metrics/asw_label/config.vsh.yaml
+++ b/src/metrics/asw_label/config.vsh.yaml
@@ -38,4 +38,4 @@ runners:
   - type: executable
   - type: nextflow
     directives:
-      label: [midtime, midmem, lowcpu]
+      label: [hightime, midmem, lowcpu]
diff --git a/src/metrics/isolated_label_asw/config.vsh.yaml b/src/metrics/isolated_label_asw/config.vsh.yaml
index 23b3816b..799aa4ed 100644
--- a/src/metrics/isolated_label_asw/config.vsh.yaml
+++ b/src/metrics/isolated_label_asw/config.vsh.yaml
@@ -39,4 +39,4 @@ runners:
   - type: executable
   - type: nextflow
     directives:
-      label: [midtime, midmem, lowcpu]
+      label: [hightime, midmem, lowcpu]
diff --git a/src/metrics/kbet/config.vsh.yaml b/src/metrics/kbet/config.vsh.yaml
index bbbc6124..8f83724e 100644
--- a/src/metrics/kbet/config.vsh.yaml
+++ b/src/metrics/kbet/config.vsh.yaml
@@ -58,4 +58,4 @@ runners:
  - type: executable
  - type: nextflow
    directives:
-      label: [hightime, highmem, lowcpu]
+      label: [hightime, veryhighmem, lowcpu]
diff --git a/src/metrics/kbet/script.py b/src/metrics/kbet/script.py
index 89bd799e..e5ecc270 100644
--- a/src/metrics/kbet/script.py
+++ b/src/metrics/kbet/script.py
@@ -30,7 +30,7 @@
     type_="embed",
     embed="X_emb",
     scaled=True,
-    verbose=False,
+    verbose=True,
 )
 
 print(score, flush=True)