openproblems-bio · Kraftfahrzeughaftpflichtversicherung · Mar 16, 2025 · Mar 17, 2025 · Mar 17, 2025 · Mar 19, 2025
diff --git a/common b/common
diff --git a/src/datasets/loaders/nsclc_sc_zuani/config.vsh.yaml b/src/datasets/loaders/nsclc_sc_zuani/config.vsh.yaml
@@ -0,0 +1,64 @@
+# Viash config for the `nsclc_sc_zuani` dataset loader (ArrayExpress accession
+# E-MTAB-13526). The component runs `script.py` and writes one h5ad file.
+name: nsclc_sc_zuani
+namespace: datasets/loaders
+
+argument_groups:
+  # Dataset-level metadata; script.py copies each of these values into `adata.uns`.
+  - name: Metadata
+    arguments:
+      - type: string
+        name: --dataset_id
+        description: "A unique identifier for the dataset"
+        required: false
+        default: "E-MTAB-13526" 
+      # NOTE(review): the default repeats the accession rather than a formatted name.
+      - name: --dataset_name
+        type: string
+        description: Nicely formatted name.
+        required: false
+        default: "E-MTAB-13526" 
+      - type: string
+        name: --dataset_url
+        description: Link to the original source of the dataset.
+        required: false
+        default: "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526" 
+      # NOTE(review): the default is the same ArrayExpress URL as --dataset_url,
+      # while the description asks for a Bibtex reference and the development
+      # defaults in script.py use a DOI -- confirm which value is intended.
+      - name: --dataset_reference
+        type: string
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+        default: "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526" 
+      - name: --dataset_summary
+        type: string
+        description: Short description of the dataset.
+        required: false
+        default: "We performed single cell RNA sequencing (scRNA-seq) of NSCLC tumours and matched, adjacent, non-involved lung tissue from 24 patients. The data set is composed of approximately 900,000 cells from two different populations: CD235- (haematopoietic and non-haematopoietic cells depleted of erythrocytes), and CD45+ (all haematopoietic cells)." 
+      # The default below is identical to the --dataset_summary default.
+      - name: --dataset_description
+        type: string
+        description: Long description of the dataset.
+        required: false
+        default: "We performed single cell RNA sequencing (scRNA-seq) of NSCLC tumours and matched, adjacent, non-involved lung tissue from 24 patients. The data set is composed of approximately 900,000 cells from two different populations: CD235- (haematopoietic and non-haematopoietic cells depleted of erythrocytes), and CD45+ (all haematopoietic cells)." 
+      # NOTE(review): script.py writes "Homo sapiens" into `obs["organism"]`, so
+      # with this default `uns["dataset_organism"]` ends up as "human" --
+      # confirm which spelling downstream components expect.
+      - name: --dataset_organism
+        type: string
+        description: The organism of the sample in the dataset.
+        required: false
+        default: "human" 
+  - name: Outputs
+    arguments:
+    # Type and format details are merged in from the shared API file (not visible here).
+    - name: "--output"
+      __merge__: /src/api/file_common_scrnaseq.yaml
+      direction: output
+      required: true
+
+resources:
+  - type: python_script
+    path: script.py
+
+engines:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+    # NOTE(review): the image setup is merged from a txsim-specific partial that
+    # is not visible here -- confirm it installs what script.py imports
+    # (anndata, scanpy, pandas, scipy).
+    __merge__: 
+      - /src/base/setup_txsim_partial.yaml
+  - type: native
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      # Resource labels; their definitions live outside this file.
+      label: [veryhighmem, midcpu, midtime]
diff --git a/src/datasets/loaders/nsclc_sc_zuani/script.py b/src/datasets/loaders/nsclc_sc_zuani/script.py
@@ -0,0 +1,81 @@
+import os
+import subprocess
+import sys
+from collections import defaultdict
+from pathlib import Path
+
+import anndata as ad
+import numpy as np
+import pandas as pd
+import scanpy
+import scipy.sparse
+
+## VIASH START
+
+par = {
+    "output": "EMTAB13526_Lung_sc.h5ad",
+    "dataset_id": "E-MTAB-13526",
+    "dataset_name": "E-MTAB-13526",
+    "dataset_url": "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526",
+    "dataset_reference": "https://doi.org/10.1038/s41467-024-48700-8", 
+    "dataset_summary": "This dataset contains scRNA-seq data from human lung cancer cells.",
+    "dataset_description": "This dataset contains scRNA-seq data from human lung cancer cells.",
+    "dataset_organism": "Homo sapiens"
+}
+
+meta = {
+    "temp_dir": "./tmp/nsclc_sc_zuani/",
+}
+
+## VIASH END
+
+
+#os.system(f'wget "ftp://ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/10X_Lung_Tumour_Annotated_v2.h5ad" -P {meta["temp_dir"]}')
+#adata = ad.read_h5ad( f'{meta["temp_dir"]}10X_Lung_Tumour_Annotated_v2.h5ad', backed="r")
+
+
+TMP_DIR = Path(meta["temp_dir"] or "./tmp")
+TMP_DIR.mkdir(parents=True, exist_ok=True)
+FILE_PATHS = {"file": TMP_DIR / "cropped_sc.h5ad"}
+os.system(f'wget http://192.168.2.46:8000/file/cropped_sc.h5ad -P ./tmp/')
+adata = ad.read_h5ad( './tmp/cropped_sc.h5ad')
+
+genes_sum = adata.X.toarray().sum(0)
+adata = adata[:, genes_sum != 0]
+
+rename_obs_keys = {
+    "cell_type": 'Cell types'}
+adata.obs = adata.obs.rename(columns={old:new for new,old in rename_obs_keys.items()})
+
+
+
+store_info = { 
+    "dataset_id": "E-MTAB-13526",
+    "assay": "Unknown", 
+    "sex": "female, male", 
+    "tissue": "lung",
+    "disease": "lung adenocarcinoma, normal, non-small cell lung cancer, lung squamous cell carcinoma",
+    "organism": "Homo sapiens",
+    "tissue_general": "lung",
+    "development_stage": "adult", 
+}
+for key in ["dataset_id", "tissue", "organism", "tissue_general", "development_stage"]:
+    adata.obs[key] = pd.Categorical([store_info[key]] * adata.n_obs, categories=[store_info[key]])
+
+uns_info = { "dataset_id": "E-MTAB-13526" ,
+              "dataset_name":"E-MTAB-13526" , 
+              "dataset_url":"https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526" ,
+              "dataset_reference": "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526",
+              "dataset_summary": 'none',
+              "dataset_description":'none',
+              "dataset_organism": 'Homo sapiens' 
+}
+
+for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism"]:
+    adata.uns[key] = uns_info[key]
-uns_info = { "dataset_id": "E-MTAB-13526" ,
-              "dataset_name":"E-MTAB-13526" , 
-              "dataset_url":"https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526" ,
-              "dataset_reference": "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526",
-              "dataset_summary": 'none',
-              "dataset_description":'none',
-              "dataset_organism": 'Homo sapiens' 
-}
-
-for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism"]:
-    adata.uns[key] = uns_info[key]
+for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism"]:
+    adata.uns[key] = par[key]
-uns_info = { "dataset_id": "E-MTAB-13526" ,
-              "dataset_name":"E-MTAB-13526" , 
-              "dataset_url":"https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526" ,
-              "dataset_reference": "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526",
-              "dataset_summary": 'none',
-              "dataset_description":'none',
-              "dataset_organism": 'Homo sapiens' 
-}
-
-for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism"]:
-    adata.uns[key] = uns_info[key]
+for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism"]:
+    adata.uns[key] = par[key]
+
+adata.var["gene_symbol"] = adata.var_names
+
+adata.layers['counts'] =  adata.X.toarray()
+del adata.X
+
+print("Writing adata", flush=True)
+adata.write_h5ad(par["output"], compression="gzip")
diff --git a/src/datasets/workflows/process_nsclc_sc_zuani/config.vsh.yaml b/src/datasets/workflows/process_nsclc_sc_zuani/config.vsh.yaml
@@ -0,0 +1,86 @@
+# Viash config for the workflow that loads the nsclc_sc_zuani dataset and runs
+# the processing steps chained in main.nf.
+name: process_nsclc_sc_zuani
+namespace: datasets/workflows
+
+argument_groups:
+  - name: Inputs
+    arguments:
+      # Not required: main.nf never forwards this value to a component and
+      # test.sh does not supply it, so a required argument here blocks the run.
+      - type: file
+        name: --input
+        description: Path to the dataset
+        required: false
+        example: "https://ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/10X_Lung_Tumour_Annotated_v2.h5ad"
+  - name: Caching settings
+    arguments:
+      - type: boolean
+        name: --keep_files
+        required: false
+        description: Whether to keep the downloaded files after processing.
+        default: false
+  # Same arguments and defaults as the nsclc_sc_zuani loader config.
+  - name: Metadata
+    arguments:
+      - type: string
+        name: --dataset_id
+        description: "A unique identifier for the dataset"
+        required: false
+        default: "E-MTAB-13526"
+      - name: --dataset_name
+        type: string
+        description: Nicely formatted name.
+        required: false
+        default: "E-MTAB-13526"
+      - type: string
+        name: --dataset_url
+        description: Link to the original source of the dataset.
+        required: false
+        default: "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526"
+      - name: --dataset_reference
+        type: string
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+        default: "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526"
+      # A default is always available, so these two are marked optional like
+      # the matching loader arguments.
+      - name: --dataset_summary
+        type: string
+        description: Short description of the dataset.
+        required: false
+        default: "We performed single cell RNA sequencing (scRNA-seq) of NSCLC tumours and matched, adjacent, non-involved lung tissue from 24 patients. The data set is composed of approximately 900,000 cells from two different populations: CD235- (haematopoietic and non-haematopoietic cells depleted of erythrocytes), and CD45+ (all haematopoietic cells)."
+      - name: --dataset_description
+        type: string
+        description: Long description of the dataset.
+        required: false
+        default: "We performed single cell RNA sequencing (scRNA-seq) of NSCLC tumours and matched, adjacent, non-involved lung tissue from 24 patients. The data set is composed of approximately 900,000 cells from two different populations: CD235- (haematopoietic and non-haematopoietic cells depleted of erythrocytes), and CD45+ (all haematopoietic cells)."
+      - name: --dataset_organism
+        type: string
+        description: The organism of the sample in the dataset.
+        required: false
+        default: "human"
+  - name: Outputs
+    arguments:
+    # Named `output_dataset` because main.nf looks up the schema of
+    # "output_dataset" and emits that state key, and test.sh sets
+    # `output_dataset` in its params file.
+    - name: "--output_dataset"
+      __merge__: /src/api/file_common_scrnaseq.yaml
+      direction: output
+      required: true
+      default: "$id/dataset.h5ad"
+    - name: "--output_meta"
+      direction: "output"
+      type: file
+      description: "Dataset metadata"
+      default: "$id/dataset_meta.yaml"
+
+resources:
+  - type: nextflow_script
+    path: main.nf
+    entrypoint: run_wf
+  - path: /common/nextflow_helpers/helper.nf
+
+# Components invoked from main.nf.
+dependencies:
+  - name: datasets/loaders/nsclc_sc_zuani
+#  - name: datasets/processors/subsample
+#    repository: openproblems
+  - name: datasets/normalization/log_cp
+    repository: openproblems
+  - name: datasets/processors/pca
+    repository: openproblems
+  - name: datasets/processors/hvg
+    repository: openproblems
+  - name: datasets/processors/knn
+    repository: openproblems
+  - name: h5ad/extract_uns_metadata
+    repository: core
+
+runners:
+  - type: nextflow
+    directives:
+      label: [midcpu, midmem, hightime]
diff --git a/src/datasets/workflows/process_nsclc_sc_zuani/main.nf b/src/datasets/workflows/process_nsclc_sc_zuani/main.nf
@@ -0,0 +1,96 @@
+include { findArgumentSchema } from "${meta.resources_dir}/helper.nf"
+
+// Entry point that builds its input from `findStates(params, meta.config)`
+// and runs this workflow with state publishing enabled.
+workflow auto {
+  findStates(params, meta.config)
+    | meta.workflow.run(
+      auto: [publish: "state"]
+    )
+}
+
+// Load the nsclc_sc_zuani dataset, then normalise it, select HVGs, compute PCA
+// and a KNN graph, and finally extract the `uns` metadata of the result.
+// Takes a channel of [id, state] tuples and emits [id, state] tuples whose
+// state holds `output_dataset` and `output_meta`.
+workflow run_wf {
+  take:
+  input_ch
+
+  main:
+  output_ch = input_ch
+
+    // copy id to the state
+    | map{ id, state ->
+      def new_state = state + [dataset_id: id]
+      [id, new_state]
+    }
+
+    // Only arguments declared by the loader are forwarded; the stale
+    // "cancer_subtypes" key, which no config in this change declares, is gone.
+    | nsclc_sc_zuani.run(
+      fromState: [
+        "dataset_id",
+        "dataset_name",
+        "dataset_url",
+        "dataset_reference",
+        "dataset_summary",
+        "dataset_description",
+        "dataset_organism",
+      ],
+      toState: [
+        "output_raw": "output"
+      ]
+    )
+
+    | log_cp.run(
+      key: "log_cp10k",
+      fromState: [
+        "input": "output_raw"
+      ],
+      args: [
+        "normalization_id": "log_cp10k",
+        "n_cp": 10000
+      ],
+      toState: [
+        "output_normalized": "output"
+      ]
+    )
+    | hvg.run(
+      fromState: ["input": "output_normalized"],
+      toState: ["output_hvg": "output"]
+    )
+
+    | pca.run(
+      fromState: ["input": "output_hvg"],
+      toState: ["output_pca": "output" ]
+    )
+
+    | knn.run(
+      fromState: ["input": "output_pca"],
+      toState: ["output_knn": "output"]
+    )
+    // add synonym
+    | map{ id, state ->
+      [id, state + [output_dataset: state.output_knn]]
+    }
+
+    // Write the schema of the `output_dataset` argument to a temporary YAML
+    // file and pass it, together with the dataset, to extract_uns_metadata.
+    | extract_uns_metadata.run(
+      fromState: { id, state ->
+        def schema = findArgumentSchema(meta.config, "output_dataset")
+        // workaround: convert GString to String
+        schema = iterateMap(schema, { it instanceof GString ? it.toString() : it })
+        def schemaYaml = tempFile("schema.yaml")
+        writeYaml(schema, schemaYaml)
+        [
+          "input": state.output_dataset,
+          "schema": schemaYaml
+        ]
+      },
+      toState: ["output_meta": "output"]
+    )
+
+    // Reduce the state to the two workflow outputs.
+    | setState([
+      "output_dataset": "output_dataset",
+      "output_meta": "output_meta"
+    ])
+
+  emit:
+  output_ch
+}
diff --git a/src/datasets/workflows/process_nsclc_sc_zuani/test.sh b/src/datasets/workflows/process_nsclc_sc_zuani/test.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+# get the root of the directory
+REPO_ROOT=$(git rev-parse --show-toplevel)
+
+# ensure that the command below is run from the root of the repository
+cd "$REPO_ROOT"
+
+set -e
+
+# Create local output directory
+output_dir="resources/datasets"
+
+if [ ! -d "$output_dir" ]; then
+  mkdir -p "$output_dir"
+fi
+
+cat > /tmp/params.yaml << HERE
+param_list:
+  - id: process_nsclc_sc_zuani/process_nsclc_sc_zuani
+
+keep_files: false 
+
+output_dataset: "\$id/dataset.h5ad"
+output_meta: "\$id/dataset_meta.yaml"
+output_state: "\$id/state.yaml"
+publish_dir: "$output_dir"
+HERE
+
+# Run nextflow workflow locally
+nextflow run . \
+  -main-script target/nextflow/datasets/workflows/process_nsclc_sc_zuani/main.nf \
+  -params-file /tmp/params.yaml \
+  -profile docker \
+  -c common/nextflow_helpers/labels_ci.config
+3 −3		component_tests/run_and_check_output.py
+0 −21		nextflow_helpers/README.md
+0 −232		nextflow_helpers/benchmarkHelper.nf
+9 −47		nextflow_helpers/labels_tw.config
+0 −2,786		nextflow_helpers/workflowHelper.nf
+3 −3		scripts/create_component
+4 −4		scripts/create_task_readme
+3 −3		scripts/sync_resources