-
Notifications
You must be signed in to change notification settings - Fork 3
Dataset sc lung #27
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Dataset sc lung #27
Changes from all commits
3c53e8c
94a85e8
ba0c4dd
7a7f359
7390de3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
+3 −3 | component_tests/run_and_check_output.py | |
+0 −21 | nextflow_helpers/README.md | |
+0 −232 | nextflow_helpers/benchmarkHelper.nf | |
+9 −47 | nextflow_helpers/labels_tw.config | |
+0 −2,786 | nextflow_helpers/workflowHelper.nf | |
+3 −3 | scripts/create_component | |
+4 −4 | scripts/create_task_readme | |
+3 −3 | scripts/sync_resources |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
# Viash component config for the loader of the NSCLC single-cell dataset
# (ArrayExpress accession E-MTAB-13526).
name: nsclc_sc_zuani
namespace: datasets/loaders

argument_groups:
  # Descriptive metadata; every field is optional and ships with a default.
  - name: Metadata
    arguments:
      - name: --dataset_id
        type: string
        description: "A unique identifier for the dataset"
        required: false
        default: "E-MTAB-13526"
      - name: --dataset_name
        type: string
        description: "Nicely formatted name."
        required: false
        default: "E-MTAB-13526"
      - name: --dataset_url
        type: string
        description: "Link to the original source of the dataset."
        required: false
        default: "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526"
      - name: --dataset_reference
        type: string
        description: "Bibtex reference of the paper in which the dataset was published."
        required: false
        default: "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526"
      - name: --dataset_summary
        type: string
        description: "Short description of the dataset."
        required: false
        default: "We performed single cell RNA sequencing (scRNA-seq) of NSCLC tumours and matched, adjacent, non-involved lung tissue from 24 patients. The data set is composed of approximately 900,000 cells from two different populations: CD235- (haematopoietic and non-haematopoietic cells depleted of erythrocytes), and CD45+ (all haematopoietic cells)."
      - name: --dataset_description
        type: string
        description: "Long description of the dataset."
        required: false
        default: "We performed single cell RNA sequencing (scRNA-seq) of NSCLC tumours and matched, adjacent, non-involved lung tissue from 24 patients. The data set is composed of approximately 900,000 cells from two different populations: CD235- (haematopoietic and non-haematopoietic cells depleted of erythrocytes), and CD45+ (all haematopoietic cells)."
      - name: --dataset_organism
        type: string
        description: "The organism of the sample in the dataset."
        required: false
        default: "human"
  # The single output file; its format is merged in from the shared API spec.
  - name: Outputs
    arguments:
      - name: "--output"
        __merge__: /src/api/file_common_scrnaseq.yaml
        direction: output
        required: true

resources:
  - type: python_script
    path: script.py

engines:
  - type: docker
    image: openproblems/base_python:1.0.0
    __merge__: [/src/base/setup_txsim_partial.yaml]
  - type: native

runners:
  - type: executable
  - type: nextflow
    directives:
      label:
        - veryhighmem
        - midcpu
        - midtime
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,81 @@ | ||||||||||||||||||||||||||||
from pathlib import Path | ||||||||||||||||||||||||||||
import pandas as pd | ||||||||||||||||||||||||||||
import numpy as np | ||||||||||||||||||||||||||||
from collections import defaultdict | ||||||||||||||||||||||||||||
import anndata as ad | ||||||||||||||||||||||||||||
import os | ||||||||||||||||||||||||||||
import sys | ||||||||||||||||||||||||||||
import scipy.sparse | ||||||||||||||||||||||||||||
import scanpy | ||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||
## VIASH START

# Stand-in argument values for running this script on its own.
# NOTE(review): presumably everything between the VIASH START/END markers is
# substituted by Viash when the component is built - confirm.
par = dict(
    output="EMTAB13526_Lung_sc.h5ad",
    dataset_id="E-MTAB-13526",
    dataset_name="E-MTAB-13526",
    dataset_url="https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526",
    dataset_reference="https://doi.org/10.1038/s41467-024-48700-8",
    dataset_summary="This dataset contains scRNA-seq data from human lung cancer cells.",
    dataset_description="This dataset contains scRNA-seq data from human lung cancer cells.",
    dataset_organism="Homo sapiens",
)

# Scratch directory used for the downloaded input file.
meta = dict(temp_dir="./tmp/nsclc_sc_zuani/")

## VIASH END
|
||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||
# Public location of the full annotated object, kept for reference (not used):
#os.system(f'wget "ftp://ftp.ebi.ac.uk/biostudies/fire/E-MTAB-/526/E-MTAB-13526/Files/10X_Lung_Tumour_Annotated_v2.h5ad" -P {meta["temp_dir"]}')
#adata = ad.read_h5ad( f'{meta["temp_dir"]}10X_Lung_Tumour_Annotated_v2.h5ad', backed="r")

# NOTE(review): this address is on a private network (192.168.x.x) and cannot
# be reached from CI or from other machines. It must be replaced by a public
# URL or by an input-file argument before this component can run elsewhere.
SOURCE_URL = "http://192.168.2.46:8000/file/cropped_sc.h5ad"

# Scratch directory: the configured temp dir, falling back to ./tmp.
TMP_DIR = Path(meta["temp_dir"] or "./tmp")
TMP_DIR.mkdir(parents=True, exist_ok=True)
FILE_PATHS = {"file": TMP_DIR / "cropped_sc.h5ad"}

# Download into the scratch directory and read back from that same path.
# `-O` overwrites an existing file; `-P` would write `cropped_sc.h5ad.1` on a
# re-run and leave the stale copy to be loaded.
input_file = FILE_PATHS["file"]
exit_status = os.system(f'wget "{SOURCE_URL}" -O "{input_file}"')
if exit_status != 0:
    # Fail here rather than with a confusing read error further down.
    raise RuntimeError(
        f"Download of {SOURCE_URL} failed (wget exit status {exit_status})"
    )
adata = ad.read_h5ad(input_file)
|
||||||||||||||||||||||||||||
# Drop genes that have no counts in any cell. Summing on the matrix itself
# avoids building a dense cells x genes copy; np.asarray(...).ravel() flattens
# the 1 x n_genes result that scipy sparse matrices return from sum(axis=0).
genes_sum = np.asarray(adata.X.sum(axis=0)).ravel()
# Take an explicit copy so the obs assignment below does not act on a view.
adata = adata[:, genes_sum != 0].copy()

# Mapping of target obs column name -> column name in the source file.
rename_obs_keys = {"cell_type": "Cell types"}
adata.obs = adata.obs.rename(
    columns={old: new for new, old in rename_obs_keys.items()}
)
|
||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||
# Dataset-wide annotations. Only the keys listed in the loop below are written
# to obs; the remaining entries (assay, sex, disease) are kept for reference.
store_info = {
    # Taken from the component argument so obs agrees with the requested id
    # instead of a hard-coded accession.
    "dataset_id": par["dataset_id"],
    "assay": "Unknown",
    "sex": "female, male",
    "tissue": "lung",
    "disease": "lung adenocarcinoma, normal, non-small cell lung cancer, lung squamous cell carcinoma",
    "organism": "Homo sapiens",
    "tissue_general": "lung",
    "development_stage": "adult",
}
for key in ["dataset_id", "tissue", "organism", "tissue_general", "development_stage"]:
    # One constant value per cell, stored as a single-category categorical.
    value = store_info[key]
    adata.obs[key] = pd.Categorical([value] * adata.n_obs, categories=[value])
|
||||||||||||||||||||||||||||
# Dataset-level metadata is read from the component arguments (`par`) rather
# than hard-coded, so values passed on the command line or by the workflow
# actually end up in the output file.
uns_info = {
    key: par[key]
    for key in [
        "dataset_id",
        "dataset_name",
        "dataset_url",
        "dataset_reference",
        "dataset_summary",
        "dataset_description",
        "dataset_organism",
    ]
}

for key, value in uns_info.items():
    adata.uns[key] = value
Comment on lines
+63
to
+73
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Values like 'dataset_id' and 'dataset_name' are arguments, but they are also being hardcoded here. These values should be retrieved from `par` and should be passed in by the dataset script.
Suggested change
make sure to add a script similar to this one: https://github.com/openproblems-bio/task_ist_preprocessing/blob/ea67087326ae00912e0006d1f643d990576ed414/scripts/create_resources/process_10x_xenium.sh |
||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||
# Copy the var index into an explicit gene_symbol column.
gene_names = adata.var_names
adata.var["gene_symbol"] = gene_names

# Store the expression matrix as a dense array in the "counts" layer, then
# remove X from the object.
dense_counts = adata.X.toarray()
adata.layers["counts"] = dense_counts
del adata.X

# Write the result, gzip-compressed, to the path given by the output argument.
print("Writing adata", flush=True)
output_path = par["output"]
adata.write_h5ad(output_path, compression="gzip")
Original file line number | Diff line number | Diff line change | ||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,86 @@ | ||||||||||||||||||||
name: process_nsclc_sc_zuani | ||||||||||||||||||||
namespace: datasets/workflows | ||||||||||||||||||||
|
||||||||||||||||||||
argument_groups: | ||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we add a section for the input file?
Suggested change
|
||||||||||||||||||||
# Controls what happens to downloaded files once processing is done.
# NOTE(review): main.nf does not currently read this argument.
- name: Caching settings
  arguments:
    - name: --keep_files
      type: boolean
      required: false
      description: Whether to keep the downloaded files after processing.
      default: false
# Descriptive metadata forwarded to the loader. Every field has a default, so
# none is marked as required (this matches the loader component's config).
- name: Metadata
  arguments:
    - name: --dataset_id
      type: string
      description: "A unique identifier for the dataset"
      required: false
      default: "E-MTAB-13526"
    - name: --dataset_name
      type: string
      description: Nicely formatted name.
      required: false
      default: "E-MTAB-13526"
    - name: --dataset_url
      type: string
      description: Link to the original source of the dataset.
      required: false
      default: "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526"
    - name: --dataset_reference
      type: string
      description: Bibtex reference of the paper in which the dataset was published.
      required: false
      default: "https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-13526"
    - name: --dataset_summary
      type: string
      description: Short description of the dataset.
      required: false
      default: "We performed single cell RNA sequencing (scRNA-seq) of NSCLC tumours and matched, adjacent, non-involved lung tissue from 24 patients. The data set is composed of approximately 900,000 cells from two different populations: CD235- (haematopoietic and non-haematopoietic cells depleted of erythrocytes), and CD45+ (all haematopoietic cells)."
    - name: --dataset_description
      type: string
      description: Long description of the dataset.
      required: false
      default: "We performed single cell RNA sequencing (scRNA-seq) of NSCLC tumours and matched, adjacent, non-involved lung tissue from 24 patients. The data set is composed of approximately 900,000 cells from two different populations: CD235- (haematopoietic and non-haematopoietic cells depleted of erythrocytes), and CD45+ (all haematopoietic cells)."
    - name: --dataset_organism
      type: string
      description: The organism of the sample in the dataset.
      required: false
      default: "human"
# Workflow outputs. The dataset argument is named `output_dataset` because
# main.nf looks up its schema under that name (findArgumentSchema) and emits
# it under that key (setState); the resource-creation script passes
# `output_dataset` as well.
- name: Outputs
  arguments:
    - name: "--output_dataset"
      __merge__: /src/api/file_common_scrnaseq.yaml
      direction: output
      required: true
      default: "$id/dataset.h5ad"
    - name: "--output_meta"
      direction: "output"
      type: file
      description: "Dataset metadata"
      default: "$id/dataset_meta.yaml"
|
||||||||||||||||||||
# Workflow entry point plus the shared Nextflow helper it includes.
resources:
  - type: nextflow_script
    path: main.nf
    entrypoint: run_wf
  - path: /common/nextflow_helpers/helper.nf

# Components invoked by main.nf. The loader has no `repository` entry; the
# others are taken from the `openproblems` and `core` repositories.
dependencies:
  - name: datasets/loaders/nsclc_sc_zuani
  # - name: datasets/processors/subsample
  #   repository: openproblems
  - name: datasets/normalization/log_cp
    repository: openproblems
  - name: datasets/processors/pca
    repository: openproblems
  - name: datasets/processors/hvg
    repository: openproblems
  - name: datasets/processors/knn
    repository: openproblems
  - name: h5ad/extract_uns_metadata
    repository: core

runners:
  - type: nextflow
    directives:
      label:
        - midcpu
        - midmem
        - hightime
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
def resources_dir = params.resources_dir | ||
def config = params.config | ||
|
||
include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" | ||
|
||
// Pipes the result of findStates(params, meta.config) into this workflow's
// own run() with state publishing switched on.
workflow auto {
  findStates(params, meta.config) | meta.workflow.run(auto: [publish: "state"])
}
|
||
// Main workflow: runs the loader, then the log_cp, hvg, pca and knn
// components in sequence, extracts the uns metadata of the final dataset and
// emits `output_dataset` and `output_meta`.
workflow run_wf {
  take:
  input_ch

  main:
  output_ch = input_ch

    // copy id to the state
    | map{ id, state ->
      def new_state = state + [dataset_id: id]
      [id, new_state]
    }

    // Load the raw dataset. Only arguments that the loader declares are
    // forwarded ("cancer_subtypes" is not one of them and was dropped).
    | nsclc_sc_zuani.run(
      fromState: [
        "dataset_id",
        "dataset_name",
        "dataset_url",
        "dataset_reference",
        "dataset_summary",
        "dataset_description",
        "dataset_organism",
      ],
      toState: [
        "output_raw": "output"
      ]
    )

    | log_cp.run(
      key: "log_cp10k",
      fromState: [
        "input": "output_raw"
      ],
      args: [
        "normalization_id": "log_cp10k",
        "n_cp": 10000
      ],
      toState: [
        "output_normalized": "output"
      ]
    )

    | hvg.run(
      fromState: ["input": "output_normalized"],
      toState: ["output_hvg": "output"]
    )

    | pca.run(
      fromState: ["input": "output_hvg"],
      toState: ["output_pca": "output"]
    )

    | knn.run(
      fromState: ["input": "output_pca"],
      toState: ["output_knn": "output"]
    )

    // add synonym
    | map{ id, state ->
      [id, state + [output_dataset: state.output_knn]]
    }

    | extract_uns_metadata.run(
      fromState: { id, state ->
        // Write the schema of the `output_dataset` argument to a temporary
        // YAML file and pass it to the metadata extractor.
        def schema = findArgumentSchema(meta.config, "output_dataset")
        // workaround: convert GString to String
        schema = iterateMap(schema, { it instanceof GString ? it.toString() : it })
        def schemaYaml = tempFile("schema.yaml")
        writeYaml(schema, schemaYaml)
        [
          "input": state.output_dataset,
          "schema": schemaYaml
        ]
      },
      toState: ["output_meta": "output"]
    )

    // Keep only the two published outputs in the final state.
    | setState([
      "output_dataset": "output_dataset",
      "output_meta": "output_meta"
    ])

  emit:
  output_ch
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
#!/bin/bash

# Builds the processed nsclc_sc_zuani dataset locally by running the
# process_nsclc_sc_zuani Nextflow workflow with the docker profile.

# Abort on the first failing command. This is set before anything else so
# that a failing `git rev-parse` or `cd` stops the script instead of letting
# it continue from the wrong directory.
set -e

# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)

# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"

# Create local output directory (mkdir -p is a no-op if it already exists)
output_dir="resources/datasets"
mkdir -p "$output_dir"

# Parameter file for the workflow. `\$id` is escaped so that a literal `$id`
# reaches the file, whereas `$output_dir` is expanded by the shell here.
cat > /tmp/params.yaml << HERE
param_list:
  - id: process_nsclc_sc_zuani/process_nsclc_sc_zuani

keep_files: false

output_dataset: "\$id/dataset.h5ad"
output_meta: "\$id/dataset_meta.yaml"
output_state: "\$id/state.yaml"
publish_dir: "$output_dir"
HERE

# Run nextflow workflow locally
nextflow run . \
  -main-script target/nextflow/datasets/workflows/process_nsclc_sc_zuani/main.nf \
  -params-file /tmp/params.yaml \
  -profile docker \
  -c common/nextflow_helpers/labels_ci.config
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This code is pointing to local endpoints, which unfortunately won't work.