Skip to content

Improvements to workflows #20

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Mar 22, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions _viash.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,3 @@ repositories:
type: github
repo: openproblems-bio/openproblems
tag: build/main
- name: core
type: github
repo: openproblems-bio/core
tag: build/main
path: viash/core
18 changes: 18 additions & 0 deletions scripts/create_resources/process_datasets.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,30 @@ param_list:
- id: "mouse_brain_combined/rep1"
input_sp: "$input_dir/10x_xenium/2023_10x_mouse_brain_xenium/rep1/dataset.zarr"
input_sc: "$input_dir/allen_brain_cell_atlas/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad"
dataset_name: "Mouse brain combined 2023 tenx Xenium replicate 1 2023 Yao scRNAseq"
dataset_url: "https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard;https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE246717"
dataset_reference: "https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard;10.1038/s41586-023-06812-z"
dataset_summary: "Demonstration of gene expression profiling for fresh frozen mouse brain on the Xenium platform using the pre-designed Mouse Brain Gene Expression Panel (v1);A high-resolution scRNAseq atlas of cell types in the whole mouse brain"
dataset_description: "Demonstration of gene expression profiling for fresh frozen mouse brain on the Xenium platform using the pre-designed Mouse Brain Gene Expression Panel (v1). Replicate results demonstrate the high reproducibility of data generated by the platform. 10x Genomics obtained tissue from a C57BL/6 mouse from Charles River Laboratories. Three adjacent 10µm sections were placed on the same slide. Tissues were prepared following the demonstrated protocols Xenium In Situ for Fresh Frozen Tissues - Tissue Preparation Guide (CG000579) and Xenium In Situ for Fresh Frozen Tissues - Fixation & Permeabilization (CG000581).;See dataset_reference for more information. Note that we only took the 10xv2 data from the dataset."
dataset_organism: "mus_musculus"
- id: "mouse_brain_combined/rep2"
input_sp: "$input_dir/10x_xenium/2023_10x_mouse_brain_xenium/rep2/dataset.zarr"
input_sc: "$input_dir/allen_brain_cell_atlas/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad"
dataset_name: "Mouse brain combined 2023 tenx Xenium replicate 2 2023 Yao scRNAseq"
dataset_url: "https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard;https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE246717"
dataset_reference: "https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard;10.1038/s41586-023-06812-z"
dataset_summary: "Demonstration of gene expression profiling for fresh frozen mouse brain on the Xenium platform using the pre-designed Mouse Brain Gene Expression Panel (v1);A high-resolution scRNAseq atlas of cell types in the whole mouse brain"
dataset_description: "Demonstration of gene expression profiling for fresh frozen mouse brain on the Xenium platform using the pre-designed Mouse Brain Gene Expression Panel (v1). Replicate results demonstrate the high reproducibility of data generated by the platform. 10x Genomics obtained tissue from a C57BL/6 mouse from Charles River Laboratories. Three adjacent 10µm sections were placed on the same slide. Tissues were prepared following the demonstrated protocols Xenium In Situ for Fresh Frozen Tissues - Tissue Preparation Guide (CG000579) and Xenium In Situ for Fresh Frozen Tissues - Fixation & Permeabilization (CG000581).;See dataset_reference for more information. Note that we only took the 10xv2 data from the dataset."
dataset_organism: "mus_musculus"
- id: "mouse_brain_combined/rep3"
input_sp: "$input_dir/10x_xenium/2023_10x_mouse_brain_xenium/rep3/dataset.zarr"
input_sc: "$input_dir/allen_brain_cell_atlas/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad"
dataset_name: "Mouse brain combined 2023 tenx Xenium replicate 3 2023 Yao scRNAseq"
dataset_url: "https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard;https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE246717"
dataset_reference: "https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard;10.1038/s41586-023-06812-z"
dataset_summary: "Demonstration of gene expression profiling for fresh frozen mouse brain on the Xenium platform using the pre-designed Mouse Brain Gene Expression Panel (v1);A high-resolution scRNAseq atlas of cell types in the whole mouse brain"
dataset_description: "Demonstration of gene expression profiling for fresh frozen mouse brain on the Xenium platform using the pre-designed Mouse Brain Gene Expression Panel (v1). Replicate results demonstrate the high reproducibility of data generated by the platform. 10x Genomics obtained tissue from a C57BL/6 mouse from Charles River Laboratories. Three adjacent 10µm sections were placed on the same slide. Tissues were prepared following the demonstrated protocols Xenium In Situ for Fresh Frozen Tissues - Tissue Preparation Guide (CG000579) and Xenium In Situ for Fresh Frozen Tissues - Fixation & Permeabilization (CG000581).;See dataset_reference for more information. Note that we only took the 10xv2 data from the dataset."
dataset_organism: "mus_musculus"

output_sc: "\$id/output_sc.h5ad"
output_sp: "\$id/output_sp.zarr"
Expand Down
53 changes: 17 additions & 36 deletions scripts/create_resources/process_vizgen_merscope.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@ cat > /tmp/params.yaml << HERE
param_list:

- id: "vizgen_merscope/2022_vizgen_human_breast_cancer_merfish/rep1"
gcloud_bucket: "vz-ffpe-showcase"
dataset_bucket_name: "HumanBreastCancerPatient1"
input: "gs://vz-ffpe-showcase/HumanBreastCancerPatient1"
dataset_name: "Vizgen Human Breast Cancer MERFISH Patient1"
dataset_url: "https://info.vizgen.com/ffpe-showcase?submissionGuid=a93dbab5-c128-4269-afe3-82ea2bf9cdaf"
dataset_summary: "Human Breast Cancer data from the MERSCOPE FFPE Human Immuno-Oncology Data Release."
Expand All @@ -24,8 +23,7 @@ param_list:
segmentation_id: ["cell"]

- id: "vizgen_merscope/2022_vizgen_human_liver_cancer_merfish/rep1"
gcloud_bucket: "vz-ffpe-showcase"
dataset_bucket_name: "HumanLiverCancerPatient1"
input: "gs://vz-ffpe-showcase/HumanLiverCancerPatient1"
dataset_name: "Vizgen Human Liver Cancer MERFISH Patient1"
dataset_url: "https://info.vizgen.com/ffpe-showcase?submissionGuid=a93dbab5-c128-4269-afe3-82ea2bf9cdaf"
dataset_summary: "Human Liver Cancer data from the MERSCOPE FFPE Human Immuno-Oncology Data Release."
Expand All @@ -34,8 +32,7 @@ param_list:
segmentation_id: ["cell"]

- id: "vizgen_merscope/2022_vizgen_human_liver_cancer_merfish/rep2"
gcloud_bucket: "vz-ffpe-showcase"
dataset_bucket_name: "HumanLiverCancerPatient2"
input: "gs://vz-ffpe-showcase/HumanLiverCancerPatient2"
dataset_name: "Vizgen Human Liver Cancer MERFISH Patient2"
dataset_url: "https://info.vizgen.com/ffpe-showcase?submissionGuid=a93dbab5-c128-4269-afe3-82ea2bf9cdaf"
dataset_summary: "Human Liver Cancer data from the MERSCOPE FFPE Human Immuno-Oncology Data Release."
Expand All @@ -44,8 +41,7 @@ param_list:
segmentation_id: ["cell"]

- id: "vizgen_merscope/2022_vizgen_human_lung_cancer_merfish/rep1"
gcloud_bucket: "vz-ffpe-showcase"
dataset_bucket_name: "HumanLungCancerPatient1"
input: "gs://vz-ffpe-showcase/HumanLungCancerPatient1"
dataset_name: "Vizgen Human Lung Cancer MERFISH Patient1"
dataset_url: "https://info.vizgen.com/ffpe-showcase?submissionGuid=a93dbab5-c128-4269-afe3-82ea2bf9cdaf"
dataset_summary: "Human Lung Cancer data from the MERSCOPE FFPE Human Immuno-Oncology Data Release."
Expand All @@ -54,8 +50,7 @@ param_list:
segmentation_id: ["cell"]

- id: "vizgen_merscope/2022_vizgen_human_lung_cancer_merfish/rep2"
gcloud_bucket: "vz-ffpe-showcase"
dataset_bucket_name: "HumanLungCancerPatient2"
input: "gs://vz-ffpe-showcase/HumanLungCancerPatient2"
dataset_name: "Vizgen Human Lung Cancer MERFISH Patient2"
dataset_url: "https://info.vizgen.com/ffpe-showcase?submissionGuid=a93dbab5-c128-4269-afe3-82ea2bf9cdaf"
dataset_summary: "Human Lung Cancer data from the MERSCOPE FFPE Human Immuno-Oncology Data Release."
Expand All @@ -78,14 +73,11 @@ tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \
--config common/nextflow_helpers/labels_tw.config \
--labels datasets,vizgen_merscope



# Additional datasets that could easily be added to param_list:
# TODO: Make a decision on replicate naming (see ovarian cancer replicate that has multiple slices)

# - id: "vizgen_merscope/2022_vizgen_human_colon_cancer_merfish/rep1"
# gcloud_bucket: "vz-ffpe-showcase"
# dataset_bucket_name: "HumanColonCancerPatient1"
# input: "gs://vz-ffpe-showcase/HumanColonCancerPatient1"
# dataset_name: "2022 Vizgen Human Colon Cancer MERFISH Patient1"
# dataset_url: "https://info.vizgen.com/ffpe-showcase"
# dataset_summary: "Human Colon Cancer data from the MERSCOPE FFPE Human Immuno-Oncology Data Release."
Expand All @@ -94,8 +86,7 @@ tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \
# segmentation_id: ["cell"]

# - id: "vizgen_merscope/2022_vizgen_human_colon_cancer_merfish/rep2"
# gcloud_bucket: "vz-ffpe-showcase"
# dataset_bucket_name: "HumanColonCancerPatient2"
# input: "gs://vz-ffpe-showcase/HumanColonCancerPatient2"
# dataset_name: "2022 Vizgen Human Colon Cancer MERFISH Patient2"
# dataset_url: "https://info.vizgen.com/ffpe-showcase"
# dataset_summary: "Human Colon Cancer data from the MERSCOPE FFPE Human Immuno-Oncology Data Release."
Expand All @@ -104,8 +95,7 @@ tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \
# segmentation_id: ["cell"]

# - id: "vizgen_merscope/2022_vizgen_human_melanoma_merfish/rep1"
# gcloud_bucket: "vz-ffpe-showcase"
# dataset_bucket_name: "HumanMelanomaPatient1"
# input: "gs://vz-ffpe-showcase/HumanMelanomaPatient1"
# dataset_name: "2022 Vizgen Human Melanoma MERFISH Patient1"
# dataset_url: "https://info.vizgen.com/ffpe-showcase"
# dataset_summary: "Human Melanoma data from the MERSCOPE FFPE Human Immuno-Oncology Data Release."
Expand All @@ -114,8 +104,7 @@ tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \
# segmentation_id: ["cell"]

# - id: "vizgen_merscope/2022_vizgen_human_melanoma_merfish/rep2"
# gcloud_bucket: "vz-ffpe-showcase"
# dataset_bucket_name: "HumanMelanomaPatient2"
# input: "gs://vz-ffpe-showcase/HumanMelanomaPatient2"
# dataset_name: "2022 Vizgen Human Melanoma MERFISH Patient2"
# dataset_url: "https://info.vizgen.com/ffpe-showcase"
# dataset_summary: "Human Melanoma data from the MERSCOPE FFPE Human Immuno-Oncology Data Release."
Expand All @@ -124,8 +113,7 @@ tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \
# segmentation_id: ["cell"]

# - id: "vizgen_merscope/2022_vizgen_human_ovarian_cancer_merfish/rep1"
# gcloud_bucket: "vz-ffpe-showcase"
# dataset_bucket_name: "HumanOvarianCancerPatient1"
# input: "gs://vz-ffpe-showcase/HumanOvarianCancerPatient1"
# dataset_name: "2022 Vizgen Human Ovarian Cancer MERFISH Patient1"
# dataset_url: "https://info.vizgen.com/ffpe-showcase"
# dataset_summary: "Human Ovarian Cancer data from the MERSCOPE FFPE Human Immuno-Oncology Data Release."
Expand All @@ -135,8 +123,7 @@ tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \

# # Patient 2 has multiple slices
# - id: "vizgen_merscope/2022_vizgen_human_ovarian_cancer_merfish/rep2_slice1"
# gcloud_bucket: "vz-ffpe-showcase"
# dataset_bucket_name: "HumanOvarianCancerPatient2Slice1"
# input: "gs://vz-ffpe-showcase/HumanOvarianCancerPatient2Slice1"
# dataset_name: "2022 Vizgen Human Ovarian Cancer MERFISH Patient2 Slice1"
# dataset_url: "https://info.vizgen.com/ffpe-showcase"
# dataset_summary: "Human Ovarian Cancer data (Slice 1) from the MERSCOPE FFPE Human Immuno-Oncology Data Release."
Expand All @@ -145,8 +132,7 @@ tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \
# segmentation_id: ["cell"]

# - id: "vizgen_merscope/2022_vizgen_human_ovarian_cancer_merfish/rep2_slice2"
# gcloud_bucket: "vz-ffpe-showcase"
# dataset_bucket_name: "HumanOvarianCancerPatient2Slice2"
# input: "gs://vz-ffpe-showcase/HumanOvarianCancerPatient2Slice2"
# dataset_name: "2022 Vizgen Human Ovarian Cancer MERFISH Patient2 Slice2"
# dataset_url: "https://info.vizgen.com/ffpe-showcase"
# dataset_summary: "Human Ovarian Cancer data (Slice 2) from the MERSCOPE FFPE Human Immuno-Oncology Data Release."
Expand All @@ -155,8 +141,7 @@ tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \
# segmentation_id: ["cell"]

# - id: "vizgen_merscope/2022_vizgen_human_ovarian_cancer_merfish/rep2_slice3"
# gcloud_bucket: "vz-ffpe-showcase"
# dataset_bucket_name: "HumanOvarianCancerPatient2Slice3"
# input: "gs://vz-ffpe-showcase/HumanOvarianCancerPatient2Slice3"
# dataset_name: "2022 Vizgen Human Ovarian Cancer MERFISH Patient2 Slice3"
# dataset_url: "https://info.vizgen.com/ffpe-showcase"
# dataset_summary: "Human Ovarian Cancer data (Slice 3) from the MERSCOPE FFPE Human Immuno-Oncology Data Release."
Expand All @@ -165,8 +150,7 @@ tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \
# segmentation_id: ["cell"]

# - id: "vizgen_merscope/2022_vizgen_human_prostate_cancer_merfish/rep1"
# gcloud_bucket: "vz-ffpe-showcase"
# dataset_bucket_name: "HumanProstateCancerPatient1"
# input: "gs://vz-ffpe-showcase/HumanProstateCancerPatient1"
# dataset_name: "2022 Vizgen Human Prostate Cancer MERFISH Patient1"
# dataset_url: "https://info.vizgen.com/ffpe-showcase"
# dataset_summary: "Human Prostate Cancer data from the MERSCOPE FFPE Human Immuno-Oncology Data Release."
Expand All @@ -175,8 +159,7 @@ tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \
# segmentation_id: ["cell"]

# - id: "vizgen_merscope/2022_vizgen_human_prostate_cancer_merfish/rep2"
# gcloud_bucket: "vz-ffpe-showcase"
# dataset_bucket_name: "HumanProstateCancerPatient2"
# input: "gs://vz-ffpe-showcase/HumanProstateCancerPatient2"
# dataset_name: "2022 Vizgen Human Prostate Cancer MERFISH Patient2"
# dataset_url: "https://info.vizgen.com/ffpe-showcase"
# dataset_summary: "Human Prostate Cancer data from the MERSCOPE FFPE Human Immuno-Oncology Data Release."
Expand All @@ -185,8 +168,7 @@ tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \
# segmentation_id: ["cell"]

# - id: "vizgen_merscope/2022_vizgen_human_uterine_cancer_merfish/rep1"
# gcloud_bucket: "vz-ffpe-showcase"
# dataset_bucket_name: "HumanUterineCancerPatient1"
# input: "gs://vz-ffpe-showcase/HumanUterineCancerPatient1"
# dataset_name: "2022 Vizgen Human Uterine Cancer MERFISH Patient1"
# dataset_url: "https://info.vizgen.com/ffpe-showcase"
# dataset_summary: "Human Uterine Cancer data from the MERSCOPE FFPE Human Immuno-Oncology Data Release."
Expand All @@ -195,8 +177,7 @@ tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \
# segmentation_id: ["cell"]

# - id: "vizgen_merscope/2022_vizgen_human_uterine_cancer_merfish/rep2"
# gcloud_bucket: "vz-ffpe-showcase"
# dataset_bucket_name: "HumanUterineCancerPatient2"
# input: "gs://vz-ffpe-showcase/HumanUterineCancerPatient2"
# dataset_name: "2022 Vizgen Human Uterine Cancer MERFISH Patient2"
# dataset_url: "https://info.vizgen.com/ffpe-showcase"
# dataset_summary: "Human Uterine Cancer data from the MERSCOPE FFPE Human Immuno-Oncology Data Release."
Expand Down
23 changes: 12 additions & 11 deletions scripts/create_test_resources/test_pipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -75,17 +75,18 @@ viash run src/metrics/similarity/config.vsh.yaml -- \

# Create a state file listing the pipeline outputs (note: `>>` appends to an existing state.yaml)
cat >> $OUT_DIR/state.yaml <<EOL
output_sp: $OUT_DIR/raw_ist.zarr
output_sc: $OUT_DIR/scrnaseq_reference.h5ad
output_segmentation: $OUT_DIR/segmentation.zarr
output_transcript_assignments: $OUT_DIR/transcript_assignments.zarr
output_spatial_aggregated_counts: $OUT_DIR/spatial_aggregated_counts.h5ad
output_cell_volumes: $OUT_DIR/cell_volumes.h5ad
output_spatial_normalized_counts: $OUT_DIR/spatial_normalized_counts.h5ad
output_spatial_with_cell_types: $OUT_DIR/spatial_with_cell_types.h5ad
output_spatial_corrected_counts: $OUT_DIR/spatial_corrected_counts.h5ad
output_spatial_qc_col: $OUT_DIR/spatial_qc_col.h5ad
output_score: $OUT_DIR/score.h5ad
id: mouse_brain_combined
output_sp: !file raw_ist.zarr
output_sc: !file scrnaseq_reference.h5ad
output_segmentation: !file segmentation.zarr
output_transcript_assignments: !file transcript_assignments.zarr
output_spatial_aggregated_counts: !file spatial_aggregated_counts.h5ad
output_cell_volumes: !file cell_volumes.h5ad
output_spatial_normalized_counts: !file spatial_normalized_counts.h5ad
output_spatial_with_cell_types: !file spatial_with_cell_types.h5ad
output_spatial_corrected_counts: !file spatial_corrected_counts.h5ad
output_spatial_qc_col: !file spatial_qc_col.h5ad
output_score: !file score.h5ad
EOL

# sync test resources
Expand Down
78 changes: 55 additions & 23 deletions src/api/comp_data_preprocessor.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
namespace: data_processors
info:
type: data_processor
type_info:
Expand All @@ -7,25 +6,58 @@ info:
description: |
This component processes a common single-cell and a common spatial transcriptomics
dataset for the benchmark.
arguments:
- name: "--input_sp"
__merge__: file_common_ist.yaml
direction: input
required: true
- name: "--input_sc"
__merge__: file_common_scrnaseq.yaml
direction: input
required: true
- name: "--output_sp"
__merge__: file_raw_ist.yaml
direction: output
required: true
- name: "--output_sc"
__merge__: file_scrnaseq_reference.yaml
direction: output
required: true
test_resources:
- path: /resources_test/common
dest: resources_test/common
- type: python_script
path: /common/component_tests/run_and_check_output.py

# Command-line interface of the data preprocessor API, grouped by purpose.
# Entries with `__merge__` pull the rest of their definition from the named file.
argument_groups:
  # Common-format datasets consumed by the component.
  - name: Inputs
    arguments:
      # Spatial transcriptomics dataset.
      - name: "--input_sp"
        __merge__: file_common_ist.yaml
        required: true
        direction: input
      # Single-cell dataset.
      - name: "--input_sc"
        __merge__: file_common_scrnaseq.yaml
        required: true
        direction: input

  # Processed datasets produced by the component; `$id` is left literal here
  # and is expected to be substituted per run.
  - name: Outputs
    arguments:
      # The default uses the .zarr extension to match how callers name this
      # output (scripts/create_resources/process_datasets.sh passes
      # `$id/output_sp.zarr`; the test state file points at `raw_ist.zarr`).
      # NOTE(review): presumably a Zarr store -- confirm against file_raw_ist.yaml.
      - name: "--output_sp"
        __merge__: file_raw_ist.yaml
        required: true
        direction: output
        default: "$id/output_sp.zarr"
      - name: "--output_sc"
        __merge__: file_scrnaseq_reference.yaml
        required: true
        direction: output
        default: "$id/output_sc.h5ad"

  # Free-text metadata describing the combined (spatial + single-cell) dataset.
  # NOTE(review): process_datasets.sh passes two values joined by ';' for
  # url/reference/summary/description -- confirm consumers expect that format.
  - name: Combined Dataset Metadata
    description: Metadata for the combined dataset that will be stored.
    arguments:
      - name: "--dataset_id"
        type: string
        description: "A unique identifier for the dataset"
        required: true
      - name: "--dataset_name"
        type: string
        description: Nicely formatted name.
        required: true
      - name: "--dataset_url"
        type: string
        description: Link to the original source of the dataset.
        required: false
      # NOTE(review): callers in this repository pass a URL/DOI here rather
      # than a BibTeX entry -- confirm which one is intended.
      - name: "--dataset_reference"
        type: string
        description: Bibtex reference of the paper in which the dataset was published.
        required: false
      - name: "--dataset_summary"
        type: string
        description: Short description of the dataset.
        required: true
      - name: "--dataset_description"
        type: string
        description: Long description of the dataset.
        required: true
      - name: "--dataset_organism"
        type: string
        description: The organism of the sample in the dataset.
        required: false
Loading