From 90874b3621da86e978d6a14003a8ce113f9eb90d Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Thu, 25 Jul 2024 10:17:33 -0700 Subject: [PATCH 1/4] Demo prefilter rule for Nextstrain GISAID build Adds a prefilter rule to reduce the size of the input metadata for the GISAID build before running the whole workflow. --- .../nextstrain-gisaid/builds.yaml | 3 ++- workflow/snakemake_rules/prefilter.smk | 26 +++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 workflow/snakemake_rules/prefilter.smk diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml index d80b7f521..ddd8da201 100644 --- a/nextstrain_profiles/nextstrain-gisaid/builds.yaml +++ b/nextstrain_profiles/nextstrain-gisaid/builds.yaml @@ -2,6 +2,7 @@ auspice_json_prefix: ncov_gisaid # Define custom rules for pre- or post-standard workflow processing of data. custom_rules: + - workflow/snakemake_rules/prefilter.smk - workflow/snakemake_rules/export_for_nextstrain.smk # These parameters are only used by the `export_for_nextstrain` rule and shouldn't need to be modified. 
@@ -25,7 +26,7 @@ files: inputs: - name: gisaid - metadata: "s3://nextstrain-ncov-private/metadata.tsv.zst" + metadata: "data/prefiltered_metadata.tsv" aligned: "s3://nextstrain-ncov-private/aligned.fasta.zst" skip_sanitize_metadata: true diff --git a/workflow/snakemake_rules/prefilter.smk b/workflow/snakemake_rules/prefilter.smk new file mode 100644 index 000000000..d66f009a9 --- /dev/null +++ b/workflow/snakemake_rules/prefilter.smk @@ -0,0 +1,26 @@ +rule download_metadata: + params: + metadata_url="s3://nextstrain-ncov-private/metadata.tsv.zst", + output: + metadata="data/metadata.tsv.zst", + shell: + """ + aws s3 cp {params.metadata_url} {output.metadata} + """ + +rule filter_metadata: + input: + metadata="data/metadata.tsv.zst", + output: + metadata="data/prefiltered_metadata.tsv", + params: + max_sequences=500000, + group_by="division year month", + shell: + """ + augur filter \ + --metadata {input.metadata} \ + --subsample-max-sequences {params.max_sequences} \ + --group-by {params.group_by} \ + --output-metadata {output.metadata} + """ From ce86db21c908ef33f0b22bd61b1c8944e3ab61f9 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Fri, 15 Nov 2024 18:04:28 -0800 Subject: [PATCH 2/4] =?UTF-8?q?=F0=9F=9A=A7=20output=20group=20size=20tabl?= =?UTF-8?q?e=20when=20using=20weighted=20sampling?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This makes it easier to inspect the effect of a prefilter rule. 
--- workflow/snakemake_rules/main_workflow.smk | 3 +++ 1 file changed, 3 insertions(+) diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index 4b3f92f12..ddafd8652 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -297,6 +297,8 @@ rule subsample: params: group_by = _get_specific_subsampling_setting("group_by", optional=True), group_by_weights = _get_specific_subsampling_setting("group_by_weights", optional=True), + # Emit the group sizes table only when this subsample configures group_by_weights; otherwise pass nothing. + output_group_by_sizes = lambda wildcards: f"--output-group-by-sizes results/{wildcards.build_name}/sizes-{wildcards.subsample}.tsv" if _get_subsampling_settings(wildcards).get("group_by_weights", False) else "", sequences_per_group = _get_specific_subsampling_setting("seq_per_group", optional=True), subsample_max_sequences = _get_specific_subsampling_setting("max_sequences", optional=True), sampling_scheme = _get_specific_subsampling_setting("sampling_scheme", optional=True), @@ -330,6 +332,7 @@ rule subsample: {params.sequences_per_group} \ {params.subsample_max_sequences} \ {params.sampling_scheme} \ + {params.output_group_by_sizes} \ --output-strains {output.strains} 2>&1 | tee {log} """ From 3dc75f1fdcedcf807670d5d7a16185283c2b7386 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Mon, 18 Nov 2024 14:28:14 -0800 Subject: [PATCH 3/4] =?UTF-8?q?=F0=9F=9A=A7=20Use=20raw,=20triple-quoted?= =?UTF-8?q?=20shell=20blocks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- workflow/snakemake_rules/prefilter.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/snakemake_rules/prefilter.smk b/workflow/snakemake_rules/prefilter.smk index d66f009a9..9d4f9aa3f 100644 --- a/workflow/snakemake_rules/prefilter.smk +++ b/workflow/snakemake_rules/prefilter.smk @@ -4,7 +4,7 @@ rule download_metadata: 
output: metadata="data/metadata.tsv.zst", shell: - """ + r""" aws s3 cp {params.metadata_url} {output.metadata} """ @@ -17,7 +17,7 @@ rule filter_metadata: max_sequences=500000, group_by="division year month", shell: - """ + r""" augur filter \ --metadata {input.metadata} \ --subsample-max-sequences {params.max_sequences} \ From c8bb915ac2e7096af57c59fab43c0822108fffac Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Mon, 18 Nov 2024 14:30:48 -0800 Subject: [PATCH 4/4] =?UTF-8?q?=F0=9F=9A=A7=20include=20root=20in=20prefil?= =?UTF-8?q?ter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- workflow/snakemake_rules/prefilter.smk | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflow/snakemake_rules/prefilter.smk b/workflow/snakemake_rules/prefilter.smk index 9d4f9aa3f..d07f65e54 100644 --- a/workflow/snakemake_rules/prefilter.smk +++ b/workflow/snakemake_rules/prefilter.smk @@ -11,6 +11,7 @@ rule download_metadata: rule filter_metadata: input: metadata="data/metadata.tsv.zst", + include=config["files"]["include"], output: metadata="data/prefiltered_metadata.tsv", params: @@ -21,6 +22,7 @@ rule filter_metadata: augur filter \ --metadata {input.metadata} \ --subsample-max-sequences {params.max_sequences} \ + --include {input.include} \ --group-by {params.group_by} \ --output-metadata {output.metadata} """