# Patch summary (preamble text placed ahead of the first `diff --git` header).
#
# Files touched by this patch:
#   1. nextstrain_profiles/nextstrain-gisaid/builds.yaml
#        - adds workflow/snakemake_rules/prefilter.smk to `custom_rules`, ahead of
#          export_for_nextstrain.smk;
#        - switches the `gisaid` input's `metadata` from
#          s3://nextstrain-ncov-private/metadata.tsv.zst to the local file
#          data/prefiltered_metadata.tsv (the `aligned` input is left on S3).
#   2. workflow/snakemake_rules/main_workflow.smk (rule `subsample`)
#        - adds the param `output_group_by_weights`: a lambda that yields
#          "--output-group-by-sizes results/<build_name>/sizes-<subsample>.tsv" when
#          the settings returned by _get_subsampling_settings(wildcards) have a
#          truthy "group_by_weights" entry, and an empty string otherwise;
#        - inserts {params.output_group_by_weights} into the shell command just
#          before --output-strains.
#        NOTE(review): the param is named "..._weights" while the flag it emits is
#        "--output-group-by-sizes" -- confirm the naming is intentional.
#        NOTE(review): these hunks do not show the sizes TSV being declared under
#        the rule's `output:` -- verify against the full rule.
#   3. workflow/snakemake_rules/prefilter.smk (new file, mode 100644)
#        - rule `download_metadata`: runs `aws s3 cp` from
#          s3://nextstrain-ncov-private/metadata.tsv.zst to data/metadata.tsv.zst;
#        - rule `filter_metadata`: runs `augur filter` on data/metadata.tsv.zst with
#          --subsample-max-sequences 500000, --group-by "division year month" and
#          --include config["files"]["include"], writing data/prefiltered_metadata.tsv
#          (the path builds.yaml now points at).
#
# NOTE(review): the line breaks of this patch appear to have been collapsed into
# spaces; restore the original newlines and indentation before trying to apply it.
diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml index d80b7f521..ddd8da201 100644 --- a/nextstrain_profiles/nextstrain-gisaid/builds.yaml +++ b/nextstrain_profiles/nextstrain-gisaid/builds.yaml @@ -2,6 +2,7 @@ auspice_json_prefix: ncov_gisaid # Define custom rules for pre- or post-standard workflow processing of data. custom_rules: + - workflow/snakemake_rules/prefilter.smk - workflow/snakemake_rules/export_for_nextstrain.smk # These parameters are only used by the `export_for_nextstrain` rule and shouldn't need to be modified. @@ -25,7 +26,7 @@ files: inputs: - name: gisaid - metadata: "s3://nextstrain-ncov-private/metadata.tsv.zst" + metadata: "data/prefiltered_metadata.tsv" aligned: "s3://nextstrain-ncov-private/aligned.fasta.zst" skip_sanitize_metadata: true diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index 4b3f92f12..ddafd8652 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -297,6 +297,8 @@ rule subsample: params: group_by = _get_specific_subsampling_setting("group_by", optional=True), group_by_weights = _get_specific_subsampling_setting("group_by_weights", optional=True), + # only set this if using group_by_weights + output_group_by_weights = lambda wildcards: f"--output-group-by-sizes results/{wildcards.build_name}/sizes-{wildcards.subsample}.tsv" if _get_subsampling_settings(wildcards).get("group_by_weights", False) else "", sequences_per_group = _get_specific_subsampling_setting("seq_per_group", optional=True), subsample_max_sequences = _get_specific_subsampling_setting("max_sequences", optional=True), sampling_scheme = _get_specific_subsampling_setting("sampling_scheme", optional=True), @@ -330,6 +332,7 @@ rule subsample: {params.sequences_per_group} \ {params.subsample_max_sequences} \ {params.sampling_scheme} \ + {params.output_group_by_weights} \ --output-strains 
{output.strains} 2>&1 | tee {log} """ diff --git a/workflow/snakemake_rules/prefilter.smk b/workflow/snakemake_rules/prefilter.smk new file mode 100644 index 000000000..d07f65e54 --- /dev/null +++ b/workflow/snakemake_rules/prefilter.smk @@ -0,0 +1,28 @@ +rule download_metadata: + params: + metadata_url="s3://nextstrain-ncov-private/metadata.tsv.zst", + output: + metadata="data/metadata.tsv.zst", + shell: + r""" + aws s3 cp {params.metadata_url} {output.metadata} + """ + +rule filter_metadata: + input: + metadata="data/metadata.tsv.zst", + include = config["files"]["include"], + output: + metadata="data/prefiltered_metadata.tsv", + params: + max_sequences=500000, + group_by="division year month", + shell: + r""" + augur filter \ + --metadata {input.metadata} \ + --subsample-max-sequences {params.max_sequences} \ + --include {input.include} \ + --group-by {params.group_by} \ + --output-metadata {output.metadata} + """