diff --git a/workflow/resources/datavzrd/fp-fn-config.yte.yaml b/workflow/resources/datavzrd/fp-fn-config.yte.yaml index 1feac08..ff75829 100644 --- a/workflow/resources/datavzrd/fp-fn-config.yte.yaml +++ b/workflow/resources/datavzrd/fp-fn-config.yte.yaml @@ -31,10 +31,14 @@ views: ?view if view == "main" else f"by {view}": ?if view == "main": desc: | + ?f""" The values 0/0, 0/1, and 1/1 represent the genotype (not present, heterozygous, homozygous). If a callset does not have an entry, this means that the variant matches the truth. Rows and columns are sorted according to an unsupervised hiearchical clustering with hamming distance and ward linkage. + + Benchmark version: {wildcards.genome} {params.version} + """ ?else: desc: | ?f""" @@ -46,6 +50,8 @@ views: The values 0/0, 0/1, and 1/1 represent the genotype (not present, heterozygous, homozygous). If a callset does not have an entry, this means that the variant matches the truth. + + Benchmark version: {wildcards.genome} {params.version} """ dataset: ?view page-size: 12 diff --git a/workflow/resources/datavzrd/precision-recall-config.yte.yaml b/workflow/resources/datavzrd/precision-recall-config.yte.yaml index d5ee2fb..23d2a3e 100644 --- a/workflow/resources/datavzrd/precision-recall-config.yte.yaml +++ b/workflow/resources/datavzrd/precision-recall-config.yte.yaml @@ -22,6 +22,7 @@ views: results-plot: dataset: results desc: | + ?f""" Precision and recall are calculated by matching variants between each callset and truth, stratified by coverage categories. The matching ignores genotype differences. Instead, genotype mismatches are displayed in the "genotype mismatch rate" @@ -32,6 +33,9 @@ views: are not shown because estimates are unreliable in such cases. Zoom in by scrolling inside of the plot area, pan by dragging, and reset by double-clicking. + + Benchmark version: {params.genome} {params.version} + """ render-plot: spec: | { @@ -71,8 +75,12 @@ views: dataset: results ?if params.vaf: desc: | + ?f""" Precision and recall are calculated by matching variants between each callset and truth, stratified by coverage categories. Stratified by VAF. + + Benchmark version: {params.genome} {params.version} + """ page-size: 12 render-table: columns: @@ -121,7 +129,7 @@ views: - "#6baed6" vaf: plot: - heatmap: + heatmap: scale: linear custom-content: function(value, row) { @@ -152,10 +160,14 @@ views: - "#1a833fff" ?else: desc: | + ?f""" Precision and recall are calculated by matching variants between each callset and truth, stratified by coverage categories. The matching ignores genotype differences. Instead, genotype mismatches are displayed in the "genotype mismatch rate" column. + + Benchmark version: {params.genome} {params.version} + """ page-size: 12 render-table: columns: diff --git a/workflow/resources/presets.yaml b/workflow/resources/presets.yaml index 0ec9ea9..fcab238 100644 --- a/workflow/resources/presets.yaml +++ b/workflow/resources/presets.yaml @@ -50,6 +50,7 @@ benchmarks: genomes: na12878: + version: 4.2.1 truth: grch37: https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv4.2.1/GRCh37/HG001_GRCh37_1_22_v4.2.1_benchmark.vcf.gz grch38: https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv4.2.1/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz @@ -58,6 +59,7 @@ genomes: grch38: https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv4.2.1/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.bed chm-eval: + version: 0.5 archive: https://github.com/lh3/CHM-eval/releases/download/v0.5/CHM-evalkit-20180222.tar truth: grch38: full.38.vcf.gz @@ -67,6 +69,7 @@ genomes: grch37: full.37m.bed.gz seqc2-somatic: + version: 1.2 truth: grch38: snvs: https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/seqc/Somatic_Mutation_WG/release/latest/high-confidence_sSNV_in_HC_regions_v1.2.vcf.gz @@ -76,6 +79,7 @@ genomes: somatic: true na12878-somatic: + version: 4.2.1 truth: grch38: https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv4.2.1/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz confidence-regions: diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 382838d..d46c409 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -93,7 +93,7 @@ def get_plot_cov_labels(): # TODO check if ever used anywhere def label(name): lower, upper = get_cov_interval(name) if upper: - return f"{lower}-{upper-1}" + return f"{lower}-{upper - 1}" return f"≥{lower}" return {name: label(name) for name in low_coverages} @@ -490,6 +490,20 @@ def get_collect_precision_recall_input(wildcards): ) +def get_genome_name(wildcards): + if hasattr(wildcards, "benchmark"): + return get_benchmark(wildcards.benchmark).get("genome") + if hasattr(wildcards, "callset"): + benchmark = config["variant-calls"][wildcards.callset]["benchmark"] + return get_benchmark(benchmark).get("genome") + else: + return wildcards.genome + + +def get_genome_version(wildcards): + return genomes[get_genome_name(wildcards)].get("version") + + def get_genome_callsets(genome): return sorted( callset diff --git a/workflow/rules/eval.smk b/workflow/rules/eval.smk index 30db27d..e99b03a 100644 --- a/workflow/rules/eval.smk +++ b/workflow/rules/eval.smk @@ -273,7 +273,10 @@ rule report_precision_recall: directory("results/report/precision-recall/{benchmark}/{vartype}"), htmlindex="index.html", category="precision/recall", - labels={"benchmark": "{benchmark}", "vartype": "{vartype}"}, + labels={ + "benchmark": "{benchmark}", + "vartype": "{vartype}", + }, ), log: "logs/datavzrd/precision-recall/{benchmark}/{vartype}.log", @@ -281,6 +284,8 @@ rule report_precision_recall: somatic=get_somatic_status, vaf=get_vaf_status, high_coverage=get_high_coverage_status, + genome=get_genome_name, + version=get_genome_version, wrapper: "v3.10.1/utils/datavzrd" @@ -343,5 +348,6 @@ rule report_fp_fn: "logs/datavzrd/fp-fn/{genome}/{cov}/{classification}.log", params: labels=lambda w: get_callsets_labels(get_genome_callsets(w.genome)), + version=get_genome_version, wrapper: "v3.10.1/utils/datavzrd"