diff --git a/workflow/resources/datavzrd/fp-fn-config.yte.yaml b/workflow/resources/datavzrd/fp-fn-config.yte.yaml
index 1feac08..ff75829 100644
--- a/workflow/resources/datavzrd/fp-fn-config.yte.yaml
+++ b/workflow/resources/datavzrd/fp-fn-config.yte.yaml
@@ -31,10 +31,14 @@ views:
     ?view if view == "main" else f"by {view}":
       ?if view == "main":
         desc: |
+          ?f"""
           The values 0/0, 0/1, and 1/1 represent the genotype (not present, heterozygous, homozygous).
           If a callset does not have an entry, this means that the variant matches the truth.
 
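-          Rows and columns are sorted according to an unsupervised hiearchical clustering with hamming distance and ward linkage.
+          Rows and columns are sorted according to an unsupervised hierarchical clustering with hamming distance and ward linkage.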
+
+          Benchmark version: {wildcards.genome} {params.version}
+          """
       ?else:
         desc: |
           ?f"""
@@ -46,6 +50,8 @@ views:
 
           The values 0/0, 0/1, and 1/1 represent the genotype (not present, heterozygous, homozygous).
           If a callset does not have an entry, this means that the variant matches the truth.
+
+          Benchmark version: {wildcards.genome} {params.version}
           """
       dataset: ?view
       page-size: 12
diff --git a/workflow/resources/datavzrd/precision-recall-config.yte.yaml b/workflow/resources/datavzrd/precision-recall-config.yte.yaml
index d5ee2fb..23d2a3e 100644
--- a/workflow/resources/datavzrd/precision-recall-config.yte.yaml
+++ b/workflow/resources/datavzrd/precision-recall-config.yte.yaml
@@ -22,6 +22,7 @@ views:
   results-plot:
     dataset: results
     desc: |
+      ?f"""
       Precision and recall are calculated by matching variants between each callset
       and truth, stratified by coverage categories. The matching ignores genotype
       differences. Instead, genotype mismatches are displayed in the "genotype mismatch rate"
@@ -32,6 +33,9 @@ views:
       are not shown because estimates are unreliable in such cases.
 
       Zoom in by scrolling inside of the plot area, pan by dragging, and reset by double-clicking.
+
+      Benchmark version: {params.genome} {params.version}
+      """
     render-plot:
       spec: |
         {
@@ -71,8 +75,12 @@ views:
     dataset: results
     ?if params.vaf:
       desc: |
+        ?f"""
         Precision and recall are calculated by matching variants between each callset
         and truth, stratified by coverage categories. Stratified by VAF.
+
+        Benchmark version: {params.genome} {params.version}
+        """
       page-size: 12
       render-table:
         columns:
@@ -121,7 +129,7 @@ views:
                     - "#6baed6"
           vaf:
             plot:
-             heatmap:
+              heatmap:
                 scale: linear
                 custom-content:
                   function(value, row) {
@@ -152,10 +160,14 @@ views:
                   - "#1a833fff"
     ?else:
       desc: |
+        ?f"""
         Precision and recall are calculated by matching variants between each callset
         and truth, stratified by coverage categories. The matching ignores genotype
         differences. Instead, genotype mismatches are displayed in the "genotype mismatch rate"
         column.
+
+        Benchmark version: {params.genome} {params.version}
+        """
       page-size: 12
       render-table:
         columns:
diff --git a/workflow/resources/presets.yaml b/workflow/resources/presets.yaml
index 0ec9ea9..fcab238 100644
--- a/workflow/resources/presets.yaml
+++ b/workflow/resources/presets.yaml
@@ -50,6 +50,7 @@ benchmarks:
 
 genomes:
   na12878:
+    version: "4.2.1"
     truth:
       grch37: https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv4.2.1/GRCh37/HG001_GRCh37_1_22_v4.2.1_benchmark.vcf.gz
       grch38: https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv4.2.1/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz
@@ -58,6 +59,7 @@ genomes:
       grch38: https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv4.2.1/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.bed
 
   chm-eval:
+    version: "0.5"
     archive: https://github.com/lh3/CHM-eval/releases/download/v0.5/CHM-evalkit-20180222.tar
     truth:
       grch38: full.38.vcf.gz
@@ -67,6 +69,7 @@ genomes:
       grch37: full.37m.bed.gz
 
   seqc2-somatic:
+    version: "1.2"
     truth:
       grch38:
         snvs: https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/seqc/Somatic_Mutation_WG/release/latest/high-confidence_sSNV_in_HC_regions_v1.2.vcf.gz
@@ -76,6 +79,7 @@ genomes:
     somatic: true
 
   na12878-somatic:
+    version: "4.2.1"
     truth:
       grch38: https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/release/NA12878_HG001/NISTv4.2.1/GRCh38/HG001_GRCh38_1_22_v4.2.1_benchmark.vcf.gz
     confidence-regions:
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 382838d..d46c409 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -93,7 +93,7 @@ def get_plot_cov_labels():  # TODO check if ever used anywhere
     def label(name):
         lower, upper = get_cov_interval(name)
         if upper:
-            return f"{lower}-{upper-1}"
+            return f"{lower}-{upper - 1}"
         return f"≥{lower}"
 
     return {name: label(name) for name in low_coverages}
@@ -490,6 +490,31 @@ def get_collect_precision_recall_input(wildcards):
     )
 
 
+def get_genome_name(wildcards):
+    """Return the name of the genome that a job's wildcards refer to.
+
+    Resolution order: the genome of the ``benchmark`` wildcard, then the
+    genome of the benchmark configured for the ``callset`` wildcard, and
+    finally the ``genome`` wildcard itself.
+    """
+    if hasattr(wildcards, "benchmark"):
+        benchmark = wildcards.benchmark
+    elif hasattr(wildcards, "callset"):
+        benchmark = config["variant-calls"][wildcards.callset]["benchmark"]
+    else:
+        return wildcards.genome
+    return get_benchmark(benchmark).get("genome")
+
+
+def get_genome_version(wildcards):
+    """Return the version recorded for the job's genome, for use in reports.
+
+    Falls back to "unknown" when the genome entry declares no version, so
+    that report descriptions never render a literal "None".
+    """
+    return genomes[get_genome_name(wildcards)].get("version", "unknown")
+
+
 def get_genome_callsets(genome):
     return sorted(
         callset
diff --git a/workflow/rules/eval.smk b/workflow/rules/eval.smk
index 30db27d..e99b03a 100644
--- a/workflow/rules/eval.smk
+++ b/workflow/rules/eval.smk
@@ -273,7 +273,10 @@ rule report_precision_recall:
             directory("results/report/precision-recall/{benchmark}/{vartype}"),
             htmlindex="index.html",
             category="precision/recall",
-            labels={"benchmark": "{benchmark}", "vartype": "{vartype}"},
+            labels={
+                "benchmark": "{benchmark}",
+                "vartype": "{vartype}",
+            },
         ),
     log:
         "logs/datavzrd/precision-recall/{benchmark}/{vartype}.log",
@@ -281,6 +284,8 @@ rule report_precision_recall:
         somatic=get_somatic_status,
         vaf=get_vaf_status,
         high_coverage=get_high_coverage_status,
+        genome=get_genome_name,
+        version=get_genome_version,
     wrapper:
         "v3.10.1/utils/datavzrd"
 
@@ -343,5 +348,6 @@ rule report_fp_fn:
         "logs/datavzrd/fp-fn/{genome}/{cov}/{classification}.log",
     params:
         labels=lambda w: get_callsets_labels(get_genome_callsets(w.genome)),
+        version=get_genome_version,
     wrapper:
         "v3.10.1/utils/datavzrd"