Half-bad hetvars: one bad allele fails the genotype [VS-1615] [VS-1649] #9149

Merged: 57 commits, May 8, 2025
Commits (57)

The file changes shown below are from 16 of the 57 commits.
f75c299
Half-bad hetvars, VCF edition: one bad allele fails the genotype [VS-…
mcovarr Apr 14, 2025
1d30881
dockstore
mcovarr Apr 14, 2025
e21f0ac
VQSR version
mcovarr Apr 15, 2025
3ee277a
VDS attempt
mcovarr Apr 15, 2025
756d363
Merge remote-tracking branch 'origin/ah_var_store' into vs_1615_hetva…
mcovarr Apr 16, 2025
4e18503
dockstore
mcovarr Apr 16, 2025
0171302
make VDS from half-bad failed hetvar VCFs
mcovarr Apr 16, 2025
d5b88b6
update merge and rescore too
mcovarr Apr 16, 2025
d428e13
turn on filtering for VQSR manual tieout
mcovarr Apr 17, 2025
182019c
Merge remote-tracking branch 'origin/ah_var_store' into vs_1615_hetva…
mcovarr Apr 17, 2025
815c1a3
variants docker
mcovarr Apr 17, 2025
bda44b7
dockstore
mcovarr Apr 17, 2025
45b65b2
holy moly tabs
mcovarr Apr 18, 2025
ed09ccc
docker
mcovarr Apr 18, 2025
ace768f
dockstore
mcovarr Apr 18, 2025
852e96f
dockstore
mcovarr Apr 18, 2025
7b47b30
remove stuff prior to review
mcovarr Apr 22, 2025
7c9e753
Merge remote-tracking branch 'origin/ah_var_store' into vs_1615_hetva…
mcovarr Apr 23, 2025
eaf3b08
docker
mcovarr Apr 23, 2025
f919d24
include Hail optimizations, update Docker
mcovarr Apr 23, 2025
1a21244
simplify more
mcovarr Apr 24, 2025
23d45a6
Docker
mcovarr Apr 24, 2025
f444f30
oops
mcovarr Apr 24, 2025
84eaa02
docker
mcovarr Apr 24, 2025
a040dc3
hash bump
mcovarr Apr 24, 2025
124d917
another hash bump
mcovarr Apr 24, 2025
c533916
yet another hash bump
mcovarr Apr 24, 2025
eb82f9f
update truth path
mcovarr Apr 25, 2025
2c00049
hash bump
mcovarr Apr 25, 2025
a545738
another hash bump
mcovarr Apr 25, 2025
fdb1988
Merge remote-tracking branch 'origin/ah_var_store' into vs_1615_hetva…
mcovarr Apr 25, 2025
2311f27
how did that get messed up
mcovarr Apr 29, 2025
7dc7c85
spanning deletion . quality scores become NaN doubles
mcovarr Apr 29, 2025
ab94c99
fresh baked GATK Docker with . quality fixes
mcovarr Apr 29, 2025
b3bb025
hash bump for all chr run
mcovarr Apr 30, 2025
218dae4
hash bump
mcovarr Apr 30, 2025
41f3844
hash bump
mcovarr May 1, 2025
27ac2bb
hash bump
mcovarr May 1, 2025
8e52897
hash bump
mcovarr May 1, 2025
0f21f6f
vat hash bump
mcovarr May 1, 2025
6f831cb
vat hash bump
mcovarr May 1, 2025
64a5bcc
vat hash bump
mcovarr May 1, 2025
3a9a0ae
vat hash bump
mcovarr May 1, 2025
6f606f0
hash bump
mcovarr May 1, 2025
347eb2f
hash bump
mcovarr May 2, 2025
6d92d22
hash bump
mcovarr May 3, 2025
521a606
Merge remote-tracking branch 'origin/ah_var_store' into vs_1615_hetva…
mcovarr May 5, 2025
5fbc434
cleanup
mcovarr May 5, 2025
a8e5796
hash bump
mcovarr May 5, 2025
33a67a8
hash bump
mcovarr May 6, 2025
2a8a8b5
hash bump
mcovarr May 6, 2025
e2bc338
hash bump
mcovarr May 6, 2025
09a0d5f
cleanup, VAT from VDS
mcovarr May 7, 2025
6c6fa2b
hash bump
mcovarr May 7, 2025
9ade1a4
hash bump
mcovarr May 8, 2025
c09d1a3
hash bump
mcovarr May 8, 2025
7aa2ce1
hash bump
mcovarr May 8, 2025
35 changes: 33 additions & 2 deletions .dockstore.yml
@@ -173,7 +173,7 @@ workflows:
branches:
mcovarr (Collaborator Author):
All the changes in this file were just for testing purposes and don't need to be merged.

- master
- ah_var_store
- vs_1641_drop_sample_from_vds
- vs_1615_hetvar_vcfs
tags:
- /.*/
- name: GvsTieOutVDS
@@ -183,6 +183,7 @@
branches:
- master
- ah_var_store
- vs_1615_hetvar_vcfs
tags:
- /.*/
- name: GvsExtractCallset
@@ -194,6 +195,16 @@
- ah_var_store
tags:
- /.*/
- name: HalfBadHetvarExtractCallset
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsExtractCallset.wdl
filters:
branches:
- master
- ah_var_store
- vs_1615_hetvar_vcfs
tags:
- /.*/
- name: GvsExtractCallsetShard30603
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsExtractCallset.wdl
@@ -343,9 +354,29 @@
branches:
- master
- ah_var_store
- vs_1641_drop_sample_from_vds
- vs_1615_hetvar_vcfs
tags:
- /.*/
- name: HalfBadHetvars
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl
filters:
branches:
- master
- ah_var_store
- vs_1615_hetvar_vcfs
tags:
- /.*/
- name: MakeHetvarVDS
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/test/MakeHetvarVDS.wdl
filters:
branches:
- master
- ah_var_store
- vs_1615_hetvar_vcfs
tags:
- /.*/
- name: GvsIngestTieout
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/test/GvsIngestTieout.wdl
8 changes: 4 additions & 4 deletions scripts/variantstore/scripts/import_gvs.py
@@ -378,11 +378,11 @@ def convert_array_with_id_keys_to_dense_array(arr, ids, drop=[]):
any_yes=acc.any_yes | allele_YES[called_idx - 1],
any_snp=acc.any_snp | allele_is_snp[called_idx - 1],
any_indel=acc.any_indel | ~allele_is_snp[called_idx - 1],
any_snp_ok=acc.any_snp_ok | (allele_is_snp[called_idx - 1] & allele_OK[called_idx - 1]),
any_indel_ok=acc.any_indel_ok | (~allele_is_snp[called_idx - 1] & allele_OK[called_idx - 1]),
), hl.struct(any_no=False, any_yes=False, any_snp=False, any_indel=False, any_snp_ok=False, any_indel_ok=False)))
all_snps_ok=acc.all_snps_ok & (~allele_is_snp[called_idx - 1] | allele_OK[called_idx - 1]),
all_indels_ok=acc.all_indels_ok & (allele_is_snp[called_idx - 1] | allele_OK[called_idx - 1]),
Collaborator:
I am confused as to why this is not:

Suggested change
all_indels_ok=acc.all_indels_ok & (allele_is_snp[called_idx - 1] | allele_OK[called_idx - 1]),
all_snps_ok=acc.all_snps_ok & (allele_is_snp[called_idx - 1] & allele_OK[called_idx - 1]),
all_indels_ok=acc.all_indels_ok & (~allele_is_snp[called_idx - 1] & allele_OK[called_idx - 1]),

Maybe I'm just confounded by the logic

@mcovarr (Collaborator Author), Apr 23, 2025:
Taking the SNP line as an example, I interpreted this as "nothing wrong SNP-wise if it's not a SNP at all, OR (implying it is a SNP) if the allele is OK".

If the allele was bad and it was an INDEL, the following line would catch it.

mcovarr (Collaborator Author):
Actually now that I look at this again I think I can/should clean it up a bit... I don't think we care about "any" SNPs or INDELs any more now that the default for the "ok" accumulators is True.

), hl.struct(any_no=False, any_yes=False, any_snp=False, any_indel=False, all_snps_ok=True, all_indels_ok=True)))

vd = vd.annotate_entries(FT=~ft.any_no & (ft.any_yes | ((~ft.any_snp | ft.any_snp_ok) & (~ft.any_indel | ft.any_indel_ok))))
vd = vd.annotate_entries(FT=~ft.any_no & (ft.any_yes | ((~ft.any_snp | ft.all_snps_ok) & (~ft.any_indel | ft.all_indels_ok))))

vd = vd.drop('allele_NO', 'allele_YES', 'allele_is_snp', 'allele_OK')
hl.vds.VariantDataset(
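To ground the accumulator discussion above, here is a minimal plain-Python sketch (not the actual Hail code; `genotype_ft`, `called_idxs`, and the per-allele lists are hypothetical stand-ins for the Hail fields) of how the new `all_snps_ok`/`all_indels_ok` accumulators make one bad allele fail the whole genotype:

```python
# Plain-Python sketch of the Hail fold above (illustrative, not the PR's code).
# FT=True means the genotype passes; called_idxs are 1-based alt-allele indices.
def genotype_ft(called_idxs, allele_NO, allele_YES, allele_is_snp, allele_OK):
    acc = dict(any_no=False, any_yes=False, any_snp=False, any_indel=False,
               all_snps_ok=True, all_indels_ok=True)
    for idx in called_idxs:
        j = idx - 1
        acc["any_no"] |= allele_NO[j]
        acc["any_yes"] |= allele_YES[j]
        acc["any_snp"] |= allele_is_snp[j]
        acc["any_indel"] |= not allele_is_snp[j]
        # "Nothing wrong SNP-wise": either this allele is not a SNP,
        # or it is a SNP and it passed filtering (symmetrically for indels).
        acc["all_snps_ok"] &= (not allele_is_snp[j]) or allele_OK[j]
        acc["all_indels_ok"] &= allele_is_snp[j] or allele_OK[j]
    return (not acc["any_no"]) and (
        acc["any_yes"]
        or ((not acc["any_snp"] or acc["all_snps_ok"])
            and (not acc["any_indel"] or acc["all_indels_ok"])))

# Het genotype with two indel alt alleles: allele 1 passes, allele 2 fails.
# The old any_indel_ok accumulator was an OR, so the passing allele rescued
# the genotype; the new all_indels_ok is an AND, so the half-bad hetvar fails.
assert genotype_ft([1, 2],
                   allele_NO=[False, False], allele_YES=[False, False],
                   allele_is_snp=[False, False], allele_OK=[True, False]) is False
```

The merge_and_rescore_vdses.py change below applies the same accumulator fix to the VDS merge-and-rescore path.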
14 changes: 7 additions & 7 deletions scripts/variantstore/scripts/merge_and_rescore_vdses.py
@@ -153,18 +153,18 @@ def patch_variant_data(vd: hl.MatrixTable, site_filters: hl.Table, vets_filters:
any_yes=acc.any_yes | allele_YES[called_idx - 1],
any_snp=acc.any_snp | allele_is_snp[called_idx - 1],
any_indel=acc.any_indel | ~allele_is_snp[called_idx - 1],
any_snp_ok=acc.any_snp_ok
| (allele_is_snp[called_idx - 1] & allele_OK[called_idx - 1]),
any_indel_ok=acc.any_indel_ok
| (~allele_is_snp[called_idx - 1] & allele_OK[called_idx - 1]),
all_snps_ok=acc.all_snps_ok
& (~allele_is_snp[called_idx - 1] | allele_OK[called_idx - 1]),
all_indels_ok=acc.all_indels_ok
& (allele_is_snp[called_idx - 1] | allele_OK[called_idx - 1]),
),
hl.struct(
any_no=False,
any_yes=False,
any_snp=False,
any_indel=False,
any_snp_ok=False,
any_indel_ok=False,
all_snps_ok=True,
all_indels_ok=True,
),
)
)
@@ -173,7 +173,7 @@ def patch_variant_data(vd: hl.MatrixTable, site_filters: hl.Table, vets_filters:
FT=~ft.any_no
& (
ft.any_yes
| ((~ft.any_snp | ft.any_snp_ok) & (~ft.any_indel | ft.any_indel_ok))
| ((~ft.any_snp | ft.all_snps_ok) & (~ft.any_indel | ft.all_indels_ok))
)
)

36 changes: 18 additions & 18 deletions scripts/variantstore/scripts/vds_validation.py
@@ -17,34 +17,34 @@

mcovarr (Collaborator Author):
this is all stowaway changes fixing some bonkers whitespace that was driving IntelliJ nuts


def check_samples_match(vds):
print('checking sample equivalence between reference and variant MTs')
assert vds.reference_data.cols().select().collect() == vds.variant_data.cols().select().collect()
print('checking sample equivalence between reference and variant MTs')
assert vds.reference_data.cols().select().collect() == vds.variant_data.cols().select().collect()

def check_ref_blocks(vds):
print('checking that:\n * no reference blocks have GQ=0\n * all ref blocks have END after start\n * all ref blocks are max 1000 bases long')
rd = vds.reference_data
rd = rd.annotate_rows(locus_start = rd.locus.position)
print('checking that:\n * no reference blocks have GQ=0\n * all ref blocks have END after start\n * all ref blocks are max 1000 bases long')
rd = vds.reference_data
rd = rd.annotate_rows(locus_start = rd.locus.position)

LEN = rd.END - rd.locus_start + 1
LEN = rd.END - rd.locus_start + 1

print('checking that: no reference blocks have GQ=0')
assert rd.aggregate_entries(hl.agg.all(hl.all(rd.GQ > 0)))
print('checking that: no reference blocks have GQ=0')
assert rd.aggregate_entries(hl.agg.all(hl.all(rd.GQ > 0)))

print('checking that: all ref blocks have END after start')
assert rd.aggregate_entries(hl.agg.all(hl.all(LEN >= 0)))
print('checking that: all ref blocks have END after start')
assert rd.aggregate_entries(hl.agg.all(hl.all(LEN >= 0)))

print('checking that: all ref blocks are max 1000 bases long')
assert rd.aggregate_entries(hl.agg.all(hl.all(LEN <= rd.ref_block_max_length)))
print('checking that: all ref blocks are max 1000 bases long')
assert rd.aggregate_entries(hl.agg.all(hl.all(LEN <= rd.ref_block_max_length)))

def check_densify_small_region(vds):
print('running densify on 200kb region')
from time import time
t1 = time()
print('running densify on 200kb region')
from time import time
t1 = time()

filt = hl.vds.filter_intervals(vds, [hl.parse_locus_interval('chr16:29.5M-29.7M', reference_genome='GRCh38')])
n=hl.vds.to_dense_mt(filt).select_entries('LGT')._force_count_rows()
filt = hl.vds.filter_intervals(vds, [hl.parse_locus_interval('chr16:29.5M-29.7M', reference_genome='GRCh38')])
n=hl.vds.to_dense_mt(filt).select_entries('LGT')._force_count_rows()

print(f'took {time() - t1:.1f}s to densify {n} rows after interval query')
print(f'took {time() - t1:.1f}s to densify {n} rows after interval query')



2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsUtils.wdl
@@ -129,7 +129,7 @@ task GetToolVersions {
# GVS generally uses the smallest `alpine` version of the Google Cloud SDK as it suffices for most tasks, but
# there are a handful of tasks that require the larger GNU libc-based `slim`.
String cloud_sdk_slim_docker = "gcr.io/google.com/cloudsdktool/cloud-sdk:435.0.0-slim"
String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2025-04-17-alpine-927ee022e0b8"
String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2025-04-18-alpine-364b678b8945"
String variants_nirvana_docker = "us.gcr.io/broad-dsde-methods/variantstore:nirvana_2022_10_19"
String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2025-03-20-gatkbase-728e45646e04"
String real_time_genomics_docker = "docker.io/realtimegenomics/rtg-tools:latest"
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl
@@ -178,7 +178,7 @@ workflow GvsQuickstartIntegration {
git_branch_or_tag = git_branch_or_tag,
git_hash = GetToolVersions.git_hash,
use_VETS = false,
extract_do_not_filter_override = true,
extract_do_not_filter_override = false,
mcovarr (Collaborator Author):
will revert before merge

dataset_suffix = "vqsr_vcf",
use_default_dockers = use_default_dockers,
gatk_override = effective_gatk_override,
105 changes: 105 additions & 0 deletions scripts/variantstore/wdl/test/MakeHetvarVDS.wdl
@@ -0,0 +1,105 @@
version 1.0
mcovarr (Collaborator Author):
don't think this needs to be merged, revert


import "../GvsUtils.wdl" as Utils
import "../GvsExtractAvroFilesForHail.wdl" as ExtractAvroFilesForHail
import "../GvsCreateVDS.wdl" as CreateVds
import "GvsQuickstartVcfIntegration.wdl" as QuickstartVcfIntegration
import "GvsTieOutVDS.wdl" as TieOutVDS

workflow MakeHetvarVDS {
input {
String git_branch_or_tag
String? git_hash
String reference_name = "hg38"
Boolean is_wgs = true
String dataset_name
String filter_set_name
File? interval_list
Boolean use_compressed_references = false
Boolean extract_do_not_filter_override = false
String dataset_suffix = "hail"
Boolean use_default_dockers = false
Boolean bgzip_output_vcfs = false

String? basic_docker
String? cloud_sdk_docker
String? cloud_sdk_slim_docker
String? variants_docker
String? gatk_docker

File? gatk_override
String? hail_version
String? sample_set_name ## NOTE: currently we only allow the loading of one sample set at a time

String? workspace_bucket
String? workspace_id
String? submission_id

Int? maximum_alternate_alleles
String ploidy_table_name = "sample_chromosome_ploidy"
}

String project_id = "gvs-internal"

if (!defined(workspace_bucket) || !defined(workspace_id) || !defined(submission_id) ||
!defined(git_hash) || !defined(basic_docker) || !defined(cloud_sdk_docker) || !defined(cloud_sdk_slim_docker) ||
!defined(variants_docker) || !defined(gatk_docker) || !defined(hail_version)) {
call Utils.GetToolVersions {
input:
git_branch_or_tag = git_branch_or_tag,
}
}

String effective_basic_docker = select_first([basic_docker, GetToolVersions.basic_docker])
String effective_cloud_sdk_docker = select_first([cloud_sdk_docker, GetToolVersions.cloud_sdk_docker])
String effective_cloud_sdk_slim_docker = select_first([cloud_sdk_slim_docker, GetToolVersions.cloud_sdk_slim_docker])
String effective_variants_docker = select_first([variants_docker, GetToolVersions.variants_docker])
String effective_gatk_docker = select_first([gatk_docker, GetToolVersions.gatk_docker])
String effective_git_hash = select_first([git_hash, GetToolVersions.git_hash])
String effective_hail_version = select_first([hail_version, GetToolVersions.hail_version])

String effective_workspace_bucket = select_first([workspace_bucket, GetToolVersions.workspace_bucket])
String effective_workspace_id = select_first([workspace_id, GetToolVersions.workspace_id])
String effective_submission_id = select_first([submission_id, GetToolVersions.submission_id])


call ExtractAvroFilesForHail.GvsExtractAvroFilesForHail {
input:
git_branch_or_tag = git_branch_or_tag,
git_hash = git_hash,
project_id = project_id,
dataset_name = dataset_name,
filter_set_name = filter_set_name,
ploidy_table_name = ploidy_table_name,
scatter_width = 10,
call_set_identifier = git_branch_or_tag,
basic_docker = effective_basic_docker,
cloud_sdk_docker = effective_cloud_sdk_docker,
variants_docker = effective_variants_docker,
}

String vds_path = GvsExtractAvroFilesForHail.avro_path + "/gvs_export.vds"

call CreateVds.GvsCreateVDS {
input:
git_branch_or_tag = git_branch_or_tag,
hail_version = effective_hail_version,
avro_path = GvsExtractAvroFilesForHail.avro_path,
vds_destination_path = vds_path,
cluster_prefix = "vds-cluster",
gcs_subnetwork_name = "subnetwork",
region = "us-central1",
basic_docker = effective_basic_docker,
variants_docker = effective_variants_docker,
cloud_sdk_slim_docker = effective_cloud_sdk_slim_docker,
cluster_max_age_minutes = 120,
cluster_max_idle_minutes = 60,
leave_cluster_running_at_end = false,
}

output {
String vds_output_path = GvsExtractAvroFilesForHail.avro_path + "/gvs_export.vds"
String recorded_git_hash = effective_git_hash
Boolean done = true
}
}
@@ -1006,14 +1006,14 @@ private VariantContext createVariantContextFromSampleRecord(final ExtractCohortR
boolean isFailingGenotype(final Stream<Allele> nonRefAlleles,
final Map<Allele, Double> remappedVQScoreMap,
final Double vqScoreThreshold) {
// get the max (best) vqslod of the alleles in this genotype
Optional<Double> maxVal =
// get the min (worst) vqslod of the alleles in this genotype
Optional<Double> minVal =
nonRefAlleles
.map(remappedVQScoreMap::get)
.filter(Objects::nonNull)
.max(Double::compareTo);
// It's a failing site if the maximum vqlod (if found) is less than the threshold
return maxVal.isPresent() && maxVal.get() < vqScoreThreshold;
.min(Double::compareTo);
// It's a failing genotype if the minimum vqslod (if found) is less than the threshold
return minVal.isPresent() && minVal.get() < vqScoreThreshold;
}

private SortingCollection<GenericRecord> createSortedVetCollectionFromBigQuery(final String projectID,
@@ -129,13 +129,14 @@ boolean isFailingSite(final Stream<Double> vqScores, final Double vqScoreThresho
boolean isFailingGenotype(final Stream<Allele> nonRefAlleles,
final Map<Allele, Double> remappedVQScoreMap,
final Double vqScoreThreshold) {
// get the minimum (best) calibration sensitivity for all non-Yay sites, and apply the filter
Optional<Double> minVal =
// Get the maximum (worst) calibration sensitivity for these non-Yay alleles. If there is an allele, and it is
// greater than the vqScoreThreshold, fail the genotype.
Optional<Double> maxVal =
nonRefAlleles
.map(remappedVQScoreMap::get)
.filter(Objects::nonNull)
.min(Double::compareTo);
.max(Double::compareTo);

return minVal.isPresent() && minVal.get() > vqScoreThreshold;
return maxVal.isPresent() && maxVal.get() > vqScoreThreshold;
}
}
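Both isFailingGenotype changes flip the aggregation from the genotype's best allele score to its worst, which is what makes one bad allele fail the genotype. A plain-Python sketch with hypothetical VQSLOD values (illustrative only, not from the PR; higher VQSLOD is better):

```python
# Sketch of the semantic change (illustrative values, not from the PR).
def is_failing_genotype_old(vqslods, threshold):
    # Old behavior: judge the genotype by its BEST (max) allele score,
    # so one good allele could rescue a half-bad het genotype.
    scored = [s for s in vqslods if s is not None]
    return bool(scored) and max(scored) < threshold

def is_failing_genotype_new(vqslods, threshold):
    # New behavior: judge by the WORST (min) allele score,
    # so one bad allele fails the whole genotype.
    scored = [s for s in vqslods if s is not None]
    return bool(scored) and min(scored) < threshold

# Het genotype with one passing allele (5.0) and one failing allele (-3.0):
assert is_failing_genotype_old([5.0, -3.0], threshold=0.0) is False  # rescued before
assert is_failing_genotype_new([5.0, -3.0], threshold=0.0) is True   # fails now
```

The calibration-sensitivity variant is the mirror image: lower sensitivity is better there, so it takes the max (worst) value across the genotype's alleles and fails the genotype when that exceeds the threshold.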