From 533d1d4569bc69a958d6d7ec489aa8a36a9c1279 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 13 Dec 2024 09:41:27 -0500 Subject: [PATCH 01/10] Add service account credentialing (#997) * Add service account credentialing * ruff --- v03_pipeline/lib/misc/gcp.py | 36 +++++++++++++++++++ .../tasks/dataproc/create_dataproc_cluster.py | 5 +++ .../dataproc/create_dataproc_cluster_test.py | 18 +++++++++- 3 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 v03_pipeline/lib/misc/gcp.py diff --git a/v03_pipeline/lib/misc/gcp.py b/v03_pipeline/lib/misc/gcp.py new file mode 100644 index 000000000..c22113bf1 --- /dev/null +++ b/v03_pipeline/lib/misc/gcp.py @@ -0,0 +1,36 @@ +import datetime + +import google.auth +import google.auth.transport.requests +import google.oauth2.credentials +import pytz + +SERVICE_ACCOUNT_CREDENTIALS = None +SOCIAL_AUTH_GOOGLE_OAUTH2_SCOPE = [ + 'https://www.googleapis.com/auth/userinfo.profile', + 'https://www.googleapis.com/auth/userinfo.email', + 'openid', +] +ONE_MINUTE_S = 60 + + +def get_service_account_credentials() -> google.oauth2.credentials.Credentials: + global SERVICE_ACCOUNT_CREDENTIALS + if not SERVICE_ACCOUNT_CREDENTIALS: + SERVICE_ACCOUNT_CREDENTIALS, _ = google.auth.default( + scopes=SOCIAL_AUTH_GOOGLE_OAUTH2_SCOPE, + ) + tz = pytz.UTC + if ( + SERVICE_ACCOUNT_CREDENTIALS.token + and ( + tz.localize(SERVICE_ACCOUNT_CREDENTIALS.expiry) + - datetime.datetime.now(tz=tz) + ).total_seconds() + > ONE_MINUTE_S + ): + return SERVICE_ACCOUNT_CREDENTIALS + SERVICE_ACCOUNT_CREDENTIALS.refresh( + request=google.auth.transport.requests.Request(), + ) + return SERVICE_ACCOUNT_CREDENTIALS diff --git a/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster.py b/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster.py index 3a1d6c9da..f7d0bc21d 100644 --- a/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster.py +++ b/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster.py @@ -6,6 +6,7 @@ from pip._internal.operations import freeze as pip_freeze from v03_pipeline.lib.logger import get_logger +from v03_pipeline.lib.misc.gcp import get_service_account_credentials from v03_pipeline.lib.model import Env, ReferenceGenome from v03_pipeline.lib.tasks.base.base_loading_pipeline_params import ( BaseLoadingPipelineParams, @@ -22,9 +23,11 @@ def get_cluster_config(reference_genome: ReferenceGenome, run_id: str): + service_account_credentials = get_service_account_credentials() return { 'project_id': Env.GCLOUD_PROJECT, 'cluster_name': f'{CLUSTER_NAME_PREFIX}-{reference_genome.value.lower()}-{run_id}', + # Schema found at https://cloud.google.com/dataproc/docs/reference/rest/v1/ClusterConfig 'config': { 'gce_cluster_config': { 'zone_uri': Env.GCLOUD_ZONE, @@ -35,6 +38,8 @@ def get_cluster_config(reference_genome: ReferenceGenome, run_id: str): 'REFERENCE_GENOME': reference_genome.value, 'PIPELINE_RUNNER_APP_VERSION': Env.PIPELINE_RUNNER_APP_VERSION, }, + 'service_account': service_account_credentials.service_account_email, + 'service_account_scopes': service_account_credentials.scopes, }, 'master_config': { 'num_instances': 1, diff --git a/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster_test.py b/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster_test.py index bb447e0d6..c7f4f2958 100644 --- a/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster_test.py +++ b/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster_test.py @@ -5,17 +5,29 @@ import google.api_core.exceptions import luigi +from v03_pipeline.lib.misc.gcp import 
SOCIAL_AUTH_GOOGLE_OAUTH2_SCOPE from v03_pipeline.lib.model import DatasetType, ReferenceGenome from v03_pipeline.lib.tasks.dataproc.create_dataproc_cluster import ( CreateDataprocClusterTask, ) +@patch( + 'v03_pipeline.lib.tasks.dataproc.create_dataproc_cluster.get_service_account_credentials', + return_value=SimpleNamespace( + service_account_email='test@serviceaccount.com', + scopes=SOCIAL_AUTH_GOOGLE_OAUTH2_SCOPE, + ), +) @patch( 'v03_pipeline.lib.tasks.dataproc.create_dataproc_cluster.dataproc.ClusterControllerClient', ) class CreateDataprocClusterTaskTest(unittest.TestCase): - def test_dataset_type_unsupported(self, mock_cluster_controller: Mock) -> None: + def test_dataset_type_unsupported( + self, + mock_cluster_controller: Mock, + _: Mock, + ) -> None: worker = luigi.worker.Worker() task = CreateDataprocClusterTask( reference_genome=ReferenceGenome.GRCh38, @@ -29,6 +41,7 @@ def test_dataset_type_unsupported(self, mock_cluster_controller: Mock) -> None: def test_spinup_cluster_already_exists_failed( self, mock_cluster_controller: Mock, + _: Mock, ) -> None: mock_client = mock_cluster_controller.return_value mock_client.get_cluster.return_value = SimpleNamespace( @@ -50,6 +63,7 @@ def test_spinup_cluster_already_exists_failed( def test_spinup_cluster_already_exists_success( self, mock_cluster_controller: Mock, + _: Mock, ) -> None: mock_client = mock_cluster_controller.return_value mock_client.get_cluster.return_value = SimpleNamespace( @@ -73,6 +87,7 @@ def test_spinup_cluster_doesnt_exist_failed( self, mock_logger: Mock, mock_cluster_controller: Mock, + _: Mock, ) -> None: mock_client = mock_cluster_controller.return_value mock_client.get_cluster.side_effect = google.api_core.exceptions.NotFound( @@ -98,6 +113,7 @@ def test_spinup_cluster_doesnt_exist_success( self, mock_logger: Mock, mock_cluster_controller: Mock, + _: Mock, ) -> None: mock_client = mock_cluster_controller.return_value mock_client.get_cluster.side_effect = google.api_core.exceptions.NotFound( From b98fe664c8733e8edb0faf39ff042827a362a93d Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 13 Dec 2024 11:13:20 -0500 Subject: [PATCH 02/10] feat: Handle parsing empty predicted sex into Unknown (#1000) --- v03_pipeline/lib/misc/io.py | 6 +++++- v03_pipeline/lib/misc/io_test.py | 1 + v03_pipeline/lib/model/definitions.py | 10 +++++----- v03_pipeline/var/test/sex_check/test_imputed_sex.tsv | 1 + 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/v03_pipeline/lib/misc/io.py b/v03_pipeline/lib/misc/io.py index 865cdf5b3..cc31ffba1 100644 --- a/v03_pipeline/lib/misc/io.py +++ b/v03_pipeline/lib/misc/io.py @@ -219,7 +219,11 @@ def select_relevant_fields( def import_imputed_sex(imputed_sex_path: str) -> hl.Table: ht = hl.import_table(imputed_sex_path) imputed_sex_lookup = hl.dict( - {s.imputed_sex_value: s.value for s in Sex}, + { + imputed_sex_value: s.value + for s in Sex + for imputed_sex_value in s.imputed_sex_values + }, ) ht = ht.select( s=ht.collaborator_sample_id, diff --git a/v03_pipeline/lib/misc/io_test.py b/v03_pipeline/lib/misc/io_test.py index f392c5c76..c9b4f3bce 100644 --- a/v03_pipeline/lib/misc/io_test.py +++ b/v03_pipeline/lib/misc/io_test.py @@ -47,6 +47,7 @@ def test_import_imputed_sex(self) -> None: hl.Struct(s='abc_2', predicted_sex='F'), hl.Struct(s='abc_3', predicted_sex='U'), hl.Struct(s='abc_4', predicted_sex='XYY'), + hl.Struct(s='abc_5', predicted_sex='U'), ], ) diff --git a/v03_pipeline/lib/model/definitions.py b/v03_pipeline/lib/model/definitions.py index 
a87437c8f..09b10fcbd 100644 --- a/v03_pipeline/lib/model/definitions.py +++ b/v03_pipeline/lib/model/definitions.py @@ -18,12 +18,12 @@ class Sex(str, Enum): XYY = 'XYY' @property - def imputed_sex_value(self): + def imputed_sex_values(self) -> list[str]: return { - Sex.MALE: 'Male', - Sex.FEMALE: 'Female', - Sex.UNKNOWN: 'Unknown', - }.get(self, self.name) + Sex.MALE: ['Male'], + Sex.FEMALE: ['Female'], + Sex.UNKNOWN: ['', 'Unknown'], + }.get(self, [self.name]) class PipelineVersion(str, Enum): diff --git a/v03_pipeline/var/test/sex_check/test_imputed_sex.tsv b/v03_pipeline/var/test/sex_check/test_imputed_sex.tsv index a22b0e26a..f0ae98e6c 100644 --- a/v03_pipeline/var/test/sex_check/test_imputed_sex.tsv +++ b/v03_pipeline/var/test/sex_check/test_imputed_sex.tsv @@ -3,3 +3,4 @@ SM-DM66X abc_1 abc_1 0E+00 gs://datarepo-9cafeffd-bucket/f511b131-3f0d-4eb7-a7f0 SM-DM69X abc_2 abc_2 0E+00 gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/c4c07edf-7735-4aa7-9283-7cb2607b60a2/GLE-5774-3-3.qc-coverage-region-1_coverage_metrics.csv gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/dcd4c271-0249-47f1-8e91-81f74735c5a1/GLE-5774-3-3.cram.crai gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/ec41ec06-673f-4fe2-a063-23dc5fe1dcce/GLE-5774-3-3.cram.md5sum gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/aad0e270-2ad5-4f39-b968-9b4beafeb5cc/GLE-5774-3-3.cram a4b04a39-9234-4028-a155-442c4acf12a0 07.021.604.3.7.8 ce74d94c-c33d-49d7-85c9-5f3cbd08aff7 2024-04-17T15:02:46 99.800000000 gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/c3a9e6f2-4c68-410b-823d-46ca406e5061/GLE-5774-3-3.mapping_metrics.csv DNA:DNA Genomic 35.300000000 Whole Blood:Whole Blood PT-24OHM Pass PDO-32755 96.320000000 97.340000000 Female P-WG-0139 2017-04-12 04:00:00 Female RP-3061 gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/c71cd2a1-c789-4715-9ebc-dbfc40d9f2e2/GLE-5774-3-3.vcf.gz.tbi gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/957a99cb-c9a9-4fc5-a0ec-53f9e461469e/GLE-5774-3-3.vcf.gz.md5sum gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/df520949-5f2b-4976-9d46-80d1cc299813/GLE-5774-3-3.vcf.gz 133253714921.000000000 gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/2e98e51b-9394-4e64-977f-e9010a4e16dc/GLE-5774-3-3.vc_metrics.csv SM-DPB5G abc_3 abc_3 0E+00 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/432f8354-77e0-4381-9bb5-dfdc0633b5b2/PIE_OGI1433_002628_1.qc-coverage-region-1_coverage_metrics.csv gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/3dc623fa-2a45-4b3d-a0f8-fcdec09f9418/PIE_OGI1433_002628_1.cram.crai gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/895966ef-c705-4c18-952d-03863243a184/PIE_OGI1433_002628_1.cram.md5sum gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/96ca6d5f-fb23-4102-bb5e-c7bbfd194e1c/PIE_OGI1433_002628_1.cram ffb50687-165e-425a-a545-c3797d3a28d4 07.021.604.3.7.8 55729ba9-3ce4-47b3-9c3b-1148737ae40f 2024-04-17T15:07:57 99.670000000 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/30f8e208-5d2d-4ce8-b835-695b5ed673f4/PIE_OGI1433_002628_1.mapping_metrics.csv DNA:DNA Genomic 41.910000000 Whole Blood:Whole Blood PT-25BR5 Pass PDO-32756 92.920000000 97.990000000 Unknown P-WG-0139 2017-05-19 04:00:00 Unknown RP-3062 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/1641d1b2-1035-4cc3-9c8b-0c8cb430f56b/PIE_OGI1433_002628_1.vcf.gz.tbi 
gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/f5ba2708-899e-42e8-b287-fdf72c2e404d/PIE_OGI1433_002628_1.vcf.gz.md5sum gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/e925ee5d-a75e-471f-adfd-2756c8690069/PIE_OGI1433_002628_1.vcf.gz 156149580126.000000000 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/df076bc5-9db8-44f0-a3fe-f693370634cc/PIE_OGI1433_002628_1.vc_metrics.csv SM-DPB5G abc_4 abc_4 0E+00 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/432f8354-77e0-4381-9bb5-dfdc0633b5b2/PIE_OGI1433_002628_1.qc-coverage-region-1_coverage_metrics.csv gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/3dc623fa-2a45-4b3d-a0f8-fcdec09f9418/PIE_OGI1433_002628_1.cram.crai gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/895966ef-c705-4c18-952d-03863243a184/PIE_OGI1433_002628_1.cram.md5sum gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/96ca6d5f-fb23-4102-bb5e-c7bbfd194e1c/PIE_OGI1433_002628_1.cram ffb50687-165e-425a-a545-c3797d3a28d4 07.021.604.3.7.8 55729ba9-3ce4-47b3-9c3b-1148737ae40f 2024-04-17T15:07:57 99.670000000 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/30f8e208-5d2d-4ce8-b835-695b5ed673f4/PIE_OGI1433_002628_1.mapping_metrics.csv DNA:DNA Genomic 41.910000000 Whole Blood:Whole Blood PT-25BR5 Pass PDO-32756 92.920000000 97.990000000 XYY P-WG-0139 2017-05-19 04:00:00 XYY RP-3062 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/1641d1b2-1035-4cc3-9c8b-0c8cb430f56b/PIE_OGI1433_002628_1.vcf.gz.tbi gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/f5ba2708-899e-42e8-b287-fdf72c2e404d/PIE_OGI1433_002628_1.vcf.gz.md5sum gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/e925ee5d-a75e-471f-adfd-2756c8690069/PIE_OGI1433_002628_1.vcf.gz 156149580126.000000000 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/df076bc5-9db8-44f0-a3fe-f693370634cc/PIE_OGI1433_002628_1.vc_metrics.csv +SM-DPB5G abc_5 abc_5 0E+00 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/432f8354-77e0-4381-9bb5-dfdc0633b5b2/PIE_OGI1433_002628_1.qc-coverage-region-1_coverage_metrics.csv gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/3dc623fa-2a45-4b3d-a0f8-fcdec09f9418/PIE_OGI1433_002628_1.cram.crai gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/895966ef-c705-4c18-952d-03863243a184/PIE_OGI1433_002628_1.cram.md5sum gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/96ca6d5f-fb23-4102-bb5e-c7bbfd194e1c/PIE_OGI1433_002628_1.cram ffb50687-165e-425a-a545-c3797d3a28d4 07.021.604.3.7.8 55729ba9-3ce4-47b3-9c3b-1148737ae40f 2024-04-17T15:07:57 99.670000000 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/30f8e208-5d2d-4ce8-b835-695b5ed673f4/PIE_OGI1433_002628_1.mapping_metrics.csv DNA:DNA Genomic 41.910000000 Whole Blood:Whole Blood PT-25BR5 Pass PDO-32756 92.920000000 97.990000000 P-WG-0139 2017-05-19 04:00:00 RP-3062 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/1641d1b2-1035-4cc3-9c8b-0c8cb430f56b/PIE_OGI1433_002628_1.vcf.gz.tbi gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/f5ba2708-899e-42e8-b287-fdf72c2e404d/PIE_OGI1433_002628_1.vcf.gz.md5sum gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/e925ee5d-a75e-471f-adfd-2756c8690069/PIE_OGI1433_002628_1.vcf.gz 156149580126.000000000 
gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/df076bc5-9db8-44f0-a3fe-f693370634cc/PIE_OGI1433_002628_1.vc_metrics.csv From 8b58c01aa462f9b6ad35bc29954f124c03c26e67 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 13 Dec 2024 11:48:37 -0500 Subject: [PATCH 03/10] Add helper functions for querying `Terra Data Repository` (#998) * Add service account credentialing * ruff * First pass * tests passing * add coverage of bigquery test * change function names * use generators everywhere * bq requirement * resolver * Update sample id name * Build Sex Check Table from TDR Metrics (#999) --- requirements.in | 1 + requirements.txt | 22 +- v03_pipeline/lib/misc/allele_registry.py | 10 +- v03_pipeline/lib/misc/requests.py | 14 + .../lib/misc/terra_data_repository.py | 68 ++++ .../lib/misc/terra_data_repository_test.py | 303 ++++++++++++++++++ v03_pipeline/lib/model/dataset_type.py | 8 + v03_pipeline/lib/model/environment.py | 2 + v03_pipeline/lib/paths.py | 17 +- v03_pipeline/lib/paths_test.py | 10 +- .../lib/tasks/base/base_loading_run_params.py | 4 + .../lib/tasks/write_imported_callset.py | 12 + .../lib/tasks/write_sex_check_table.py | 24 +- .../lib/tasks/write_sex_check_table_test.py | 96 ++++++ .../lib/tasks/write_tdr_metrics_file.py | 35 ++ .../lib/tasks/write_tdr_metrics_files.py | 28 ++ 16 files changed, 623 insertions(+), 31 deletions(-) create mode 100644 v03_pipeline/lib/misc/requests.py create mode 100644 v03_pipeline/lib/misc/terra_data_repository.py create mode 100644 v03_pipeline/lib/misc/terra_data_repository_test.py create mode 100644 v03_pipeline/lib/tasks/write_sex_check_table_test.py create mode 100644 v03_pipeline/lib/tasks/write_tdr_metrics_file.py create mode 100644 v03_pipeline/lib/tasks/write_tdr_metrics_files.py diff --git a/requirements.in b/requirements.in index 6d5abb8b9..717740a88 100644 --- a/requirements.in +++ b/requirements.in @@ -4,3 +4,4 @@ gnomad==0.6.4 aiofiles==24.1.0 pydantic==2.8.2 google-cloud-dataproc==5.14.0 +google-cloud-bigquery==3.27.0 diff --git a/requirements.txt b/requirements.txt index 9762fbca9..86e9f058b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -97,17 +97,30 @@ frozenlist==1.5.0 gnomad==0.6.4 # via -r requirements.in google-api-core[grpc]==2.22.0 - # via google-cloud-dataproc + # via + # google-cloud-bigquery + # google-cloud-core + # google-cloud-dataproc google-auth==2.35.0 # via # google-api-core # google-auth-oauthlib + # google-cloud-bigquery + # google-cloud-core # google-cloud-dataproc # hail google-auth-oauthlib==0.8.0 # via hail +google-cloud-bigquery==3.27.0 + # via -r requirements.in +google-cloud-core==2.4.1 + # via google-cloud-bigquery google-cloud-dataproc==5.14.0 # via -r requirements.in +google-crc32c==1.6.0 + # via google-resumable-media +google-resumable-media==2.7.2 + # via google-cloud-bigquery googleapis-common-protos[grpc]==1.65.0 # via # google-api-core @@ -197,6 +210,7 @@ orjson==3.10.10 packaging==24.1 # via # bokeh + # google-cloud-bigquery # plotly pandas==2.2.3 # via @@ -256,9 +270,7 @@ pygments==2.18.0 # ipython # rich pyjwt[crypto]==2.9.0 - # via - # msal - # pyjwt + # via msal pyspark==3.5.3 # via hail python-daemon==3.1.0 @@ -266,6 +278,7 @@ python-daemon==3.1.0 python-dateutil==2.9.0.post0 # via # botocore + # google-cloud-bigquery # luigi # pandas python-json-logger==2.0.7 @@ -282,6 +295,7 @@ requests==2.32.3 # via # azure-core # google-api-core + # google-cloud-bigquery # hail # msal # msrest diff --git a/v03_pipeline/lib/misc/allele_registry.py 
b/v03_pipeline/lib/misc/allele_registry.py index 77023a9ba..6129d7e50 100644 --- a/v03_pipeline/lib/misc/allele_registry.py +++ b/v03_pipeline/lib/misc/allele_registry.py @@ -8,9 +8,9 @@ import hailtop.fs as hfs import requests from requests import HTTPError -from requests.adapters import HTTPAdapter, Retry from v03_pipeline.lib.logger import get_logger +from v03_pipeline.lib.misc.requests import requests_retry_session from v03_pipeline.lib.model import Env, ReferenceGenome MAX_VARIANTS_PER_REQUEST = 1000000 @@ -96,13 +96,7 @@ def register_alleles( logger.info('Calling the ClinGen Allele Registry') with hfs.open(formatted_vcf_file_name, 'r') as vcf_in: data = vcf_in.read() - s = requests.Session() - retries = Retry( - total=5, - backoff_factor=1, - status_forcelist=[500, 502, 503, 504], - ) - s.mount('https://', HTTPAdapter(max_retries=retries)) + s = requests_retry_session() res = s.put( url=build_url(base_url, reference_genome), data=data, diff --git a/v03_pipeline/lib/misc/requests.py b/v03_pipeline/lib/misc/requests.py new file mode 100644 index 000000000..e0ee66f36 --- /dev/null +++ b/v03_pipeline/lib/misc/requests.py @@ -0,0 +1,14 @@ +import requests +from requests.adapters import HTTPAdapter, Retry + + +def requests_retry_session(): + s = requests.Session() + retries = Retry( + total=5, + backoff_factor=1, + status_forcelist=[500, 502, 503, 504], + ) + s.mount('http://', HTTPAdapter(max_retries=retries)) + s.mount('https://', HTTPAdapter(max_retries=retries)) + return s diff --git a/v03_pipeline/lib/misc/terra_data_repository.py b/v03_pipeline/lib/misc/terra_data_repository.py new file mode 100644 index 000000000..d55b3ff37 --- /dev/null +++ b/v03_pipeline/lib/misc/terra_data_repository.py @@ -0,0 +1,68 @@ +import os +import re +from collections.abc import Generator +from concurrent.futures import ThreadPoolExecutor, as_completed + +import google.cloud.bigquery +from google.cloud import bigquery + +from v03_pipeline.lib.misc.gcp import get_service_account_credentials +from v03_pipeline.lib.misc.requests import requests_retry_session + +BIGQUERY_METRICS = [ + 'collaborator_sample_id', + 'predicted_sex', +] +BIGQUERY_RESOURCE = 'bigquery' +TABLE_NAME_VALIDATION_REGEX = r'datarepo-\w+.datarepo_\w+' +TDR_ROOT_URL = 'https://data.terra.bio/api/repository/v1/' + + +def _tdr_request(resource: str) -> dict: + service_account_token = get_service_account_credentials().token + s = requests_retry_session() + res = s.get( + url=os.path.join(TDR_ROOT_URL, resource), + headers={'Authorization': f'Bearer {service_account_token}'}, + timeout=10, + ) + res.raise_for_status() + return res.json() + + +def _get_dataset_ids() -> list[str]: + res_body = _tdr_request('datasets') + items = res_body['items'] + for item in items: + if not any(x['cloudResource'] == BIGQUERY_RESOURCE for x in item['storage']): + # Hard failure on purpose to prompt manual investigation. 
+ msg = 'Datasets without bigquery sources are unsupported' + raise ValueError(msg) + return [x['id'] for x in items] + + +def gen_bq_table_names() -> Generator[str]: + with ThreadPoolExecutor(max_workers=5) as executor: + futures = [ + executor.submit( + _tdr_request, + f'datasets/{dataset_id}?include=ACCESS_INFORMATION', + ) + for dataset_id in _get_dataset_ids() + ] + for future in as_completed(futures): + result = future.result() + yield f"{result['accessInformation']['bigQuery']['projectId']}.{result['accessInformation']['bigQuery']['datasetName']}" + + +def bq_metrics_query(bq_table_name: str) -> google.cloud.bigquery.table.RowIterator: + if not re.match(TABLE_NAME_VALIDATION_REGEX, bq_table_name): + msg = f'{bq_table_name} does not match expected pattern' + raise ValueError(msg) + client = bigquery.Client() + return client.query_and_wait( + f""" + SELECT {','.join(BIGQUERY_METRICS)} + FROM `{bq_table_name}.sample` + """, # noqa: S608 + ) diff --git a/v03_pipeline/lib/misc/terra_data_repository_test.py b/v03_pipeline/lib/misc/terra_data_repository_test.py new file mode 100644 index 000000000..54646e193 --- /dev/null +++ b/v03_pipeline/lib/misc/terra_data_repository_test.py @@ -0,0 +1,303 @@ +import json +import os +import unittest +from types import SimpleNamespace +from unittest.mock import Mock, patch + +import responses + +from v03_pipeline.lib.misc.terra_data_repository import ( + TDR_ROOT_URL, + _get_dataset_ids, + gen_bq_table_names, +) + +TDR_DATASETS = [ + { + 'id': '2dc51ee0-a037-499d-a915-a7c20a0b216d', + 'name': 'RP_3053', + 'description': 'TGG_Ware_DRAGEN_hg38. Dataset automatically created and linked to: RP-3053', + 'defaultProfileId': '0a164b9a-2b8b-45d2-859e-a4e369b9cb4f', + 'createdDate': '2023-10-05T13:15:27.649760Z', + 'storage': [ + { + 'region': 'us-central1', + 'cloudResource': 'bigquery', + 'cloudPlatform': 'gcp', + }, + { + 'region': 'us-east4', + 'cloudResource': 'firestore', + 'cloudPlatform': 'gcp', + }, + { + 'region': 'us-central1', + 'cloudResource': 'bucket', + 'cloudPlatform': 'gcp', + }, + ], + 'secureMonitoringEnabled': False, + 'cloudPlatform': 'gcp', + 'dataProject': 'datarepo-7242affb', + 'storageAccount': None, + 'phsId': None, + 'selfHosted': False, + 'predictableFileIds': False, + 'tags': [], + 'resourceLocks': { + 'exclusive': None, + 'shared': [], + }, + }, + { + 'id': 'beef77e8-575b-40e5-9340-a6f10e0bec67', + 'name': 'RP_3056', + 'description': 'RGP_HMB_DRAGEN_hg38. Dataset automatically created and linked to: RP-3056', + 'defaultProfileId': '835dd05d-c603-4b0d-926f-d9143fd24549', + 'createdDate': '2023-10-05T20:15:27.622481Z', + 'storage': [ + { + 'region': 'us-central1', + 'cloudResource': 'bigquery', + 'cloudPlatform': 'gcp', + }, + { + 'region': 'us-east4', + 'cloudResource': 'firestore', + 'cloudPlatform': 'gcp', + }, + { + 'region': 'us-central1', + 'cloudResource': 'bucket', + 'cloudPlatform': 'gcp', + }, + ], + 'secureMonitoringEnabled': False, + 'cloudPlatform': 'gcp', + 'dataProject': 'datarepo-5a72e31b', + 'storageAccount': None, + 'phsId': None, + 'selfHosted': False, + 'predictableFileIds': False, + 'tags': [], + 'resourceLocks': { + 'exclusive': None, + 'shared': [], + }, + }, + { + 'id': 'c8d74ac4-9a2e-4d3d-a6b5-1f4b433d949f', + 'name': 'RP_3055', + 'description': 'RGP_GRU_DRAGEN_hg38. 
Dataset automatically created and linked to: RP-3055', + 'defaultProfileId': '835dd05d-c603-4b0d-926f-d9143fd24549', + 'createdDate': '2023-10-05T20:15:28.255304Z', + 'storage': [ + { + 'region': 'us-central1', + 'cloudResource': 'bigquery', + 'cloudPlatform': 'gcp', + }, + { + 'region': 'us-east4', + 'cloudResource': 'firestore', + 'cloudPlatform': 'gcp', + }, + { + 'region': 'us-central1', + 'cloudResource': 'bucket', + 'cloudPlatform': 'gcp', + }, + ], + 'secureMonitoringEnabled': False, + 'cloudPlatform': 'gcp', + 'dataProject': 'datarepo-0f5be351', + 'storageAccount': None, + 'phsId': None, + 'selfHosted': False, + 'predictableFileIds': False, + 'tags': [], + 'resourceLocks': { + 'exclusive': None, + 'shared': [ + 'Gw_KTeYRS1aLhRvapMLYLg', + 'WrP-0w1aROOUbgkI8JS6Ug', + ], + }, + }, +] + + +@patch( + 'v03_pipeline.lib.misc.terra_data_repository.get_service_account_credentials', + return_value=SimpleNamespace( + token='abcdefg', # noqa: S106 + ), +) +class TerraDataRepositoryTest(unittest.TestCase): + @responses.activate + def test_get_dataset_ids(self, _: Mock) -> None: + responses.get( + os.path.join(TDR_ROOT_URL, 'datasets'), + body=json.dumps( + { + 'total': 3, + 'filteredTotal': 3, + 'items': TDR_DATASETS, + }, + ), + ) + self.assertListEqual( + _get_dataset_ids(), + [ + '2dc51ee0-a037-499d-a915-a7c20a0b216d', + 'beef77e8-575b-40e5-9340-a6f10e0bec67', + 'c8d74ac4-9a2e-4d3d-a6b5-1f4b433d949f', + ], + ) + + @responses.activate + def test_get_dataset_ids_no_bq(self, _: Mock) -> None: + responses.get( + os.path.join(TDR_ROOT_URL, 'datasets'), + body=json.dumps( + { + 'total': 1, + 'filteredTotal': 1, + 'items': [ + { + 'id': '2dc51ee0-a037-499d-a915-a7c20a0b216d', + 'name': 'RP_3053', + 'description': 'TGG_Ware_DRAGEN_hg38. Dataset automatically created and linked to: RP-3053', + 'defaultProfileId': '0a164b9a-2b8b-45d2-859e-a4e369b9cb4f', + 'createdDate': '2023-10-05T13:15:27.649760Z', + 'storage': [ + # NB: bigquery was removed from 'storage' here. + { + 'region': 'us-east4', + 'cloudResource': 'firestore', + 'cloudPlatform': 'gcp', + }, + { + 'region': 'us-central1', + 'cloudResource': 'bucket', + 'cloudPlatform': 'gcp', + }, + ], + 'secureMonitoringEnabled': False, + 'cloudPlatform': 'gcp', + 'dataProject': 'datarepo-7242affb', + 'storageAccount': None, + 'phsId': None, + 'selfHosted': False, + 'predictableFileIds': False, + 'tags': [], + 'resourceLocks': { + 'exclusive': None, + 'shared': [], + }, + }, + ], + }, + ), + ) + self.assertRaises( + ValueError, + _get_dataset_ids, + ) + + @responses.activate + def test_gen_bq_table_names(self, _: Mock) -> None: + responses.get( + os.path.join(TDR_ROOT_URL, 'datasets'), + body=json.dumps( + { + 'total': 3, + 'filteredTotal': 3, + 'items': TDR_DATASETS, + }, + ), + ) + for dataset_id, name, project_name, dataset_name in [ + ( + '2dc51ee0-a037-499d-a915-a7c20a0b216d', + 'RP_3053', + 'datarepo-7242affb', + 'datarepo_RP_3053', + ), + ( + 'beef77e8-575b-40e5-9340-a6f10e0bec67', + 'RP_3056', + 'datarepo-5a72e31b', + 'datarepo_RP_3056', + ), + ( + 'c8d74ac4-9a2e-4d3d-a6b5-1f4b433d949f', + 'RP_3059', + 'datarepo-aada2e3b', + 'datarepo_RP_3059', + ), + ]: + responses.get( + os.path.join( + TDR_ROOT_URL, + f'datasets/{dataset_id}?include=ACCESS_INFORMATION', + ), + body=json.dumps( + { + 'id': dataset_id, + 'name': name, + 'description': 'TGG_Ware_DRAGEN_hg38. 
Dataset automatically created and linked to: RP-3053', + 'defaultProfileId': None, + 'dataProject': None, + 'defaultSnapshotId': None, + 'schema': None, + 'createdDate': '2023-10-05T13:15:27.649760Z', + 'storage': None, + 'secureMonitoringEnabled': False, + 'phsId': None, + 'accessInformation': { + 'bigQuery': { + 'datasetName': dataset_name, + 'datasetId': f'{project_name}:{dataset_name}', + 'projectId': project_name, + 'link': 'https://console.cloud.google.com/bigquery?project=datarepo-7242affb&ws=!datarepo_RP_3053&d=datarepo_RP_3053&p=datarepo-7242affb&page=dataset', + 'tables': [ + { + 'name': 'jointcallset', + 'id': f'{project_name}.{dataset_name}.jointcallset', + 'qualifiedName': 'datarepo-7242affb.datarepo_RP_3053.jointcallset', + 'link': 'https://console.cloud.google.com/bigquery?project=datarepo-7242affb&ws=!datarepo_RP_3053&d=datarepo_RP_3053&p=datarepo-7242affb&page=table&t=jointcallset', + 'sampleQuery': 'SELECT * FROM `datarepo-7242affb.datarepo_RP_3053.jointcallset`', + }, + { + 'name': 'sample', + 'id': f'{project_name}.{dataset_name}.sample', + 'qualifiedName': 'datarepo-7242affb.datarepo_RP_3053.sample', + 'link': 'https://console.cloud.google.com/bigquery?project=datarepo-7242affb&ws=!datarepo_RP_3053&d=datarepo_RP_3053&p=datarepo-7242affb&page=table&t=sample', + 'sampleQuery': 'SELECT * FROM `datarepo-7242affb.datarepo_RP_3053.sample`', + }, + ], + }, + 'parquet': None, + }, + 'cloudPlatform': None, + 'selfHosted': False, + 'properties': None, + 'ingestServiceAccount': None, + 'predictableFileIds': False, + 'tags': [], + 'resourceLocks': { + 'exclusive': None, + 'shared': [], + }, + }, + ), + ) + self.assertCountEqual( + list(gen_bq_table_names()), + [ + 'datarepo-7242affb.datarepo_RP_3053', + 'datarepo-5a72e31b.datarepo_RP_3056', + 'datarepo-aada2e3b.datarepo_RP_3059', + ], + ) diff --git a/v03_pipeline/lib/model/dataset_type.py b/v03_pipeline/lib/model/dataset_type.py index 281cffd9b..e7e8a28d3 100644 --- a/v03_pipeline/lib/model/dataset_type.py +++ b/v03_pipeline/lib/model/dataset_type.py @@ -186,6 +186,14 @@ def expect_filters( ) -> bool: return self == DatasetType.SNV_INDEL and sample_type == SampleType.WES + def expect_tdr_metrics( + self, + reference_genome: ReferenceGenome, + ) -> bool: + return ( + self == DatasetType.SNV_INDEL and reference_genome == ReferenceGenome.GRCh38 + ) + @property def has_gencode_gene_symbol_to_gene_id_mapping(self) -> bool: return self == DatasetType.SV diff --git a/v03_pipeline/lib/model/environment.py b/v03_pipeline/lib/model/environment.py index d95d7a27c..e307b0226 100644 --- a/v03_pipeline/lib/model/environment.py +++ b/v03_pipeline/lib/model/environment.py @@ -56,6 +56,7 @@ os.environ.get('ACCESS_PRIVATE_REFERENCE_DATASETS') == '1' ) CHECK_SEX_AND_RELATEDNESS = os.environ.get('CHECK_SEX_AND_RELATEDNESS') == '1' +EXPECT_TDR_METRICS = os.environ.get('EXPECT_TDR_METRICS') == '1' EXPECT_WES_FILTERS = os.environ.get('EXPECT_WES_FILTERS') == '1' INCLUDE_PIPELINE_VERSION_IN_PREFIX = ( os.environ.get('INCLUDE_PIPELINE_VERSION_IN_PREFIX') == '1' @@ -72,6 +73,7 @@ class Env: CLINGEN_ALLELE_REGISTRY_LOGIN: str | None = CLINGEN_ALLELE_REGISTRY_LOGIN CLINGEN_ALLELE_REGISTRY_PASSWORD: str | None = CLINGEN_ALLELE_REGISTRY_PASSWORD DEPLOYMENT_TYPE: Literal['dev', 'prod'] = DEPLOYMENT_TYPE + EXPECT_TDR_METRICS: bool = EXPECT_TDR_METRICS EXPECT_WES_FILTERS: bool = EXPECT_WES_FILTERS GCLOUD_DATAPROC_SECONDARY_WORKERS: str = GCLOUD_DATAPROC_SECONDARY_WORKERS GCLOUD_PROJECT: str | None = GCLOUD_PROJECT diff --git a/v03_pipeline/lib/paths.py 
b/v03_pipeline/lib/paths.py index 68d27bba2..5a54880fd 100644 --- a/v03_pipeline/lib/paths.py +++ b/v03_pipeline/lib/paths.py @@ -98,10 +98,9 @@ def family_table_path( ) -def imputed_sex_path( +def tdr_metrics_dir( reference_genome: ReferenceGenome, dataset_type: DatasetType, - callset_path: str, ) -> str: return os.path.join( _pipeline_prefix( @@ -109,8 +108,18 @@ def imputed_sex_path( reference_genome, dataset_type, ), - 'imputed_sex', - f'{hashlib.sha256(callset_path.encode("utf8")).hexdigest()}.tsv', + 'tdr_metrics', + ) + + +def tdr_metrics_path( + reference_genome: ReferenceGenome, + dataset_type: DatasetType, + bq_table_name: str, +) -> str: + return os.path.join( + tdr_metrics_dir(reference_genome, dataset_type), + f'{bq_table_name}.tsv', ) diff --git a/v03_pipeline/lib/paths_test.py b/v03_pipeline/lib/paths_test.py index f49e62b61..8c5f097f4 100644 --- a/v03_pipeline/lib/paths_test.py +++ b/v03_pipeline/lib/paths_test.py @@ -9,7 +9,6 @@ from v03_pipeline.lib.paths import ( family_table_path, imported_callset_path, - imputed_sex_path, lookup_table_path, metadata_for_run_path, new_variants_table_path, @@ -19,6 +18,7 @@ relatedness_check_table_path, remapped_and_subsetted_callset_path, sex_check_table_path, + tdr_metrics_path, valid_filters_path, validation_errors_for_run_path, variant_annotations_table_path, @@ -177,14 +177,14 @@ def test_imported_callset_path(self) -> None: '/var/seqr/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/imported_callsets/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.mt', ) - def test_imputed_sex_path(self) -> None: + def test_tdr_metrics_path(self) -> None: self.assertEqual( - imputed_sex_path( + tdr_metrics_path( ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, - 'gs://abc.efg/callset.vcf.gz', + 'datarepo-7242affb.datarepo_RP_3053', ), - '/var/seqr/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/imputed_sex/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.tsv', + '/var/seqr/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/tdr_metrics/datarepo-7242affb.datarepo_RP_3053.tsv', ) def test_new_variants_table_path(self) -> None: diff --git a/v03_pipeline/lib/tasks/base/base_loading_run_params.py b/v03_pipeline/lib/tasks/base/base_loading_run_params.py index 7c79b00d6..48a70ec6f 100644 --- a/v03_pipeline/lib/tasks/base/base_loading_run_params.py +++ b/v03_pipeline/lib/tasks/base/base_loading_run_params.py @@ -34,6 +34,10 @@ class BaseLoadingRunParams(luigi.Task): default=False, parsing=luigi.BoolParameter.EXPLICIT_PARSING, ) + skip_expect_tdr_metrics = luigi.BoolParameter( + default=False, + parsing=luigi.BoolParameter.EXPLICIT_PARSING, + ) skip_validation = luigi.BoolParameter( default=False, parsing=luigi.BoolParameter.EXPLICIT_PARSING, diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index 90bba9ebf..9caa71e58 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -22,6 +22,7 @@ from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget +from v03_pipeline.lib.tasks.write_tdr_metrics_files import WriteTDRMetricsFilesTask from v03_pipeline.lib.tasks.write_validation_errors_for_run import ( WriteValidationErrorsForRunTask, ) @@ -60,6 +61,17 @@ def requires(self) -> list[luigi.Task]: ), ), ] + if ( + Env.EXPECT_TDR_METRICS + and not self.skip_expect_tdr_metrics + and 
self.dataset_type.expect_tdr_metrics( + self.reference_genome, + ) + ): + requirements = [ + *requirements, + self.clone(WriteTDRMetricsFilesTask), + ] return [ *requirements, CallsetTask(self.callset_path), diff --git a/v03_pipeline/lib/tasks/write_sex_check_table.py b/v03_pipeline/lib/tasks/write_sex_check_table.py index b8b4bb9e7..f87233507 100644 --- a/v03_pipeline/lib/tasks/write_sex_check_table.py +++ b/v03_pipeline/lib/tasks/write_sex_check_table.py @@ -1,10 +1,12 @@ import hail as hl +import hailtop.fs as hfs import luigi from v03_pipeline.lib.misc.io import import_imputed_sex -from v03_pipeline.lib.paths import imputed_sex_path, sex_check_table_path +from v03_pipeline.lib.paths import sex_check_table_path, tdr_metrics_dir from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask -from v03_pipeline.lib.tasks.files import GCSorLocalTarget, RawFileTask +from v03_pipeline.lib.tasks.files import GCSorLocalTarget +from v03_pipeline.lib.tasks.write_tdr_metrics_files import WriteTDRMetricsFilesTask class WriteSexCheckTableTask(BaseWriteTask): @@ -20,13 +22,15 @@ def output(self) -> luigi.Target: ) def requires(self) -> luigi.Task: - return RawFileTask( - imputed_sex_path( - self.reference_genome, - self.dataset_type, - self.callset_path, - ), - ) + return self.clone(WriteTDRMetricsFilesTask) def create_table(self) -> hl.Table: - return import_imputed_sex(self.input().path) + ht = None + for tdr_metrics_file in hfs.ls( + tdr_metrics_dir(self.reference_genome, self.dataset_type), + ): + if not ht: + ht = import_imputed_sex(tdr_metrics_file.path) + continue + ht = ht.union(import_imputed_sex(tdr_metrics_file.path)) + return ht diff --git a/v03_pipeline/lib/tasks/write_sex_check_table_test.py b/v03_pipeline/lib/tasks/write_sex_check_table_test.py new file mode 100644 index 000000000..2c935f44c --- /dev/null +++ b/v03_pipeline/lib/tasks/write_sex_check_table_test.py @@ -0,0 +1,96 @@ +from unittest.mock import Mock, patch + +import google.cloud.bigquery +import hail as hl +import luigi.worker + +from v03_pipeline.lib.model import DatasetType, ReferenceGenome +from v03_pipeline.lib.paths import sex_check_table_path, tdr_metrics_path +from v03_pipeline.lib.tasks.write_sex_check_table import ( + WriteSexCheckTableTask, +) +from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase + + +class WriteSexCheckTableTaskTest(MockedDatarootTestCase): + @patch('v03_pipeline.lib.tasks.write_tdr_metrics_files.gen_bq_table_names') + @patch('v03_pipeline.lib.tasks.write_tdr_metrics_file.bq_metrics_query') + def test_snv_sex_check_table_task( + self, + mock_bq_metrics_query: Mock, + mock_gen_bq_table_names: Mock, + ) -> None: + mock_gen_bq_table_names.return_value = [ + 'datarepo-7242affb.datarepo_RP_3053', + 'datarepo-5a72e31b.datarepo_RP_3056', + ] + mock_bq_metrics_query.side_effect = [ + iter( + [ + google.cloud.bigquery.table.Row( + ('SM-NJ8MF', 'Unknown'), + {'collaborator_sample_id': 0, 'predicted_sex': 1}, + ), + google.cloud.bigquery.table.Row( + ('SM-MWOGC', 'Female'), + {'collaborator_sample_id': 0, 'predicted_sex': 1}, + ), + google.cloud.bigquery.table.Row( + ('SM-MWKWL', 'Male'), + {'collaborator_sample_id': 0, 'predicted_sex': 1}, + ), + ], + ), + iter( + [ + google.cloud.bigquery.table.Row( + ('SM-NGE65', 'Male'), + {'collaborator_sample_id': 0, 'predicted_sex': 1}, + ), + google.cloud.bigquery.table.Row( + ('SM-NGE5G', 'Male'), + {'collaborator_sample_id': 0, 'predicted_sex': 1}, + ), + google.cloud.bigquery.table.Row( + ('SM-NC6LM', 'Male'), + 
{'collaborator_sample_id': 0, 'predicted_sex': 1}, + ), + ], + ), + ] + worker = luigi.worker.Worker() + write_sex_check_table = WriteSexCheckTableTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + callset_path='na', + ) + worker.add(write_sex_check_table) + worker.run() + self.assertTrue(write_sex_check_table.complete()) + sex_check_ht = hl.read_table( + sex_check_table_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + 'na', + ), + ) + self.assertEqual( + sex_check_ht.collect(), + [ + hl.Struct(s='SM-MWKWL', predicted_sex='M'), + hl.Struct(s='SM-MWOGC', predicted_sex='F'), + hl.Struct(s='SM-NC6LM', predicted_sex='M'), + hl.Struct(s='SM-NGE5G', predicted_sex='M'), + hl.Struct(s='SM-NGE65', predicted_sex='M'), + hl.Struct(s='SM-NJ8MF', predicted_sex='U'), + ], + ) + # Check underlying tdr metrics file. + with open( + tdr_metrics_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + 'datarepo-5a72e31b.datarepo_RP_3056', + ), + ) as f: + self.assertTrue('collaborator_sample_id' in f.read()) diff --git a/v03_pipeline/lib/tasks/write_tdr_metrics_file.py b/v03_pipeline/lib/tasks/write_tdr_metrics_file.py new file mode 100644 index 000000000..b1c7ad9c0 --- /dev/null +++ b/v03_pipeline/lib/tasks/write_tdr_metrics_file.py @@ -0,0 +1,35 @@ +import csv + +import luigi +import luigi.util + +from v03_pipeline.lib.misc.terra_data_repository import ( + BIGQUERY_METRICS, + bq_metrics_query, +) +from v03_pipeline.lib.paths import tdr_metrics_path +from v03_pipeline.lib.tasks.base.base_loading_pipeline_params import ( + BaseLoadingPipelineParams, +) +from v03_pipeline.lib.tasks.files import GCSorLocalTarget + + +@luigi.util.inherits(BaseLoadingPipelineParams) +class WriteTDRMetricsFileTask(luigi.Task): + bq_table_name = luigi.Parameter() + + def output(self) -> luigi.Target: + return GCSorLocalTarget( + tdr_metrics_path( + self.reference_genome, + self.dataset_type, + self.bq_table_name, + ), + ) + + def run(self): + with self.output().open('w') as f: + writer = csv.DictWriter(f, fieldnames=BIGQUERY_METRICS, delimiter='\t') + writer.writeheader() + for row in bq_metrics_query(self.bq_table_name): + writer.writerow(row) diff --git a/v03_pipeline/lib/tasks/write_tdr_metrics_files.py b/v03_pipeline/lib/tasks/write_tdr_metrics_files.py new file mode 100644 index 000000000..c2f0d773e --- /dev/null +++ b/v03_pipeline/lib/tasks/write_tdr_metrics_files.py @@ -0,0 +1,28 @@ +import luigi +import luigi.util + +from v03_pipeline.lib.misc.terra_data_repository import gen_bq_table_names +from v03_pipeline.lib.tasks.base.base_loading_pipeline_params import ( + BaseLoadingPipelineParams, +) +from v03_pipeline.lib.tasks.write_tdr_metrics_file import WriteTDRMetricsFileTask + + +@luigi.util.inherits(BaseLoadingPipelineParams) +class WriteTDRMetricsFilesTask(luigi.Task): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.dynamic_write_tdr_metrics_file_task = set() + + def complete(self) -> bool: + return len(self.dynamic_write_tdr_metrics_file_task) >= 1 and all( + write_tdr_metrics_file_task.complete() + for write_tdr_metrics_file_task in self.dynamic_write_tdr_metrics_file_task + ) + + def run(self): + for bq_table_name in gen_bq_table_names(): + self.dynamic_write_tdr_metrics_file_task.add( + self.clone(WriteTDRMetricsFileTask, bq_table_name=bq_table_name), + ) + yield self.dynamic_write_tdr_metrics_file_task From 0eef8976a97d36570922fbf146533e2e77a2ef60 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 13 Dec 2024 17:20:58 
-0500 Subject: [PATCH 04/10] refactor: Move feature flags to FeatureFlag enum. (#1002) * refactor: Move feature flags out of environment to their own dataclass * lint: ruff * ruff --- v03_pipeline/bin/pipeline_worker.py | 4 +-- v03_pipeline/lib/model/__init__.py | 2 ++ v03_pipeline/lib/model/environment.py | 20 -------------- v03_pipeline/lib/model/feature_flag.py | 26 +++++++++++++++++++ v03_pipeline/lib/paths.py | 9 ++++--- v03_pipeline/lib/paths_test.py | 10 ++++--- .../reference_datasets/reference_dataset.py | 9 +++++-- .../tasks/dataproc/create_dataproc_cluster.py | 13 ++++++---- ...annotations_table_with_new_samples_test.py | 6 ++--- v03_pipeline/lib/tasks/validate_callset.py | 6 ++--- .../lib/tasks/write_imported_callset.py | 8 +++--- .../write_remapped_and_subsetted_callset.py | 6 ++--- ...ite_remapped_and_subsetted_callset_test.py | 6 ++--- 13 files changed, 72 insertions(+), 53 deletions(-) create mode 100644 v03_pipeline/lib/model/feature_flag.py diff --git a/v03_pipeline/bin/pipeline_worker.py b/v03_pipeline/bin/pipeline_worker.py index 7586fe8f6..7be97a9b0 100755 --- a/v03_pipeline/bin/pipeline_worker.py +++ b/v03_pipeline/bin/pipeline_worker.py @@ -7,7 +7,7 @@ from v03_pipeline.api.model import LoadingPipelineRequest from v03_pipeline.lib.logger import get_logger -from v03_pipeline.lib.model import Env +from v03_pipeline.lib.model import FeatureFlag from v03_pipeline.lib.paths import ( loading_pipeline_queue_path, project_pedigree_path, @@ -54,7 +54,7 @@ def main(): 'run_id': run_id, **{k: v for k, v in lpr.model_dump().items() if k != 'projects_to_run'}, } - if Env.SHOULD_TRIGGER_HAIL_BACKEND_RELOAD: + if FeatureFlag.SHOULD_TRIGGER_HAIL_BACKEND_RELOAD: tasks = [ TriggerHailBackendReload(**loading_run_task_params), ] diff --git a/v03_pipeline/lib/model/__init__.py b/v03_pipeline/lib/model/__init__.py index bb4325d47..45847f4f5 100644 --- a/v03_pipeline/lib/model/__init__.py +++ b/v03_pipeline/lib/model/__init__.py @@ -7,11 +7,13 @@ Sex, ) from v03_pipeline.lib.model.environment import Env +from v03_pipeline.lib.model.feature_flag import FeatureFlag __all__ = [ 'AccessControl', 'DatasetType', 'Env', + 'FeatureFlag', 'Sex', 'PipelineVersion', 'ReferenceGenome', diff --git a/v03_pipeline/lib/model/environment.py b/v03_pipeline/lib/model/environment.py index e307b0226..42ae4c658 100644 --- a/v03_pipeline/lib/model/environment.py +++ b/v03_pipeline/lib/model/environment.py @@ -51,30 +51,12 @@ GCLOUD_REGION = os.environ.get('GCLOUD_REGION') PIPELINE_RUNNER_APP_VERSION = os.environ.get('PIPELINE_RUNNER_APP_VERSION', 'latest') -# Feature Flags -ACCESS_PRIVATE_REFERENCE_DATASETS = ( - os.environ.get('ACCESS_PRIVATE_REFERENCE_DATASETS') == '1' -) -CHECK_SEX_AND_RELATEDNESS = os.environ.get('CHECK_SEX_AND_RELATEDNESS') == '1' -EXPECT_TDR_METRICS = os.environ.get('EXPECT_TDR_METRICS') == '1' -EXPECT_WES_FILTERS = os.environ.get('EXPECT_WES_FILTERS') == '1' -INCLUDE_PIPELINE_VERSION_IN_PREFIX = ( - os.environ.get('INCLUDE_PIPELINE_VERSION_IN_PREFIX') == '1' -) -SHOULD_TRIGGER_HAIL_BACKEND_RELOAD = ( - os.environ.get('SHOULD_TRIGGER_HAIL_BACKEND_RELOAD') == '1' -) - @dataclass class Env: - ACCESS_PRIVATE_REFERENCE_DATASETS: bool = ACCESS_PRIVATE_REFERENCE_DATASETS - CHECK_SEX_AND_RELATEDNESS: bool = CHECK_SEX_AND_RELATEDNESS CLINGEN_ALLELE_REGISTRY_LOGIN: str | None = CLINGEN_ALLELE_REGISTRY_LOGIN CLINGEN_ALLELE_REGISTRY_PASSWORD: str | None = CLINGEN_ALLELE_REGISTRY_PASSWORD DEPLOYMENT_TYPE: Literal['dev', 'prod'] = DEPLOYMENT_TYPE - EXPECT_TDR_METRICS: bool = EXPECT_TDR_METRICS - 
EXPECT_WES_FILTERS: bool = EXPECT_WES_FILTERS GCLOUD_DATAPROC_SECONDARY_WORKERS: str = GCLOUD_DATAPROC_SECONDARY_WORKERS GCLOUD_PROJECT: str | None = GCLOUD_PROJECT GCLOUD_ZONE: str | None = GCLOUD_ZONE @@ -85,10 +67,8 @@ class Env: HAIL_BACKEND_SERVICE_PORT: int = HAIL_BACKEND_SERVICE_PORT HAIL_TMP_DIR: str = HAIL_TMP_DIR HAIL_SEARCH_DATA_DIR: str = HAIL_SEARCH_DATA_DIR - INCLUDE_PIPELINE_VERSION_IN_PREFIX: bool = INCLUDE_PIPELINE_VERSION_IN_PREFIX LOADING_DATASETS_DIR: str = LOADING_DATASETS_DIR PIPELINE_RUNNER_APP_VERSION: str = PIPELINE_RUNNER_APP_VERSION PRIVATE_REFERENCE_DATASETS_DIR: str = PRIVATE_REFERENCE_DATASETS_DIR REFERENCE_DATASETS_DIR: str = REFERENCE_DATASETS_DIR - SHOULD_TRIGGER_HAIL_BACKEND_RELOAD: bool = SHOULD_TRIGGER_HAIL_BACKEND_RELOAD VEP_REFERENCE_DATASETS_DIR: str = VEP_REFERENCE_DATASETS_DIR diff --git a/v03_pipeline/lib/model/feature_flag.py b/v03_pipeline/lib/model/feature_flag.py new file mode 100644 index 000000000..0b57e8643 --- /dev/null +++ b/v03_pipeline/lib/model/feature_flag.py @@ -0,0 +1,26 @@ +import os +from dataclasses import dataclass + +# Feature Flags +ACCESS_PRIVATE_REFERENCE_DATASETS = ( + os.environ.get('ACCESS_PRIVATE_REFERENCE_DATASETS') == '1' +) +CHECK_SEX_AND_RELATEDNESS = os.environ.get('CHECK_SEX_AND_RELATEDNESS') == '1' +EXPECT_TDR_METRICS = os.environ.get('EXPECT_TDR_METRICS') == '1' +EXPECT_WES_FILTERS = os.environ.get('EXPECT_WES_FILTERS') == '1' +INCLUDE_PIPELINE_VERSION_IN_PREFIX = ( + os.environ.get('INCLUDE_PIPELINE_VERSION_IN_PREFIX') == '1' +) +SHOULD_TRIGGER_HAIL_BACKEND_RELOAD = ( + os.environ.get('SHOULD_TRIGGER_HAIL_BACKEND_RELOAD') == '1' +) + + +@dataclass +class FeatureFlag: + ACCESS_PRIVATE_REFERENCE_DATASETS: bool = ACCESS_PRIVATE_REFERENCE_DATASETS + CHECK_SEX_AND_RELATEDNESS: bool = CHECK_SEX_AND_RELATEDNESS + EXPECT_TDR_METRICS: bool = EXPECT_TDR_METRICS + EXPECT_WES_FILTERS: bool = EXPECT_WES_FILTERS + INCLUDE_PIPELINE_VERSION_IN_PREFIX: bool = INCLUDE_PIPELINE_VERSION_IN_PREFIX + SHOULD_TRIGGER_HAIL_BACKEND_RELOAD: bool = SHOULD_TRIGGER_HAIL_BACKEND_RELOAD diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py index 5a54880fd..2942b1dae 100644 --- a/v03_pipeline/lib/paths.py +++ b/v03_pipeline/lib/paths.py @@ -6,6 +6,7 @@ AccessControl, DatasetType, Env, + FeatureFlag, PipelineVersion, ReferenceGenome, SampleType, @@ -21,7 +22,7 @@ def _pipeline_prefix( reference_genome: ReferenceGenome, dataset_type: DatasetType, ) -> str: - if Env.INCLUDE_PIPELINE_VERSION_IN_PREFIX: + if FeatureFlag.INCLUDE_PIPELINE_VERSION_IN_PREFIX: return os.path.join( root, PipelineVersion.V3_1.value, @@ -45,7 +46,7 @@ def _v03_reference_data_prefix( if access_control == AccessControl.PRIVATE else Env.REFERENCE_DATASETS_DIR ) - if Env.INCLUDE_PIPELINE_VERSION_IN_PREFIX: + if FeatureFlag.INCLUDE_PIPELINE_VERSION_IN_PREFIX: return os.path.join( root, PipelineVersion.V03.value, @@ -68,7 +69,7 @@ def _v03_reference_dataset_prefix( if access_control == AccessControl.PRIVATE else Env.REFERENCE_DATASETS_DIR ) - if Env.INCLUDE_PIPELINE_VERSION_IN_PREFIX: + if FeatureFlag.INCLUDE_PIPELINE_VERSION_IN_PREFIX: return os.path.join( root, PipelineVersion.V3_1.value, @@ -287,7 +288,7 @@ def valid_filters_path( callset_path: str, ) -> str | None: if ( - not Env.EXPECT_WES_FILTERS + not FeatureFlag.EXPECT_WES_FILTERS or not dataset_type.expect_filters(sample_type) or 'part_one_outputs' not in callset_path ): diff --git a/v03_pipeline/lib/paths_test.py b/v03_pipeline/lib/paths_test.py index 8c5f097f4..eac76232f 100644 --- 
a/v03_pipeline/lib/paths_test.py +++ b/v03_pipeline/lib/paths_test.py @@ -36,7 +36,9 @@ def test_family_table_path(self) -> None: ), '/var/seqr/seqr-hail-search-data/v3.1/GRCh37/SNV_INDEL/families/WES/franklin.ht', ) - with patch('v03_pipeline.lib.paths.Env') as mock_env: + with patch('v03_pipeline.lib.paths.Env') as mock_env, patch( + 'v03_pipeline.lib.paths.FeatureFlag', + ) as mock_ff: mock_env.HAIL_SEARCH_DATA_DIR = 'gs://seqr-datasets/' self.assertEqual( family_table_path( @@ -47,7 +49,7 @@ def test_family_table_path(self) -> None: ), 'gs://seqr-datasets/v3.1/GRCh37/SNV_INDEL/families/WES/franklin.ht', ) - mock_env.INCLUDE_PIPELINE_VERSION_IN_PREFIX = False + mock_ff.INCLUDE_PIPELINE_VERSION_IN_PREFIX = False self.assertEqual( family_table_path( ReferenceGenome.GRCh37, @@ -67,8 +69,8 @@ def test_valid_filters_path(self) -> None: ), None, ) - with patch('v03_pipeline.lib.paths.Env') as mock_env: - mock_env.EXPECT_WES_FILTERS = True + with patch('v03_pipeline.lib.paths.FeatureFlag') as mock_ff: + mock_ff.EXPECT_WES_FILTERS = True self.assertEqual( valid_filters_path( DatasetType.SNV_INDEL, diff --git a/v03_pipeline/lib/reference_datasets/reference_dataset.py b/v03_pipeline/lib/reference_datasets/reference_dataset.py index 47480e723..445e191aa 100644 --- a/v03_pipeline/lib/reference_datasets/reference_dataset.py +++ b/v03_pipeline/lib/reference_datasets/reference_dataset.py @@ -10,7 +10,12 @@ validate_allele_type, validate_no_duplicate_variants, ) -from v03_pipeline.lib.model import AccessControl, DatasetType, Env, ReferenceGenome +from v03_pipeline.lib.model import ( + AccessControl, + DatasetType, + FeatureFlag, + ReferenceGenome, +) from v03_pipeline.lib.reference_datasets import clinvar, dbnsfp from v03_pipeline.lib.reference_datasets.misc import ( compress_floats, @@ -41,7 +46,7 @@ def for_reference_genome_dataset_type( for dataset, config in CONFIG.items() if dataset_type in config.get(reference_genome, {}).get(DATASET_TYPES, []) ] - if not Env.ACCESS_PRIVATE_REFERENCE_DATASETS: + if not FeatureFlag.ACCESS_PRIVATE_REFERENCE_DATASETS: return { dataset for dataset in reference_datasets diff --git a/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster.py b/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster.py index f7d0bc21d..27693c5d3 100644 --- a/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster.py +++ b/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster.py @@ -7,7 +7,7 @@ from v03_pipeline.lib.logger import get_logger from v03_pipeline.lib.misc.gcp import get_service_account_credentials -from v03_pipeline.lib.model import Env, ReferenceGenome +from v03_pipeline.lib.model import Env, FeatureFlag, ReferenceGenome from v03_pipeline.lib.tasks.base.base_loading_pipeline_params import ( BaseLoadingPipelineParams, ) @@ -88,18 +88,21 @@ def get_cluster_config(reference_genome: ReferenceGenome, run_id: str): 'spark:spark.executorEnv.HAIL_WORKER_OFF_HEAP_MEMORY_PER_CORE_MB': '6323', 'spark:spark.speculation': 'true', 'spark-env:ACCESS_PRIVATE_REFERENCE_DATASETS': '1' - if Env.ACCESS_PRIVATE_REFERENCE_DATASETS + if FeatureFlag.ACCESS_PRIVATE_REFERENCE_DATASETS else '0', 'spark-env:CHECK_SEX_AND_RELATEDNESS': '1' - if Env.CHECK_SEX_AND_RELATEDNESS + if FeatureFlag.CHECK_SEX_AND_RELATEDNESS + else '0', + 'spark-env:EXPECT_TDR_METRICS': '1' + if FeatureFlag.EXPECT_TDR_METRICS else '0', 'spark-env:EXPECT_WES_FILTERS': '1' - if Env.EXPECT_WES_FILTERS + if FeatureFlag.EXPECT_WES_FILTERS else '0', 'spark-env:HAIL_SEARCH_DATA_DIR': Env.HAIL_SEARCH_DATA_DIR, 
'spark-env:HAIL_TMP_DIR': Env.HAIL_TMP_DIR, 'spark-env:INCLUDE_PIPELINE_VERSION_IN_PREFIX': '1' - if Env.INCLUDE_PIPELINE_VERSION_IN_PREFIX + if FeatureFlag.INCLUDE_PIPELINE_VERSION_IN_PREFIX else '0', 'spark-env:LOADING_DATASETS_DIR': Env.LOADING_DATASETS_DIR, 'spark-env:PRIVATE_REFERENCE_DATASETS_DIR': Env.PRIVATE_REFERENCE_DATASETS_DIR, diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index d29b58ff6..47511c695 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -712,7 +712,7 @@ def test_update_vat_grch37( @patch( 'v03_pipeline.lib.tasks.write_new_variants_table.UpdateVariantAnnotationsTableWithUpdatedReferenceDataset', ) - @patch('v03_pipeline.lib.reference_datasets.reference_dataset.Env') + @patch('v03_pipeline.lib.reference_datasets.reference_dataset.FeatureFlag') @patch('v03_pipeline.lib.vep.hl.vep') @patch( 'v03_pipeline.lib.tasks.write_new_variants_table.load_gencode_ensembl_to_refseq_id', @@ -721,7 +721,7 @@ def test_update_vat_without_accessing_private_datasets( self, mock_load_gencode_ensembl_to_refseq_id: Mock, mock_vep: Mock, - mock_rd_env: Mock, + mock_rd_ff: Mock, mock_update_vat_with_rd_task: Mock, mock_register_alleles: Mock, ) -> None: @@ -740,7 +740,7 @@ def test_update_vat_without_accessing_private_datasets( ReferenceDataset.hgmd, ), ) - mock_rd_env.ACCESS_PRIVATE_REFERENCE_DATASETS = False + mock_rd_ff.ACCESS_PRIVATE_REFERENCE_DATASETS = False mock_vep.side_effect = lambda ht, **_: ht.annotate(vep=MOCK_38_VEP_DATA) mock_register_alleles.side_effect = None diff --git a/v03_pipeline/lib/tasks/validate_callset.py b/v03_pipeline/lib/tasks/validate_callset.py index e5601875b..3d0d979db 100644 --- a/v03_pipeline/lib/tasks/validate_callset.py +++ b/v03_pipeline/lib/tasks/validate_callset.py @@ -10,7 +10,7 @@ validate_no_duplicate_variants, validate_sample_type, ) -from v03_pipeline.lib.model.environment import Env +from v03_pipeline.lib.model.feature_flag import FeatureFlag from v03_pipeline.lib.paths import ( imported_callset_path, sex_check_table_path, @@ -41,7 +41,7 @@ def get_validation_dependencies(self) -> dict[str, hl.Table]: ), ) if ( - Env.CHECK_SEX_AND_RELATEDNESS + FeatureFlag.CHECK_SEX_AND_RELATEDNESS and self.dataset_type.check_sex_and_relatedness and not self.skip_check_sex_and_relatedness ): @@ -86,7 +86,7 @@ def requires(self) -> list[luigi.Task]: ), ] if ( - Env.CHECK_SEX_AND_RELATEDNESS + FeatureFlag.CHECK_SEX_AND_RELATEDNESS and self.dataset_type.check_sex_and_relatedness and not self.skip_check_sex_and_relatedness ): diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index 9caa71e58..cc827df52 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -14,7 +14,7 @@ validate_imported_field_types, ) from v03_pipeline.lib.misc.vets import annotate_vets -from v03_pipeline.lib.model.environment import Env +from v03_pipeline.lib.model.feature_flag import FeatureFlag from v03_pipeline.lib.paths import ( imported_callset_path, valid_filters_path, @@ -45,7 +45,7 @@ def output(self) -> luigi.Target: def requires(self) -> list[luigi.Task]: requirements = [] if ( - Env.EXPECT_WES_FILTERS + FeatureFlag.EXPECT_WES_FILTERS and not self.skip_expect_filters and self.dataset_type.expect_filters( 
self.sample_type, @@ -62,7 +62,7 @@ def requires(self) -> list[luigi.Task]: ), ] if ( - Env.EXPECT_TDR_METRICS + FeatureFlag.EXPECT_TDR_METRICS and not self.skip_expect_tdr_metrics and self.dataset_type.expect_tdr_metrics( self.reference_genome, @@ -87,7 +87,7 @@ def create_table(self) -> hl.MatrixTable: ) filters_path = None if ( - Env.EXPECT_WES_FILTERS + FeatureFlag.EXPECT_WES_FILTERS and not self.skip_expect_filters and self.dataset_type.expect_filters( self.sample_type, diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py index f4c934662..41b4c148e 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py @@ -16,7 +16,7 @@ ) from v03_pipeline.lib.misc.pedigree import parse_pedigree_ht_to_families from v03_pipeline.lib.misc.sample_ids import remap_sample_ids, subset_samples -from v03_pipeline.lib.model.environment import Env +from v03_pipeline.lib.model.feature_flag import FeatureFlag from v03_pipeline.lib.paths import ( relatedness_check_table_path, remapped_and_subsetted_callset_path, @@ -62,7 +62,7 @@ def requires(self) -> list[luigi.Task]: RawFileTask(self.project_pedigree_paths[self.project_i]), ] if ( - Env.CHECK_SEX_AND_RELATEDNESS + FeatureFlag.CHECK_SEX_AND_RELATEDNESS and self.dataset_type.check_sex_and_relatedness and not self.skip_check_sex_and_relatedness ): @@ -98,7 +98,7 @@ def create_table(self) -> hl.MatrixTable: families_failed_relatedness_check = {} families_failed_sex_check = {} if ( - Env.CHECK_SEX_AND_RELATEDNESS + FeatureFlag.CHECK_SEX_AND_RELATEDNESS and self.dataset_type.check_sex_and_relatedness and not self.skip_check_sex_and_relatedness ): diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py index 4a0c84660..5ac04c278 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py @@ -115,12 +115,12 @@ def test_write_remapped_and_subsetted_callset_task( ], ) - @patch('v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset.Env') + @patch('v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset.FeatureFlag') def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family( self, - mock_env: Mock, + mock_ff: Mock, ) -> None: - mock_env.CHECK_SEX_AND_RELATEDNESS = True + mock_ff.CHECK_SEX_AND_RELATEDNESS = True worker = luigi.worker.Worker() wrsc_task = WriteRemappedAndSubsettedCallsetTask( reference_genome=ReferenceGenome.GRCh38, From 2e8dbcfef31ee03a43e157d915287f6a7cf7349e Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 18 Dec 2024 14:30:22 -0800 Subject: [PATCH 05/10] bugfix: exclude samples from relationship checking that are not present in the expected loadable samples (#1003) * bugfix: exclude samples from relationship checking that are not present in the expected loadable samples * cleanup --- .../lib/misc/family_loading_failures.py | 115 +++++++----------- .../lib/misc/family_loading_failures_test.py | 50 +++++++- v03_pipeline/lib/misc/pedigree.py | 8 +- 3 files changed, 97 insertions(+), 76 deletions(-) diff --git a/v03_pipeline/lib/misc/family_loading_failures.py b/v03_pipeline/lib/misc/family_loading_failures.py index e3b1b59db..438b1706e 100644 --- a/v03_pipeline/lib/misc/family_loading_failures.py +++ b/v03_pipeline/lib/misc/family_loading_failures.py @@ 
-16,7 +16,8 @@ def passes_relatedness_check( relatedness_check_lookup: dict[tuple[str, str], list], sample_id: str, other_id: str, - relation: Relation, + expected_relation: Relation, + additional_allowed_relation: Relation | None, ) -> tuple[bool, str | None]: # No relationship to check, return true if other_id is None: @@ -24,85 +25,62 @@ def passes_relatedness_check( coefficients = relatedness_check_lookup.get( (min(sample_id, other_id), max(sample_id, other_id)), ) - if not coefficients or not np.allclose( - coefficients, - relation.coefficients, - atol=RELATEDNESS_TOLERANCE, + if not coefficients or not any( + np.allclose( + coefficients, + relation.coefficients, + atol=RELATEDNESS_TOLERANCE, + ) + for relation in ( + [expected_relation, additional_allowed_relation] + if additional_allowed_relation + else [expected_relation] + ) ): return ( False, - f'Sample {sample_id} has expected relation "{relation.value}" to {other_id} but has coefficients {coefficients or []}', + f'Sample {sample_id} has expected relation "{expected_relation.value}" to {other_id} but has coefficients {coefficients or []}', ) return True, None -def all_relatedness_checks( # noqa: C901 +def all_relatedness_checks( relatedness_check_lookup: dict[tuple[str, str], list], + family: Family, sample: Sample, ) -> list[str]: failure_reasons = [] - for parent_id in [sample.mother, sample.father]: - success, reason = passes_relatedness_check( - relatedness_check_lookup, - sample.sample_id, - parent_id, - Relation.PARENT, - ) - if not success: - failure_reasons.append(reason) - - for grandparent_id in [ - sample.maternal_grandmother, - sample.maternal_grandfather, - sample.paternal_grandmother, - sample.paternal_grandfather, + for relationship_set, relation, additional_allowed_relation in [ + ([sample.mother, sample.father], Relation.PARENT_CHILD, None), + ( + [ + sample.maternal_grandmother, + sample.maternal_grandfather, + sample.paternal_grandmother, + sample.paternal_grandfather, + ], + Relation.GRANDPARENT_GRANDCHILD, + None, + ), + (sample.siblings, Relation.SIBLING, None), + (sample.half_siblings, Relation.HALF_SIBLING, Relation.SIBLING), + (sample.aunt_nephews, Relation.AUNT_NEPHEW, None), ]: - success, reason = passes_relatedness_check( - relatedness_check_lookup, - sample.sample_id, - grandparent_id, - Relation.GRANDPARENT, - ) - if not success: - failure_reasons.append(reason) - - for sibling_id in sample.siblings: - success, reason = passes_relatedness_check( - relatedness_check_lookup, - sample.sample_id, - sibling_id, - Relation.SIBLING, - ) - if not success: - failure_reasons.append(reason) - - for half_sibling_id in sample.half_siblings: - # NB: A "half sibling" parsed from the pedigree may actually be a sibling, so we allow those - # through as well. - success1, _ = passes_relatedness_check( - relatedness_check_lookup, - sample.sample_id, - half_sibling_id, - Relation.SIBLING, - ) - success2, reason = passes_relatedness_check( - relatedness_check_lookup, - sample.sample_id, - half_sibling_id, - Relation.HALF_SIBLING, - ) - if not success1 and not success2: - failure_reasons.append(reason) - - for aunt_nephew_id in sample.aunt_nephews: - success, reason = passes_relatedness_check( - relatedness_check_lookup, - sample.sample_id, - aunt_nephew_id, - Relation.AUNT_NEPHEW, - ) - if not success: - failure_reasons.append(reason) + for other_id in relationship_set: + # Handle case where relation is identified in the + # pedigree as a "dummy" but is not included in + # the list of samples to load. 
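
# Aside: a minimal, standalone sketch of the coefficient comparison used by
# passes_relatedness_check above. Observed IBD coefficients
# [Z0, Z1, Z2, PI_HAT] must be close to the expected vector of at least one
# allowed relation. The vectors below mirror Relation.coefficients in this
# patch; the tolerance value is assumed for illustration only.
import numpy as np

EXPECTED_COEFFICIENTS = {
    'parent_child': [0, 1, 0, 0.5],
    'sibling': [0.25, 0.5, 0.25, 0.5],
    'half_sibling': [0.5, 0.5, 0, 0.25],
}

def passes(observed: list[float], allowed_relations: list[str]) -> bool:
    # True when the observed coefficients match any allowed relation.
    return any(
        np.allclose(observed, EXPECTED_COEFFICIENTS[relation], atol=0.1)
        for relation in allowed_relations
    )

# A pedigree "half sibling" may truly be a full sibling, so both are allowed:
assert passes([0.24, 0.52, 0.24, 0.5], ['half_sibling', 'sibling'])
# --- end sketch ---
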
+ if other_id not in family.samples: + continue + success, reason = passes_relatedness_check( + relatedness_check_lookup, + sample.sample_id, + other_id, + relation, + additional_allowed_relation, + ) + if not success: + failure_reasons.append(reason) return failure_reasons @@ -162,6 +140,7 @@ def get_families_failed_relatedness_check( for sample in family.samples.values(): failure_reasons = all_relatedness_checks( relatedness_check_lookup, + family, sample, ) if failure_reasons: diff --git a/v03_pipeline/lib/misc/family_loading_failures_test.py b/v03_pipeline/lib/misc/family_loading_failures_test.py index bcd783f5f..c9b1858b2 100644 --- a/v03_pipeline/lib/misc/family_loading_failures_test.py +++ b/v03_pipeline/lib/misc/family_loading_failures_test.py @@ -9,7 +9,7 @@ get_families_failed_sex_check, ) from v03_pipeline.lib.misc.io import import_pedigree -from v03_pipeline.lib.misc.pedigree import Sample, parse_pedigree_ht_to_families +from v03_pipeline.lib.misc.pedigree import Family, Sample, parse_pedigree_ht_to_families from v03_pipeline.lib.model import Sex TEST_PEDIGREE_6 = 'v03_pipeline/var/test/pedigrees/test_pedigree_6.tsv' @@ -104,7 +104,21 @@ def test_all_relatedness_checks(self): paternal_grandfather='sample_3', half_siblings=['sample_4'], ) - failure_reasons = all_relatedness_checks(relatedness_check_lookup, sample) + family = Family( + family_guid='family_1a', + samples={ + 'sample_1': sample, + 'sample_2': Sample(sex=Sex.MALE, sample_id='sample_2'), + 'sample_3': Sample(sex=Sex.MALE, sample_id='sample_3'), + 'sample_4': Sample(sex=Sex.MALE, sample_id='sample_4'), + 'sample_5': Sample(sex=Sex.MALE, sample_id='sample_5'), + }, + ) + failure_reasons = all_relatedness_checks( + relatedness_check_lookup, + family, + sample, + ) self.assertListEqual(failure_reasons, []) # Defined grandparent missing in relatedness table @@ -117,12 +131,13 @@ def test_all_relatedness_checks(self): ) failure_reasons = all_relatedness_checks( relatedness_check_lookup, + family, sample, ) self.assertListEqual( failure_reasons, [ - 'Sample sample_1 has expected relation "grandparent" to sample_5 but has coefficients []', + 'Sample sample_1 has expected relation "grandparent_grandchild" to sample_5 but has coefficients []', ], ) @@ -140,6 +155,7 @@ def test_all_relatedness_checks(self): ) failure_reasons = all_relatedness_checks( relatedness_check_lookup, + family, sample, ) self.assertListEqual( @@ -167,16 +183,42 @@ def test_all_relatedness_checks(self): ) failure_reasons = all_relatedness_checks( relatedness_check_lookup, + family, sample, ) self.assertListEqual( failure_reasons, [ - 'Sample sample_1 has expected relation "parent" to sample_2 but has coefficients [0.5, 0.5, 0.5, 0.5]', + 'Sample sample_1 has expected relation "parent_child" to sample_2 but has coefficients [0.5, 0.5, 0.5, 0.5]', 'Sample sample_1 has expected relation "sibling" to sample_4 but has coefficients [0.5, 0.5, 0, 0.25]', ], ) + # Some samples will include relationships with + # samples that are not expected to be included + # in the callset. These should not trigger relatedness + # failures. 
+ sample = Sample( + sex=Sex.FEMALE, + sample_id='sample_1', + mother='sample_2', + ) + family = Family( + family_guid='family_1a', + samples={ + 'sample_1': sample, + }, + ) + failure_reasons = all_relatedness_checks( + {}, + family, + sample, + ) + self.assertListEqual( + failure_reasons, + [], + ) + def test_get_families_failed_sex_check(self): sex_check_ht = hl.Table.parallelize( [ diff --git a/v03_pipeline/lib/misc/pedigree.py b/v03_pipeline/lib/misc/pedigree.py index ee2d86521..2e17f27f0 100644 --- a/v03_pipeline/lib/misc/pedigree.py +++ b/v03_pipeline/lib/misc/pedigree.py @@ -8,8 +8,8 @@ class Relation(Enum): - PARENT = 'parent' - GRANDPARENT = 'grandparent' + PARENT_CHILD = 'parent_child' + GRANDPARENT_GRANDCHILD = 'grandparent_grandchild' SIBLING = 'sibling' HALF_SIBLING = 'half_sibling' AUNT_NEPHEW = 'aunt_nephew' @@ -17,8 +17,8 @@ class Relation(Enum): @property def coefficients(self): return { - Relation.PARENT: [0, 1, 0, 0.5], - Relation.GRANDPARENT: [0.5, 0.5, 0, 0.25], + Relation.PARENT_CHILD: [0, 1, 0, 0.5], + Relation.GRANDPARENT_GRANDCHILD: [0.5, 0.5, 0, 0.25], Relation.SIBLING: [0.25, 0.5, 0.25, 0.5], Relation.HALF_SIBLING: [0.5, 0.5, 0, 0.25], Relation.AUNT_NEPHEW: [0.5, 0.5, 0, 0.25], From ddf867ac1edcb4a2aaff65532ae5d8a00a91288e Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 2 Jan 2025 10:48:11 -0500 Subject: [PATCH 06/10] =?UTF-8?q?feat:=20add=20remap=20and=20family=20load?= =?UTF-8?q?ing=20failures=20as=20validation=20exceptions=20=E2=80=A6=20(#1?= =?UTF-8?q?005)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: add remap and family loading failures as validation exceptions rather than runtime errors * move on * Update write_remapped_and_subsetted_callset_test.py * ruff --- v03_pipeline/lib/misc/sample_ids.py | 22 +-- v03_pipeline/lib/misc/sample_ids_test.py | 10 +- .../lib/tasks/validate_callset_test.py | 1 + .../lib/tasks/write_imported_callset.py | 108 ++++++------ .../write_remapped_and_subsetted_callset.py | 61 ++++--- ...ite_remapped_and_subsetted_callset_test.py | 161 ++++++++++++++++++ .../tasks/write_validation_errors_for_run.py | 34 ++++ .../var/test/pedigrees/test_pedigree_7.tsv | 14 ++ 8 files changed, 299 insertions(+), 112 deletions(-) create mode 100644 v03_pipeline/var/test/pedigrees/test_pedigree_7.tsv diff --git a/v03_pipeline/lib/misc/sample_ids.py b/v03_pipeline/lib/misc/sample_ids.py index 77f173ffc..7a1c2bc0b 100644 --- a/v03_pipeline/lib/misc/sample_ids.py +++ b/v03_pipeline/lib/misc/sample_ids.py @@ -3,28 +3,16 @@ import hail as hl from v03_pipeline.lib.logger import get_logger +from v03_pipeline.lib.misc.validation import SeqrValidationError logger = get_logger(__name__) -class MatrixTableSampleSetError(Exception): - def __init__(self, message, missing_samples): - super().__init__(message) - self.missing_samples = missing_samples - - -def vcf_remap(mt: hl.MatrixTable) -> hl.MatrixTable: - # TODO: add logic from Mike to remap vcf samples delivered from Broad WGS - return mt - - def remap_sample_ids( mt: hl.MatrixTable, project_remap_ht: hl.Table, ignore_missing_samples_when_remapping: bool, ) -> hl.MatrixTable: - mt = vcf_remap(mt) - collected_remap = project_remap_ht.collect() s_dups = [k for k, v in Counter([r.s for r in collected_remap]).items() if v > 1] seqr_dups = [ @@ -33,7 +21,7 @@ def remap_sample_ids( if len(s_dups) > 0 or len(seqr_dups) > 0: msg = f'Duplicate s or seqr_id entries in remap file were found. Duplicate s:{s_dups}. Duplicate seqr_id:{seqr_dups}.' 
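
# Aside: the duplicate detection in remap_sample_ids above reduces to a
# single Counter pass over the collected remap rows. In isolation (sample
# values hypothetical):
from collections import Counter

remap_s = ['s1', 's2', 's2', 's3']
s_dups = [k for k, v in Counter(remap_s).items() if v > 1]
assert s_dups == ['s2']
# --- end sketch ---
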
- raise ValueError(msg) + raise SeqrValidationError(msg) missing_samples = project_remap_ht.anti_join(mt.cols()).collect() remap_count = len(collected_remap) @@ -48,7 +36,7 @@ def remap_sample_ids( if ignore_missing_samples_when_remapping: logger.info(message) else: - raise MatrixTableSampleSetError(message, missing_samples) + raise SeqrValidationError(message) mt = mt.annotate_cols(**project_remap_ht[mt.s]) remap_expr = hl.if_else(hl.is_missing(mt.seqr_id), mt.s, mt.seqr_id) @@ -67,7 +55,7 @@ def subset_samples( anti_join_ht_count = anti_join_ht.count() if subset_count == 0: message = '0 sample ids found the subset HT, something is probably wrong.' - raise MatrixTableSampleSetError(message, []) + raise SeqrValidationError(message) if anti_join_ht_count != 0: missing_samples = anti_join_ht.s.collect() @@ -77,7 +65,7 @@ def subset_samples( f"IDs that aren't in the callset: {missing_samples}\n" f'All callset sample IDs:{mt.s.collect()}' ) - raise MatrixTableSampleSetError(message, missing_samples) + raise SeqrValidationError(message) logger.info(f'Subsetted to {subset_count} sample ids') mt = mt.semi_join_cols(sample_subset_ht) return mt.filter_rows(hl.agg.any(hl.is_defined(mt.GT))) diff --git a/v03_pipeline/lib/misc/sample_ids_test.py b/v03_pipeline/lib/misc/sample_ids_test.py index db264a1f9..770f5451e 100644 --- a/v03_pipeline/lib/misc/sample_ids_test.py +++ b/v03_pipeline/lib/misc/sample_ids_test.py @@ -3,10 +3,10 @@ import hail as hl from v03_pipeline.lib.misc.sample_ids import ( - MatrixTableSampleSetError, remap_sample_ids, subset_samples, ) +from v03_pipeline.lib.misc.validation import SeqrValidationError CALLSET_MT = hl.MatrixTable.from_parts( rows={'variants': [1, 2]}, @@ -76,7 +76,7 @@ def test_remap_sample_ids_remap_has_duplicate(self) -> None: key='s', ) - with self.assertRaises(ValueError): + with self.assertRaises(SeqrValidationError): remap_sample_ids( CALLSET_MT, project_remap_ht, @@ -99,7 +99,7 @@ def test_remap_sample_ids_remap_has_missing_samples(self) -> None: key='s', ) - with self.assertRaises(MatrixTableSampleSetError): + with self.assertRaises(SeqrValidationError): remap_sample_ids( CALLSET_MT, project_remap_ht, @@ -114,7 +114,7 @@ def test_subset_samples_zero_samples(self): key='s', ) - with self.assertRaises(MatrixTableSampleSetError): + with self.assertRaises(SeqrValidationError): subset_samples( CALLSET_MT, sample_subset_ht, @@ -132,7 +132,7 @@ def test_subset_samples_missing_samples(self): key='s', ) - with self.assertRaises(MatrixTableSampleSetError): + with self.assertRaises(SeqrValidationError): subset_samples( CALLSET_MT, sample_subset_ht, diff --git a/v03_pipeline/lib/tasks/validate_callset_test.py b/v03_pipeline/lib/tasks/validate_callset_test.py index 8f3638376..a6d2b0377 100644 --- a/v03_pipeline/lib/tasks/validate_callset_test.py +++ b/v03_pipeline/lib/tasks/validate_callset_test.py @@ -83,5 +83,6 @@ def test_validate_callset_multiple_exceptions( 'Missing the following expected contigs:chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr20, chr21, chr22, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX', 'Sample type validation error: dataset sample-type is specified as WES but appears to be WGS because it contains many common non-coding variants', ], + 'failed_family_samples': {}, }, ) diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index cc827df52..c544b4666 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ 
b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -10,7 +10,6 @@ split_multi_hts, ) from v03_pipeline.lib.misc.validation import ( - SeqrValidationError, validate_imported_field_types, ) from v03_pipeline.lib.misc.vets import annotate_vets @@ -24,7 +23,7 @@ from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget from v03_pipeline.lib.tasks.write_tdr_metrics_files import WriteTDRMetricsFilesTask from v03_pipeline.lib.tasks.write_validation_errors_for_run import ( - WriteValidationErrorsForRunTask, + with_persisted_validation_errors, ) @@ -77,65 +76,56 @@ def requires(self) -> list[luigi.Task]: CallsetTask(self.callset_path), ] + @with_persisted_validation_errors def create_table(self) -> hl.MatrixTable: - try: - # NB: throws SeqrValidationError - mt = import_callset( - self.callset_path, - self.reference_genome, - self.dataset_type, - ) - filters_path = None - if ( - FeatureFlag.EXPECT_WES_FILTERS - and not self.skip_expect_filters - and self.dataset_type.expect_filters( - self.sample_type, - ) - ): - filters_path = valid_filters_path( - self.dataset_type, - self.sample_type, - self.callset_path, - ) - filters_ht = import_vcf(filters_path, self.reference_genome).rows() - mt = mt.annotate_rows(filters=filters_ht[mt.row_key].filters) - additional_row_fields = get_additional_row_fields( - mt, - self.dataset_type, - self.skip_check_sex_and_relatedness, + # NB: throws SeqrValidationError + mt = import_callset( + self.callset_path, + self.reference_genome, + self.dataset_type, + ) + filters_path = None + if ( + FeatureFlag.EXPECT_WES_FILTERS + and not self.skip_expect_filters + and self.dataset_type.expect_filters( + self.sample_type, ) - # NB: throws SeqrValidationError - mt = select_relevant_fields( - mt, + ): + filters_path = valid_filters_path( self.dataset_type, - additional_row_fields, + self.sample_type, + self.callset_path, ) - # This validation isn't override-able by the skip option. - # If a field is the wrong type, the pipeline will likely hard-fail downstream. + filters_ht = import_vcf(filters_path, self.reference_genome).rows() + mt = mt.annotate_rows(filters=filters_ht[mt.row_key].filters) + additional_row_fields = get_additional_row_fields( + mt, + self.dataset_type, + self.skip_check_sex_and_relatedness, + ) + # NB: throws SeqrValidationError + mt = select_relevant_fields( + mt, + self.dataset_type, + additional_row_fields, + ) + # This validation isn't override-able by the skip option. + # If a field is the wrong type, the pipeline will likely hard-fail downstream. + # NB: throws SeqrValidationError + validate_imported_field_types( + mt, + self.dataset_type, + additional_row_fields, + ) + if self.dataset_type.has_multi_allelic_variants: # NB: throws SeqrValidationError - validate_imported_field_types( - mt, - self.dataset_type, - additional_row_fields, - ) - if self.dataset_type.has_multi_allelic_variants: - # NB: throws SeqrValidationError - mt = split_multi_hts(mt, self.skip_validation) - # Special handling of variant-level filter annotation for VETs filters. - # The annotations are present on the sample-level FT field but are - # expected upstream on "filters". 
- mt = annotate_vets(mt) - return mt.select_globals( - callset_path=self.callset_path, - filters_path=filters_path or hl.missing(hl.tstr), - ) - except SeqrValidationError as e: - write_validation_errors_for_run_task = self.clone( - WriteValidationErrorsForRunTask, - error_messages=[str(e)], - ) - write_validation_errors_for_run_task.run() - raise SeqrValidationError( - write_validation_errors_for_run_task.to_single_error_message(), - ) from e + mt = split_multi_hts(mt, self.skip_validation) + # Special handling of variant-level filter annotation for VETs filters. + # The annotations are present on the sample-level FT field but are + # expected upstream on "filters". + mt = annotate_vets(mt) + return mt.select_globals( + callset_path=self.callset_path, + filters_path=filters_path or hl.missing(hl.tstr), + ) diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py index 41b4c148e..f099dcfcc 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py @@ -2,7 +2,6 @@ import luigi import luigi.util -from v03_pipeline.lib.logger import get_logger from v03_pipeline.lib.misc.family_loading_failures import ( get_families_failed_missing_samples, get_families_failed_relatedness_check, @@ -16,6 +15,7 @@ ) from v03_pipeline.lib.misc.pedigree import parse_pedigree_ht_to_families from v03_pipeline.lib.misc.sample_ids import remap_sample_ids, subset_samples +from v03_pipeline.lib.misc.validation import SeqrValidationError from v03_pipeline.lib.model.feature_flag import FeatureFlag from v03_pipeline.lib.paths import ( relatedness_check_table_path, @@ -29,8 +29,19 @@ WriteRelatednessCheckTsvTask, ) from v03_pipeline.lib.tasks.write_sex_check_table import WriteSexCheckTableTask +from v03_pipeline.lib.tasks.write_validation_errors_for_run import ( + with_persisted_validation_errors, +) + -logger = get_logger(__name__) +def format_failures(failed_families): + return { + f.family_guid: { + 'samples': sorted(f.samples.keys()), + 'reasons': reasons, + } + for f, reasons in failed_families.items() + } @luigi.util.inherits(BaseLoadingRunParams) @@ -73,6 +84,7 @@ def requires(self) -> list[luigi.Task]: ] return requirements + @with_persisted_validation_errors def create_table(self) -> hl.MatrixTable: callset_mt = hl.read_matrix_table(self.input()[0].path) pedigree_ht = import_pedigree(self.input()[1].path) @@ -130,16 +142,21 @@ def create_table(self) -> hl.MatrixTable: - families_failed_sex_check.keys() ) if not len(loadable_families): - msg = ( - f'families_failed_missing_samples: {families_failed_missing_samples}\n' - f'families_failed_relatedness_check: {families_failed_relatedness_check}\n' - f'families_failed_sex_check: {families_failed_sex_check}' - ) - logger.info( + msg = 'All families failed validation checks' + raise SeqrValidationError( msg, + { + 'failed_family_samples': { + 'missing_samples': format_failures( + families_failed_missing_samples, + ), + 'relatedness_check': format_failures( + families_failed_relatedness_check, + ), + 'sex_check': format_failures(families_failed_sex_check), + }, + }, ) - msg = 'All families failed checks' - raise RuntimeError(msg) mt = subset_samples( callset_mt, @@ -172,33 +189,15 @@ def create_table(self) -> hl.MatrixTable: ), failed_family_samples=hl.Struct( missing_samples=( - { - f.family_guid: { - 'samples': sorted(f.samples.keys()), - 'reasons': reasons, - } - for f, reasons in 
families_failed_missing_samples.items() - } + format_failures(families_failed_missing_samples) or hl.empty_dict(hl.tstr, hl.tdict(hl.tstr, hl.tarray(hl.tstr))) ), relatedness_check=( - { - f.family_guid: { - 'samples': sorted(f.samples.keys()), - 'reasons': reasons, - } - for f, reasons in families_failed_relatedness_check.items() - } + format_failures(families_failed_relatedness_check) or hl.empty_dict(hl.tstr, hl.tdict(hl.tstr, hl.tarray(hl.tstr))) ), sex_check=( - { - f.family_guid: { - 'samples': sorted(f.samples.keys()), - 'reasons': reasons, - } - for f, reasons in families_failed_sex_check.items() - } + format_failures(families_failed_sex_check) or hl.empty_dict(hl.tstr, hl.tdict(hl.tstr, hl.tarray(hl.tstr))) ), ), diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py index 5ac04c278..a5ed24799 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py @@ -1,3 +1,4 @@ +import json import shutil from unittest.mock import Mock, patch @@ -10,12 +11,16 @@ from v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset import ( WriteRemappedAndSubsettedCallsetTask, ) +from v03_pipeline.lib.tasks.write_validation_errors_for_run import ( + WriteValidationErrorsForRunTask, +) from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' TEST_REMAP = 'v03_pipeline/var/test/remaps/test_remap_1.tsv' TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv' TEST_PEDIGREE_4 = 'v03_pipeline/var/test/pedigrees/test_pedigree_4.tsv' +TEST_PEDIGREE_7 = 'v03_pipeline/var/test/pedigrees/test_pedigree_7.tsv' TEST_SEX_CHECK_1 = 'v03_pipeline/var/test/sex_check/test_sex_check_1.ht' TEST_RELATEDNESS_CHECK_1 = ( 'v03_pipeline/var/test/relatedness_check/test_relatedness_check_1.ht' @@ -179,3 +184,159 @@ def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family( ), ], ) + + @patch('v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset.FeatureFlag') + def test_write_remapped_and_subsetted_callset_task_all_families_failed( + self, + mock_ff: Mock, + ) -> None: + mock_ff.CHECK_SEX_AND_RELATEDNESS = True + worker = luigi.worker.Worker() + wrsc_task = WriteRemappedAndSubsettedCallsetTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + run_id=TEST_RUN_ID, + sample_type=SampleType.WGS, + callset_path=TEST_VCF, + project_guids=['R0114_project4'], + project_remap_paths=[TEST_REMAP], + project_pedigree_paths=[TEST_PEDIGREE_7], + project_i=0, + skip_validation=True, + ) + worker.add(wrsc_task) + worker.run() + self.assertFalse(wrsc_task.complete()) + write_validation_errors_task = WriteValidationErrorsForRunTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WES, + callset_path=TEST_VCF, + project_guids=['R0114_project4'], + skip_validation=True, + run_id=TEST_RUN_ID, + ) + self.assertTrue(write_validation_errors_task.complete()) + with write_validation_errors_task.output().open('r') as f: + self.assertDictEqual( + json.load(f), + { + 'project_guids': [ + 'R0114_project4', + ], + 'error_messages': [ + 'All families failed validation checks', + ], + 'failed_family_samples': { + 'missing_samples': { + 'efg_1': { + 'samples': [ + 'NA99999_1', + ], + 'reasons': [ + "Missing samples: {'NA99999_1'}", + ], + }, + }, + 
'relatedness_check': {}, + 'sex_check': { + '789_1': { + 'samples': [ + 'NA20875_1', + ], + 'reasons': [ + 'Sample NA20875_1 has pedigree sex M but imputed sex F', + ], + }, + '456_1': { + 'samples': [ + 'NA20870_1', + ], + 'reasons': [ + 'Sample NA20870_1 has pedigree sex M but imputed sex F', + ], + }, + '123_1': { + 'samples': [ + 'NA19675_1', + ], + 'reasons': [ + 'Sample NA19675_1 has pedigree sex M but imputed sex F', + ], + }, + 'cde_1': { + 'samples': [ + 'NA20881_1', + ], + 'reasons': [ + 'Sample NA20881_1 has pedigree sex F but imputed sex M', + ], + }, + '901_1': { + 'samples': [ + 'NA20877_1', + ], + 'reasons': [ + 'Sample NA20877_1 has pedigree sex M but imputed sex F', + ], + }, + '678_1': { + 'samples': [ + 'NA20874_1', + ], + 'reasons': [ + 'Sample NA20874_1 has pedigree sex M but imputed sex F', + ], + }, + '345_1': { + 'samples': [ + 'NA19679_1', + ], + 'reasons': [ + 'Sample NA19679_1 has pedigree sex M but imputed sex F', + ], + }, + '890_1': { + 'samples': [ + 'NA20876_1', + ], + 'reasons': [ + 'Sample NA20876_1 has pedigree sex M but imputed sex F', + ], + }, + 'def_1': { + 'samples': [ + 'NA20885_1', + ], + 'reasons': [ + 'Sample NA20885_1 has pedigree sex M but imputed sex F', + ], + }, + '234_1': { + 'samples': [ + 'NA19678_1', + ], + 'reasons': [ + 'Sample NA19678_1 has pedigree sex F but imputed sex M', + ], + }, + 'bcd_1': { + 'samples': [ + 'NA20878_1', + ], + 'reasons': [ + 'Sample NA20878_1 has pedigree sex F but imputed sex M', + ], + }, + '567_1': { + 'samples': [ + 'NA20872_1', + ], + 'reasons': [ + 'Sample NA20872_1 has pedigree sex F but imputed sex M', + ], + }, + }, + }, + }, + ) diff --git a/v03_pipeline/lib/tasks/write_validation_errors_for_run.py b/v03_pipeline/lib/tasks/write_validation_errors_for_run.py index 9149f6158..d99800f6f 100644 --- a/v03_pipeline/lib/tasks/write_validation_errors_for_run.py +++ b/v03_pipeline/lib/tasks/write_validation_errors_for_run.py @@ -1,8 +1,11 @@ import json +from collections.abc import Callable import luigi +import luigi.freezing import luigi.util +from v03_pipeline.lib.misc.validation import SeqrValidationError from v03_pipeline.lib.paths import validation_errors_for_run_path from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.files import GCSorLocalTarget @@ -12,6 +15,7 @@ class WriteValidationErrorsForRunTask(luigi.Task): project_guids = luigi.ListParameter() error_messages = luigi.ListParameter(default=[]) + failed_family_samples = luigi.DictParameter(default={}) def to_single_error_message(self) -> str: with self.output().open('r') as f: @@ -33,6 +37,36 @@ def run(self) -> None: validation_errors_json = { 'project_guids': self.project_guids, 'error_messages': self.error_messages, + 'failed_family_samples': luigi.freezing.recursively_unfreeze( + self.failed_family_samples, + ), } with self.output().open('w') as f: json.dump(validation_errors_json, f) + + +def with_persisted_validation_errors(f: Callable) -> Callable[[Callable], Callable]: + def wrapper(self: luigi.Task): + try: + return f(self) + except SeqrValidationError as e: + if isinstance( + e.args[1], + object, + ): # TODO: improve type checking with a pydantic model/typed dict + write_validation_errors_for_run_task = self.clone( + WriteValidationErrorsForRunTask, + error_messages=[str(e.args[0])], + failed_family_samples=e.args[1]['failed_family_samples'], + ) + else: + write_validation_errors_for_run_task = self.clone( + WriteValidationErrorsForRunTask, + error_messages=[str(e)], + ) + 
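
# Aside: with_persisted_validation_errors follows the common
# catch/persist/re-raise decorator shape: domain errors are written
# somewhere durable, then the failure is re-raised for the caller. A
# stripped-down standalone version of the pattern (the names and JSON
# layout here are hypothetical, not the pipeline's API):
import functools
import json

class ValidationError(Exception):
    pass

def with_persisted_errors(path: str):
    def decorator(f):
        @functools.wraps(f)
        def wrapper(*args, **kwargs):
            try:
                return f(*args, **kwargs)
            except ValidationError as e:
                # Persist the failure somewhere durable...
                with open(path, 'w') as out:
                    json.dump({'error_messages': [str(e)]}, out)
                # ...then re-raise so the caller still sees it.
                raise
        return wrapper
    return decorator
# --- end sketch ---
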
write_validation_errors_for_run_task.run() + raise SeqrValidationError( + write_validation_errors_for_run_task.to_single_error_message(), + ) from None + + return wrapper diff --git a/v03_pipeline/var/test/pedigrees/test_pedigree_7.tsv b/v03_pipeline/var/test/pedigrees/test_pedigree_7.tsv new file mode 100644 index 000000000..fee3a7458 --- /dev/null +++ b/v03_pipeline/var/test/pedigrees/test_pedigree_7.tsv @@ -0,0 +1,14 @@ +Project_GUID Family_GUID Family_ID Individual_ID Paternal_ID Maternal_ID Sex +R0114_project4 123_1 123 NA19675_1 M +R0114_project4 234_1 234 NA19678_1 F +R0114_project4 345_1 345 NA19679_1 M +R0114_project4 456_1 456 NA20870_1 M +R0114_project4 567_1 567 NA20872_1 F +R0114_project4 678_1 678 NA20874_1 M +R0114_project4 789_1 789 NA20875_1 M +R0114_project4 890_1 890 NA20876_1 M +R0114_project4 901_1 901 NA20877_1 M +R0114_project4 bcd_1 bcd NA20878_1 F +R0114_project4 cde_1 cde NA20881_1 F +R0114_project4 def_1 def NA20885_1 M +R0114_project4 efg_1 efg NA99999_1 F From 25db277c99c4d9631d44c7b5229f613a1d24d26c Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Mon, 6 Jan 2025 13:57:30 -0500 Subject: [PATCH 07/10] feat: Add ability to run tasks dataproc. (#948) * Support gcs dirs in rsync * ws * Add create dataproc cluster task * add dataproc * ruff * requirements * still struggling * Gencode refactor to remove gcs * bump reqs * Run dataproc job * lib * running * merge requirements * Flip'em * Better exception handling * Cleaner approach if less generalizable * write a test * Fix tests * lint * Add test for success * refactor to use a base class... better for adding support for multiple jobs * cleanup * ruff * Fix missing mock * Fix flapping test * pr comments --- requirements.txt | 2 +- .../dataproc/base_run_job_on_dataproc.py | 106 ++++++++++++ .../tasks/dataproc/create_dataproc_cluster.py | 36 ++-- .../dataproc/create_dataproc_cluster_test.py | 12 +- v03_pipeline/lib/tasks/dataproc/misc.py | 21 +++ v03_pipeline/lib/tasks/dataproc/misc_test.py | 58 +++++++ .../write_success_file_on_dataproc.py | 22 +++ .../write_success_file_on_dataproc_test.py | 156 ++++++++++++++++++ 8 files changed, 392 insertions(+), 21 deletions(-) create mode 100644 v03_pipeline/lib/tasks/dataproc/base_run_job_on_dataproc.py create mode 100644 v03_pipeline/lib/tasks/dataproc/misc.py create mode 100644 v03_pipeline/lib/tasks/dataproc/misc_test.py create mode 100644 v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc.py create mode 100644 v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc_test.py diff --git a/requirements.txt b/requirements.txt index 86e9f058b..96f6fb725 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --resolver=backtracking requirements.in +# pip-compile requirements.in # aiodns==2.0.0 # via hail diff --git a/v03_pipeline/lib/tasks/dataproc/base_run_job_on_dataproc.py b/v03_pipeline/lib/tasks/dataproc/base_run_job_on_dataproc.py new file mode 100644 index 000000000..1094d651e --- /dev/null +++ b/v03_pipeline/lib/tasks/dataproc/base_run_job_on_dataproc.py @@ -0,0 +1,106 @@ +import time + +import google.api_core.exceptions +import luigi +from google.cloud import dataproc_v1 as dataproc + +from v03_pipeline.lib.logger import get_logger +from v03_pipeline.lib.model import Env +from v03_pipeline.lib.tasks.base.base_loading_pipeline_params import ( + BaseLoadingPipelineParams, +) +from 
v03_pipeline.lib.tasks.dataproc.create_dataproc_cluster import ( + CreateDataprocClusterTask, +) +from v03_pipeline.lib.tasks.dataproc.misc import get_cluster_name, to_kebab_str_args + +DONE_STATE = 'DONE' +ERROR_STATE = 'ERROR' +SEQR_PIPELINE_RUNNER_BUILD = f'gs://seqr-pipeline-runner-builds/{Env.DEPLOYMENT_TYPE}/{Env.PIPELINE_RUNNER_APP_VERSION}' +TIMEOUT_S = 172800 # 2 days + +logger = get_logger(__name__) + + +@luigi.util.inherits(BaseLoadingPipelineParams) +class BaseRunJobOnDataprocTask(luigi.Task): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.client = dataproc.JobControllerClient( + client_options={ + 'api_endpoint': f'{Env.GCLOUD_REGION}-dataproc.googleapis.com:443', + }, + ) + + @property + def task_name(self): + return self.get_task_family().split('.')[-1] + + @property + def job_id(self): + return f'{self.task_name}-{self.run_id}' + + def requires(self) -> [luigi.Task]: + return [self.clone(CreateDataprocClusterTask)] + + def complete(self) -> bool: + if not self.dataset_type.requires_dataproc: + msg = f'{self.dataset_type} should not require a dataproc job' + raise RuntimeError(msg) + try: + job = self.client.get_job( + request={ + 'project_id': Env.GCLOUD_PROJECT, + 'region': Env.GCLOUD_REGION, + 'job_id': self.job_id, + }, + ) + except google.api_core.exceptions.NotFound: + return False + if job.status.state == ERROR_STATE: + msg = f'Job {self.task_name}-{self.run_id} entered ERROR state' + logger.error(msg) + logger.error(job.status.details) + return job.status.state == DONE_STATE + + def run(self): + operation = self.client.submit_job_as_operation( + request={ + 'project_id': Env.GCLOUD_PROJECT, + 'region': Env.GCLOUD_REGION, + 'job': { + 'reference': { + 'job_id': self.job_id, + }, + 'placement': { + 'cluster_name': get_cluster_name( + self.reference_genome, + self.run_id, + ), + }, + 'pyspark_job': { + 'main_python_file_uri': f'{SEQR_PIPELINE_RUNNER_BUILD}/bin/run_task.py', + 'args': [ + self.task_name, + '--local-scheduler', + *to_kebab_str_args(self), + ], + 'python_file_uris': [ + f'{SEQR_PIPELINE_RUNNER_BUILD}/pyscripts.zip', + ], + }, + }, + }, + ) + wait_s = 0 + while wait_s < TIMEOUT_S: + if operation.done(): + operation.result() # Will throw on failure! 
+ msg = f'Finished {self.job_id}' + logger.info(msg) + break + logger.info( + f'Waiting for job completion {self.job_id}', + ) + time.sleep(3) + wait_s += 3 diff --git a/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster.py b/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster.py index 27693c5d3..6fce572a0 100644 --- a/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster.py +++ b/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster.py @@ -1,5 +1,6 @@ import time +import google.api_core.exceptions import hail as hl import luigi from google.cloud import dataproc_v1 as dataproc @@ -11,13 +12,15 @@ from v03_pipeline.lib.tasks.base.base_loading_pipeline_params import ( BaseLoadingPipelineParams, ) +from v03_pipeline.lib.tasks.dataproc.misc import get_cluster_name -CLUSTER_NAME_PREFIX = 'pipeline-runner' DEBIAN_IMAGE = '2.1.33-debian11' +ERROR_STATE = 'ERROR' HAIL_VERSION = hl.version().split('-')[0] INSTANCE_TYPE = 'n1-highmem-8' PKGS = '|'.join(pip_freeze.freeze()) -SUCCESS_STATE = 'RUNNING' +RUNNING_STATE = 'RUNNING' +TIMEOUT_S = 900 logger = get_logger(__name__) @@ -26,7 +29,7 @@ def get_cluster_config(reference_genome: ReferenceGenome, run_id: str): service_account_credentials = get_service_account_credentials() return { 'project_id': Env.GCLOUD_PROJECT, - 'cluster_name': f'{CLUSTER_NAME_PREFIX}-{reference_genome.value.lower()}-{run_id}', + 'cluster_name': get_cluster_name(reference_genome, run_id), # Schema found at https://cloud.google.com/dataproc/docs/reference/rest/v1/ClusterConfig 'config': { 'gce_cluster_config': { @@ -136,27 +139,32 @@ def __init__(self, *args, **kwargs): # https://cloud.google.com/dataproc/docs/tutorials/python-library-example self.client = dataproc.ClusterControllerClient( client_options={ - 'api_endpoint': f'{Env.GCLOUD_REGION}-dataproc.googleapis.com:443'.format( - Env.GCLOUD_REGION, - ), + 'api_endpoint': f'{Env.GCLOUD_REGION}-dataproc.googleapis.com:443', }, ) def complete(self) -> bool: if not self.dataset_type.requires_dataproc: - return True + msg = f'{self.dataset_type} should not require a dataproc cluster' + raise RuntimeError(msg) try: - client = self.client.get_cluster( + cluster = self.client.get_cluster( request={ 'project_id': Env.GCLOUD_PROJECT, 'region': Env.GCLOUD_REGION, - 'cluster_name': f'{CLUSTER_NAME_PREFIX}-{self.reference_genome.value.lower()}', + 'cluster_name': get_cluster_name( + self.reference_genome, + self.run_id, + ), }, ) - except Exception: # noqa: BLE001 + except google.api_core.exceptions.NotFound: return False - else: - return client.status.state == SUCCESS_STATE + if cluster.status.state == ERROR_STATE: + msg = f'Cluster {cluster.cluster_name} entered ERROR state' + logger.error(msg) + # This will return False when the cluster is "CREATING" + return cluster.status.state == RUNNING_STATE def run(self): operation = self.client.create_cluster( @@ -166,7 +174,8 @@ def run(self): 'cluster': get_cluster_config(self.reference_genome, self.run_id), }, ) - while True: + wait_s = 0 + while wait_s < TIMEOUT_S: if operation.done(): result = operation.result() # Will throw on failure! 
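
# Aside: both run() implementations poll a long-running Google Cloud
# operation with a bounded wait: check done(), sleep a fixed interval,
# give up after a timeout. The same shape in isolation (interval and
# timeout defaults are illustrative; unlike the tasks above, this sketch
# raises on timeout rather than simply stopping):
import time

def wait_for(operation, timeout_s: int = 900, interval_s: int = 3):
    waited_s = 0
    while waited_s < timeout_s:
        if operation.done():
            return operation.result()  # raises if the operation failed
        time.sleep(interval_s)
        waited_s += interval_s
    msg = f'Operation did not complete within {timeout_s}s'
    raise TimeoutError(msg)
# --- end sketch ---
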
msg = f'Created cluster {result.cluster_name} with cluster uuid: {result.cluster_uuid}' @@ -174,3 +183,4 @@ def run(self): break logger.info('Waiting for cluster spinup') time.sleep(3) + wait_s += 3 diff --git a/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster_test.py b/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster_test.py index c7f4f2958..e3fa8f3fa 100644 --- a/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster_test.py +++ b/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster_test.py @@ -28,15 +28,12 @@ def test_dataset_type_unsupported( mock_cluster_controller: Mock, _: Mock, ) -> None: - worker = luigi.worker.Worker() task = CreateDataprocClusterTask( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.MITO, run_id='1', ) - worker.add(task) - worker.run() - self.assertTrue(task.complete()) + self.assertRaises(RuntimeError, task.complete) def test_spinup_cluster_already_exists_failed( self, @@ -45,7 +42,8 @@ def test_spinup_cluster_already_exists_failed( ) -> None: mock_client = mock_cluster_controller.return_value mock_client.get_cluster.return_value = SimpleNamespace( - status=SimpleNamespace(state='FAILED'), + status=SimpleNamespace(state='ERROR'), + cluster_name='abc', ) mock_client.create_cluster.side_effect = ( google.api_core.exceptions.AlreadyExists('cluster exists') @@ -122,7 +120,7 @@ def test_spinup_cluster_doesnt_exist_success( operation = mock_client.create_cluster.return_value operation.done.side_effect = [False, True] operation.result.return_value = SimpleNamespace( - cluster_name='dataproc-cluster-1', + cluster_name='dataproc-cluster-5', cluster_uuid='12345', ) worker = luigi.worker.Worker() @@ -136,6 +134,6 @@ def test_spinup_cluster_doesnt_exist_success( mock_logger.info.assert_has_calls( [ call('Waiting for cluster spinup'), - call('Created cluster dataproc-cluster-1 with cluster uuid: 12345'), + call('Created cluster dataproc-cluster-5 with cluster uuid: 12345'), ], ) diff --git a/v03_pipeline/lib/tasks/dataproc/misc.py b/v03_pipeline/lib/tasks/dataproc/misc.py new file mode 100644 index 000000000..672f86dcc --- /dev/null +++ b/v03_pipeline/lib/tasks/dataproc/misc.py @@ -0,0 +1,21 @@ +import re + +import luigi + +from v03_pipeline.lib.model import ReferenceGenome + +CLUSTER_NAME_PREFIX = 'pipeline-runner' + + +def get_cluster_name(reference_genome: ReferenceGenome, run_id: str): + return f'{CLUSTER_NAME_PREFIX}-{reference_genome.value.lower()}-{run_id}' + + +def snake_to_kebab_arg(snake_string: str) -> str: + return '--' + re.sub(r'\_', '-', snake_string).lower() + + +def to_kebab_str_args(task: luigi.Task): + return [ + e for k, v in task.to_str_params().items() for e in (snake_to_kebab_arg(k), v) + ] diff --git a/v03_pipeline/lib/tasks/dataproc/misc_test.py b/v03_pipeline/lib/tasks/dataproc/misc_test.py new file mode 100644 index 000000000..335cacbf7 --- /dev/null +++ b/v03_pipeline/lib/tasks/dataproc/misc_test.py @@ -0,0 +1,58 @@ +import unittest +from unittest.mock import Mock, patch + +from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.tasks.dataproc.misc import to_kebab_str_args +from v03_pipeline.lib.tasks.dataproc.write_success_file_on_dataproc import ( + WriteSuccessFileOnDataprocTask, +) + + +@patch( + 'v03_pipeline.lib.tasks.dataproc.base_run_job_on_dataproc.dataproc.JobControllerClient', +) +class MiscTest(unittest.TestCase): + def test_to_kebab_str_args(self, _: Mock): + t = WriteSuccessFileOnDataprocTask( + reference_genome=ReferenceGenome.GRCh38, + 
dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path='test_callset', + project_guids=['R0113_test_project'], + project_remap_paths=['test_remap'], + project_pedigree_paths=['test_pedigree'], + run_id='a_misc_run', + ) + self.assertListEqual( + to_kebab_str_args(t), + [ + '--reference-genome', + 'GRCh38', + '--dataset-type', + 'SNV_INDEL', + '--run-id', + 'a_misc_run', + '--sample-type', + 'WGS', + '--callset-path', + 'test_callset', + '--project-guids', + '["R0113_test_project"]', + '--project-remap-paths', + '["test_remap"]', + '--project-pedigree-paths', + '["test_pedigree"]', + '--ignore-missing-samples-when-remapping', + 'False', + '--skip-check-sex-and-relatedness', + 'False', + '--skip-expect-filters', + 'False', + '--skip-expect-tdr-metrics', + 'False', + '--skip-validation', + 'False', + '--is-new-gcnv-joint-call', + 'False', + ], + ) diff --git a/v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc.py b/v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc.py new file mode 100644 index 000000000..0c48c344f --- /dev/null +++ b/v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc.py @@ -0,0 +1,22 @@ +import luigi + +from v03_pipeline.lib.paths import pipeline_run_success_file_path +from v03_pipeline.lib.tasks.base.base_loading_run_params import ( + BaseLoadingRunParams, +) +from v03_pipeline.lib.tasks.dataproc.base_run_job_on_dataproc import ( + BaseRunJobOnDataprocTask, +) +from v03_pipeline.lib.tasks.files import GCSorLocalTarget + + +@luigi.util.inherits(BaseLoadingRunParams) +class WriteSuccessFileOnDataprocTask(BaseRunJobOnDataprocTask): + def output(self) -> luigi.Target: + return GCSorLocalTarget( + pipeline_run_success_file_path( + self.reference_genome, + self.dataset_type, + self.run_id, + ), + ) diff --git a/v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc_test.py b/v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc_test.py new file mode 100644 index 000000000..62cb4d678 --- /dev/null +++ b/v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc_test.py @@ -0,0 +1,156 @@ +import unittest +from types import SimpleNamespace +from unittest.mock import Mock, call, patch + +import google.api_core.exceptions +import luigi + +from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.tasks.dataproc.write_success_file_on_dataproc import ( + WriteSuccessFileOnDataprocTask, +) +from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask + + +@patch( + 'v03_pipeline.lib.tasks.dataproc.base_run_job_on_dataproc.CreateDataprocClusterTask', +) +@patch( + 'v03_pipeline.lib.tasks.dataproc.base_run_job_on_dataproc.dataproc.JobControllerClient', +) +class WriteSuccessFileOnDataprocTaskTest(unittest.TestCase): + @patch('v03_pipeline.lib.tasks.dataproc.base_run_job_on_dataproc.logger') + def test_job_already_exists_failed( + self, + mock_logger: Mock, + mock_job_controller_client: Mock, + mock_create_dataproc_cluster: Mock, + ) -> None: + mock_create_dataproc_cluster.return_value = MockCompleteTask() + mock_client = mock_job_controller_client.return_value + mock_client.get_job.return_value = SimpleNamespace( + status=SimpleNamespace( + state='ERROR', + details='Google Cloud Dataproc Agent reports job failure. 
If logs are available, they can be found at...', + ), + ) + mock_client.submit_job_as_operation.side_effect = ( + google.api_core.exceptions.AlreadyExists('job exists') + ) + worker = luigi.worker.Worker() + task = WriteSuccessFileOnDataprocTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path='test_callset', + project_guids=['R0113_test_project'], + project_remap_paths=['test_remap'], + project_pedigree_paths=['test_pedigree'], + run_id='manual__2024-04-03', + ) + worker.add(task) + worker.run() + self.assertFalse(task.complete()) + mock_logger.error.assert_has_calls( + [ + call( + 'Job WriteSuccessFileOnDataprocTask-manual__2024-04-03 entered ERROR state', + ), + ], + ) + + def test_job_already_exists_success( + self, + mock_job_controller_client: Mock, + mock_create_dataproc_cluster: Mock, + ) -> None: + mock_create_dataproc_cluster.return_value = MockCompleteTask() + mock_client = mock_job_controller_client.return_value + mock_client.get_job.return_value = SimpleNamespace( + status=SimpleNamespace(state='DONE'), + ) + worker = luigi.worker.Worker() + task = WriteSuccessFileOnDataprocTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path='test_callset', + project_guids=['R0113_test_project'], + project_remap_paths=['test_remap'], + project_pedigree_paths=['test_pedigree'], + run_id='manual__2024-04-04', + ) + worker.add(task) + worker.run() + self.assertTrue(task.complete()) + + @patch('v03_pipeline.lib.tasks.dataproc.base_run_job_on_dataproc.logger') + def test_job_failed( + self, + mock_logger: Mock, + mock_job_controller_client: Mock, + mock_create_dataproc_cluster: Mock, + ) -> None: + mock_create_dataproc_cluster.return_value = MockCompleteTask() + mock_client = mock_job_controller_client.return_value + mock_client.get_job.side_effect = google.api_core.exceptions.NotFound( + 'job not found', + ) + operation = mock_client.submit_job_as_operation.return_value + operation.done.side_effect = [False, True] + operation.result.side_effect = Exception( + 'FailedPrecondition: 400 Job failed with message', + ) + worker = luigi.worker.Worker() + task = WriteSuccessFileOnDataprocTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path='test_callset', + project_guids=['R0113_test_project'], + project_remap_paths=['test_remap'], + project_pedigree_paths=['test_pedigree'], + run_id='manual__2024-04-05', + ) + worker.add(task) + worker.run() + self.assertFalse(task.complete()) + mock_logger.info.assert_has_calls( + [ + call( + 'Waiting for job completion WriteSuccessFileOnDataprocTask-manual__2024-04-05', + ), + ], + ) + + def test_job_success( + self, + mock_job_controller_client: Mock, + mock_create_dataproc_cluster: Mock, + ) -> None: + mock_create_dataproc_cluster.return_value = MockCompleteTask() + mock_client = mock_job_controller_client.return_value + mock_client.get_job.side_effect = [ + google.api_core.exceptions.NotFound( + 'job not found', + ), + SimpleNamespace( + status=SimpleNamespace(state='DONE'), + ), + ] + operation = mock_client.submit_job_as_operation.return_value + operation.done.side_effect = [False, True] + worker = luigi.worker.Worker() + task = WriteSuccessFileOnDataprocTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path='test_callset', + 
project_guids=['R0113_test_project'], + project_remap_paths=['test_remap'], + project_pedigree_paths=['test_pedigree'], + run_id='manual__2024-04-06', + ) + worker.add(task) + worker.run() + self.assertTrue(task.complete()) From db64b2d12719ddeabe40ad12a78055cc13bc31e6 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 10 Jan 2025 14:02:57 -0500 Subject: [PATCH 08/10] feat: Add rsync task between hail search data/reference data stored in gcs and on local disk. (#1006) * Support gcs dirs in rsync * ws * Add create dataproc cluster task * add dataproc * ruff * requirements * still struggling * Gencode refactor to remove gcs * bump reqs * Run dataproc job * lib * running * merge requirements * Flip'em * Better exception handling * Cleaner approach if less generalizable * write a test * Fix tests * lint * Add test for success * refactor to use a base class... better for adding support for multiple jobs * cleanup * ruff * Fix missing mock * Fix flapping test * first commit * Finish test and cleanup * Allow any order --- v03_pipeline/lib/model/environment.py | 4 + v03_pipeline/lib/paths.py | 62 ++++------- .../reference_datasets/reference_dataset.py | 15 +++ .../tasks/dataproc/rsync_to_seqr_app_dirs.py | 95 ++++++++++++++++ .../dataproc/rsync_to_seqr_app_dirs_test.py | 101 ++++++++++++++++++ 5 files changed, 237 insertions(+), 40 deletions(-) create mode 100644 v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs.py create mode 100644 v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs_test.py diff --git a/v03_pipeline/lib/model/environment.py b/v03_pipeline/lib/model/environment.py index 42ae4c658..72c8c5a45 100644 --- a/v03_pipeline/lib/model/environment.py +++ b/v03_pipeline/lib/model/environment.py @@ -50,6 +50,8 @@ GCLOUD_ZONE = os.environ.get('GCLOUD_ZONE') GCLOUD_REGION = os.environ.get('GCLOUD_REGION') PIPELINE_RUNNER_APP_VERSION = os.environ.get('PIPELINE_RUNNER_APP_VERSION', 'latest') +SEQR_APP_HAIL_SEARCH_DATA_DIR = os.environ.get('SEQR_APP_HAIL_SEARCH_DATA_DIR') +SEQR_APP_REFERENCE_DATASETS_DIR = os.environ.get('SEQR_APP_REFERENCE_DATASETS_DIR') @dataclass @@ -71,4 +73,6 @@ class Env: PIPELINE_RUNNER_APP_VERSION: str = PIPELINE_RUNNER_APP_VERSION PRIVATE_REFERENCE_DATASETS_DIR: str = PRIVATE_REFERENCE_DATASETS_DIR REFERENCE_DATASETS_DIR: str = REFERENCE_DATASETS_DIR + SEQR_APP_HAIL_SEARCH_DATA_DIR: str | None = SEQR_APP_HAIL_SEARCH_DATA_DIR + SEQR_APP_REFERENCE_DATASETS_DIR: str | None = SEQR_APP_REFERENCE_DATASETS_DIR VEP_REFERENCE_DATASETS_DIR: str = VEP_REFERENCE_DATASETS_DIR diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py index 2942b1dae..87aa024cf 100644 --- a/v03_pipeline/lib/paths.py +++ b/v03_pipeline/lib/paths.py @@ -17,7 +17,7 @@ ) -def _pipeline_prefix( +def pipeline_prefix( root: str, reference_genome: ReferenceGenome, dataset_type: DatasetType, @@ -36,38 +36,15 @@ def _pipeline_prefix( ) -def _v03_reference_data_prefix( - access_control: AccessControl, - reference_genome: ReferenceGenome, - dataset_type: DatasetType, -) -> str: - root = ( - Env.PRIVATE_REFERENCE_DATASETS_DIR - if access_control == AccessControl.PRIVATE - else Env.REFERENCE_DATASETS_DIR - ) - if FeatureFlag.INCLUDE_PIPELINE_VERSION_IN_PREFIX: - return os.path.join( - root, - PipelineVersion.V03.value, - reference_genome.value, - dataset_type.value, - ) - return os.path.join( - root, - reference_genome.value, - dataset_type.value, - ) - - def _v03_reference_dataset_prefix( + root: str, access_control: AccessControl, reference_genome: ReferenceGenome, ) -> str: root = 
( Env.PRIVATE_REFERENCE_DATASETS_DIR if access_control == AccessControl.PRIVATE - else Env.REFERENCE_DATASETS_DIR + else root ) if FeatureFlag.INCLUDE_PIPELINE_VERSION_IN_PREFIX: return os.path.join( @@ -88,7 +65,7 @@ def family_table_path( family_guid: str, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.HAIL_SEARCH_DATA_DIR, reference_genome, dataset_type, @@ -104,7 +81,7 @@ def tdr_metrics_dir( dataset_type: DatasetType, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.LOADING_DATASETS_DIR, reference_genome, dataset_type, @@ -130,7 +107,7 @@ def imported_callset_path( callset_path: str, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.LOADING_DATASETS_DIR, reference_genome, dataset_type, @@ -177,7 +154,7 @@ def project_table_path( project_guid: str, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.HAIL_SEARCH_DATA_DIR, reference_genome, dataset_type, @@ -194,7 +171,7 @@ def relatedness_check_table_path( callset_path: str, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.LOADING_DATASETS_DIR, reference_genome, dataset_type, @@ -210,7 +187,7 @@ def relatedness_check_tsv_path( callset_path: str, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.LOADING_DATASETS_DIR, reference_genome, dataset_type, @@ -227,7 +204,7 @@ def remapped_and_subsetted_callset_path( project_guid: str, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.LOADING_DATASETS_DIR, reference_genome, dataset_type, @@ -243,7 +220,7 @@ def lookup_table_path( dataset_type: DatasetType, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.HAIL_SEARCH_DATA_DIR, reference_genome, dataset_type, @@ -257,7 +234,7 @@ def runs_path( dataset_type: DatasetType, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.HAIL_SEARCH_DATA_DIR, reference_genome, dataset_type, @@ -272,7 +249,7 @@ def sex_check_table_path( callset_path: str, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.LOADING_DATASETS_DIR, reference_genome, dataset_type, @@ -306,6 +283,7 @@ def valid_reference_dataset_path( ) -> str | None: return os.path.join( _v03_reference_dataset_prefix( + Env.REFERENCE_DATASETS_DIR, reference_dataset.access_control, reference_genome, ), @@ -318,9 +296,13 @@ def valid_reference_dataset_query_path( reference_genome: ReferenceGenome, dataset_type: DatasetType, reference_dataset_query: ReferenceDatasetQuery, + root=None, ) -> str | None: + if not root: + root = Env.REFERENCE_DATASETS_DIR return os.path.join( _v03_reference_dataset_prefix( + root, reference_dataset_query.access_control, reference_genome, ), @@ -334,7 +316,7 @@ def variant_annotations_table_path( dataset_type: DatasetType, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.HAIL_SEARCH_DATA_DIR, reference_genome, dataset_type, @@ -348,7 +330,7 @@ def variant_annotations_vcf_path( dataset_type: DatasetType, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.HAIL_SEARCH_DATA_DIR, reference_genome, dataset_type, @@ -386,7 +368,7 @@ def project_remap_path( project_guid: str, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.LOADING_DATASETS_DIR, reference_genome, dataset_type, @@ -404,7 +386,7 @@ def project_pedigree_path( project_guid: str, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.LOADING_DATASETS_DIR, reference_genome, dataset_type, diff --git 
a/v03_pipeline/lib/reference_datasets/reference_dataset.py b/v03_pipeline/lib/reference_datasets/reference_dataset.py index 445e191aa..0ca80f8cb 100644 --- a/v03_pipeline/lib/reference_datasets/reference_dataset.py +++ b/v03_pipeline/lib/reference_datasets/reference_dataset.py @@ -166,6 +166,21 @@ class ReferenceDatasetQuery(BaseReferenceDataset, str, Enum): clinvar_path_variants = 'clinvar_path_variants' high_af_variants = 'high_af_variants' + @classmethod + def for_reference_genome_dataset_type( + cls, + reference_genome: ReferenceGenome, + dataset_type: DatasetType, + ) -> set['ReferenceDatasetQuery']: + return { + dataset + for dataset in super().for_reference_genome_dataset_type( + reference_genome, + dataset_type, + ) + if isinstance(dataset, cls) + } + @property def requires(self) -> ReferenceDataset: return { diff --git a/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs.py b/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs.py new file mode 100644 index 000000000..1f5e2e48c --- /dev/null +++ b/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs.py @@ -0,0 +1,95 @@ +import os +import subprocess + +import luigi + +from v03_pipeline.lib.model import Env +from v03_pipeline.lib.paths import pipeline_prefix, valid_reference_dataset_query_path +from v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDatasetQuery +from v03_pipeline.lib.tasks.base.base_loading_run_params import ( + BaseLoadingRunParams, +) + + +def hail_search_value(value: str) -> str: + return value.replace('SV', 'SV_WGS').replace( + 'GCNV', + 'SV_WES', + ) + + +def rsync_command(src_path: str, dst_path: str) -> list[str]: + return [ + '/bin/bash', + '-cx', + f'mkdir -p {dst_path} && gsutil -qm rsync -rd -x .*runs.* {src_path} {dst_path} && sync {dst_path}', + ] + + +@luigi.util.inherits(BaseLoadingRunParams) +class RsyncToSeqrAppDirsTask(luigi.Task): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.done = False + + def output(self) -> None: + return None + + def complete(self) -> bool: + return self.done + + def run(self) -> None: + if not ( + Env.SEQR_APP_HAIL_SEARCH_DATA_DIR and Env.SEQR_APP_REFERENCE_DATASETS_DIR + ): + self.done = True + return + + if not ( + Env.HAIL_SEARCH_DATA_DIR.startswith('gs://') + and Env.REFERENCE_DATASETS_DIR.startswith('gs://') + ): + msg = 'Overridden HAIL_SEARCH_DATA_DIR and REFERENCE_DATASETS_DIR must be Google Cloud buckets.' 
+ raise RuntimeError(msg) + + # Sync Pipeline Tables + src_path = pipeline_prefix( + Env.HAIL_SEARCH_DATA_DIR, + self.reference_genome, + self.dataset_type, + ) + dst_path = hail_search_value( + pipeline_prefix( + Env.SEQR_APP_HAIL_SEARCH_DATA_DIR, + self.reference_genome, + self.dataset_type, + ), + ) + subprocess.call( + rsync_command(src_path, dst_path), # noqa: S603 + ) + + # Sync RDQs + for query in ReferenceDatasetQuery.for_reference_genome_dataset_type( + self.reference_genome, + self.dataset_type, + ): + src_path = valid_reference_dataset_query_path( + self.reference_genome, + self.dataset_type, + query, + ) + dst_path = os.path.join( + hail_search_value( + valid_reference_dataset_query_path( + self.reference_genome, + self.dataset_type, + query, + Env.SEQR_APP_REFERENCE_DATASETS_DIR, + ), + ), + ) + subprocess.call( + rsync_command(src_path, dst_path), # noqa: S603 + ) + self.done = True diff --git a/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs_test.py b/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs_test.py new file mode 100644 index 000000000..225cfa93c --- /dev/null +++ b/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs_test.py @@ -0,0 +1,101 @@ +import unittest +from unittest.mock import Mock, call, patch + +import luigi + +from v03_pipeline.lib.model import ( + DatasetType, + Env, + FeatureFlag, + ReferenceGenome, + SampleType, +) +from v03_pipeline.lib.tasks.dataproc.rsync_to_seqr_app_dirs import ( + RsyncToSeqrAppDirsTask, +) + + +class RsyncToSeqrAppDirsTaskTest(unittest.TestCase): + @patch('v03_pipeline.lib.tasks.dataproc.rsync_to_seqr_app_dirs.subprocess') + def test_rsync_to_seqr_app_dirs_no_sync( + self, + mock_subprocess: Mock, + ) -> None: + worker = luigi.worker.Worker() + task = RsyncToSeqrAppDirsTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path='test_callset', + project_guids=['R0113_test_project'], + project_remap_paths=['test_remap'], + project_pedigree_paths=['test_pedigree'], + run_id='manual__2024-04-01', + ) + worker.add(task) + worker.run() + self.assertTrue(task.complete()) + mock_subprocess.call.assert_not_called() + + @patch('v03_pipeline.lib.tasks.dataproc.rsync_to_seqr_app_dirs.subprocess') + @patch.object(Env, 'HAIL_SEARCH_DATA_DIR', 'gs://test-hail-search-dir') + @patch.object(Env, 'REFERENCE_DATASETS_DIR', 'gs://test-reference-data-dir') + @patch.object( + Env, + 'SEQR_APP_HAIL_SEARCH_DATA_DIR', + '/var/seqr/seqr-hail-search-data', + ) + @patch.object( + Env, + 'SEQR_APP_REFERENCE_DATASETS_DIR', + '/var/seqr/seqr-reference-data', + ) + @patch.object( + FeatureFlag, + 'INCLUDE_PIPELINE_VERSION_IN_PREFIX', + False, + ) + def test_rsync_to_seqr_app_dirs_sync( + self, + mock_subprocess: Mock, + ) -> None: + worker = luigi.worker.Worker() + task = RsyncToSeqrAppDirsTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path='test_callset', + project_guids=['R0113_test_project'], + project_remap_paths=['test_remap'], + project_pedigree_paths=['test_pedigree'], + run_id='manual__2024-04-02', + ) + worker.add(task) + worker.run() + self.assertTrue(task.complete()) + mock_subprocess.call.assert_has_calls( + [ + call( + [ + '/bin/bash', + '-cx', + 'mkdir -p /var/seqr/seqr-hail-search-data/GRCh38/SNV_INDEL && gsutil -qm rsync -rd -x .*runs.* gs://test-hail-search-dir/GRCh38/SNV_INDEL /var/seqr/seqr-hail-search-data/GRCh38/SNV_INDEL && sync 
diff --git a/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs_test.py b/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs_test.py
new file mode 100644
index 000000000..225cfa93c
--- /dev/null
+++ b/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs_test.py
@@ -0,0 +1,101 @@
+import unittest
+from unittest.mock import Mock, call, patch
+
+import luigi
+
+from v03_pipeline.lib.model import (
+    DatasetType,
+    Env,
+    FeatureFlag,
+    ReferenceGenome,
+    SampleType,
+)
+from v03_pipeline.lib.tasks.dataproc.rsync_to_seqr_app_dirs import (
+    RsyncToSeqrAppDirsTask,
+)
+
+
+class RsyncToSeqrAppDirsTaskTest(unittest.TestCase):
+    @patch('v03_pipeline.lib.tasks.dataproc.rsync_to_seqr_app_dirs.subprocess')
+    def test_rsync_to_seqr_app_dirs_no_sync(
+        self,
+        mock_subprocess: Mock,
+    ) -> None:
+        worker = luigi.worker.Worker()
+        task = RsyncToSeqrAppDirsTask(
+            reference_genome=ReferenceGenome.GRCh38,
+            dataset_type=DatasetType.SNV_INDEL,
+            sample_type=SampleType.WGS,
+            callset_path='test_callset',
+            project_guids=['R0113_test_project'],
+            project_remap_paths=['test_remap'],
+            project_pedigree_paths=['test_pedigree'],
+            run_id='manual__2024-04-01',
+        )
+        worker.add(task)
+        worker.run()
+        self.assertTrue(task.complete())
+        mock_subprocess.call.assert_not_called()
+
+    @patch('v03_pipeline.lib.tasks.dataproc.rsync_to_seqr_app_dirs.subprocess')
+    @patch.object(Env, 'HAIL_SEARCH_DATA_DIR', 'gs://test-hail-search-dir')
+    @patch.object(Env, 'REFERENCE_DATASETS_DIR', 'gs://test-reference-data-dir')
+    @patch.object(
+        Env,
+        'SEQR_APP_HAIL_SEARCH_DATA_DIR',
+        '/var/seqr/seqr-hail-search-data',
+    )
+    @patch.object(
+        Env,
+        'SEQR_APP_REFERENCE_DATASETS_DIR',
+        '/var/seqr/seqr-reference-data',
+    )
+    @patch.object(
+        FeatureFlag,
+        'INCLUDE_PIPELINE_VERSION_IN_PREFIX',
+        False,
+    )
+    def test_rsync_to_seqr_app_dirs_sync(
+        self,
+        mock_subprocess: Mock,
+    ) -> None:
+        worker = luigi.worker.Worker()
+        task = RsyncToSeqrAppDirsTask(
+            reference_genome=ReferenceGenome.GRCh38,
+            dataset_type=DatasetType.SNV_INDEL,
+            sample_type=SampleType.WGS,
+            callset_path='test_callset',
+            project_guids=['R0113_test_project'],
+            project_remap_paths=['test_remap'],
+            project_pedigree_paths=['test_pedigree'],
+            run_id='manual__2024-04-02',
+        )
+        worker.add(task)
+        worker.run()
+        self.assertTrue(task.complete())
+        mock_subprocess.call.assert_has_calls(
+            [
+                call(
+                    [
+                        '/bin/bash',
+                        '-cx',
+                        'mkdir -p /var/seqr/seqr-hail-search-data/GRCh38/SNV_INDEL && gsutil -qm rsync -rd -x .*runs.* gs://test-hail-search-dir/GRCh38/SNV_INDEL /var/seqr/seqr-hail-search-data/GRCh38/SNV_INDEL && sync /var/seqr/seqr-hail-search-data/GRCh38/SNV_INDEL',
+                    ],
+                ),
+                call(
+                    [
+                        '/bin/bash',
+                        '-cx',
+                        'mkdir -p /var/seqr/seqr-reference-data/GRCh38/SNV_INDEL/high_af_variants.ht && gsutil -qm rsync -rd -x .*runs.* gs://test-reference-data-dir/GRCh38/SNV_INDEL/high_af_variants.ht /var/seqr/seqr-reference-data/GRCh38/SNV_INDEL/high_af_variants.ht && sync /var/seqr/seqr-reference-data/GRCh38/SNV_INDEL/high_af_variants.ht',
+                    ],
+                ),
+                call(
+                    [
+                        '/bin/bash',
+                        '-cx',
+                        'mkdir -p /var/seqr/seqr-reference-data/GRCh38/SNV_INDEL/clinvar_path_variants.ht && gsutil -qm rsync -rd -x .*runs.* gs://test-reference-data-dir/GRCh38/SNV_INDEL/clinvar_path_variants.ht /var/seqr/seqr-reference-data/GRCh38/SNV_INDEL/clinvar_path_variants.ht && sync /var/seqr/seqr-reference-data/GRCh38/SNV_INDEL/clinvar_path_variants.ht',
+                    ],
+                ),
+            ],
+            any_order=True,
+        )
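The test patches class attributes on `Env` directly rather than setting environment variables, presumably because `Env` resolves `os.environ` at import time. A minimal, self-contained version of that `patch.object` pattern (`Config` here is a hypothetical stand-in for the pipeline's `Env` class):

```python
import unittest
from unittest.mock import patch


class Config:
    HAIL_SEARCH_DATA_DIR = 'gs://prod-bucket'


class ConfigPatchTest(unittest.TestCase):
    @patch.object(Config, 'HAIL_SEARCH_DATA_DIR', '/tmp/test-dir')
    def test_patched_value(self):
        # The attribute is swapped only for the duration of this test.
        self.assertEqual(Config.HAIL_SEARCH_DATA_DIR, '/tmp/test-dir')


if __name__ == '__main__':
    unittest.main()
```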
From c56aa8f8b0335a7b6f81a327faaac83feca8cc0b Mon Sep 17 00:00:00 2001
From: Benjamin Blankenmeister
Date: Fri, 10 Jan 2025 15:45:16 -0500
Subject: [PATCH 09/10] refactor + chore: shuffle end of pipeline steps to
 enable dataproc for local users (#1009)

* Support gcs dirs in rsync
* ws
* Add create dataproc cluster task
* add dataproc
* ruff
* requirements
* still struggling
* Gencode refactor to remove gcs
* bump reqs
* Run dataproc job
* lib
* running
* merge requirements
* Flip'em
* Better exception handling
* Cleaner approach if less generalizable
* write a test
* Fix tests
* lint
* Add test for success
* refactor to use a base class... better for adding support for multiple jobs
* cleanup
* ruff
* Fix missing mock
* Fix flapping test
* first commit
* Finish test and cleanup
* Allow any order
* First commit
* ruff
* ruff
* finish off
* A few minor tweaks
---
 v03_pipeline/lib/model/feature_flag.py        |  2 ++
 v03_pipeline/lib/tasks/__init__.py            |  2 ++
 .../dataproc/base_run_job_on_dataproc.py      | 10 +++---
 v03_pipeline/lib/tasks/dataproc/misc_test.py  |  6 ++--
 .../tasks/dataproc/rsync_to_seqr_app_dirs.py  |  6 ++++
 .../dataproc/rsync_to_seqr_app_dirs_test.py   | 11 +++++++
 .../dataproc/run_pipeline_on_dataproc.py      | 16 ++++++++++
 ...st.py => run_pipeline_on_dataproc_test.py} | 16 +++++-----
 .../write_success_file_on_dataproc.py         | 22 -------------
 v03_pipeline/lib/tasks/run_pipeline.py        | 32 +++++++++++++++++++
 v03_pipeline/lib/tasks/write_success_file.py  | 31 +++++++-----------
 .../lib/tasks/write_success_file_test.py      | 17 ++--------
 12 files changed, 99 insertions(+), 72 deletions(-)
 create mode 100644 v03_pipeline/lib/tasks/dataproc/run_pipeline_on_dataproc.py
 rename v03_pipeline/lib/tasks/dataproc/{write_success_file_on_dataproc_test.py => run_pipeline_on_dataproc_test.py} (91%)
 delete mode 100644 v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc.py
 create mode 100644 v03_pipeline/lib/tasks/run_pipeline.py

diff --git a/v03_pipeline/lib/model/feature_flag.py b/v03_pipeline/lib/model/feature_flag.py
index 0b57e8643..8a033f60f 100644
--- a/v03_pipeline/lib/model/feature_flag.py
+++ b/v03_pipeline/lib/model/feature_flag.py
@@ -11,6 +11,7 @@
 INCLUDE_PIPELINE_VERSION_IN_PREFIX = (
     os.environ.get('INCLUDE_PIPELINE_VERSION_IN_PREFIX') == '1'
 )
+RUN_PIPELINE_ON_DATAPROC = os.environ.get('RUN_PIPELINE_ON_DATAPROC') == '1'
 SHOULD_TRIGGER_HAIL_BACKEND_RELOAD = (
     os.environ.get('SHOULD_TRIGGER_HAIL_BACKEND_RELOAD') == '1'
 )
@@ -23,4 +24,5 @@ class FeatureFlag:
     EXPECT_TDR_METRICS: bool = EXPECT_TDR_METRICS
     EXPECT_WES_FILTERS: bool = EXPECT_WES_FILTERS
     INCLUDE_PIPELINE_VERSION_IN_PREFIX: bool = INCLUDE_PIPELINE_VERSION_IN_PREFIX
+    RUN_PIPELINE_ON_DATAPROC: bool = RUN_PIPELINE_ON_DATAPROC
     SHOULD_TRIGGER_HAIL_BACKEND_RELOAD: bool = SHOULD_TRIGGER_HAIL_BACKEND_RELOAD
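The new flag follows the module's existing convention: the environment variable is read once at import time, and only the literal string '1' enables it. A runnable sketch of the same pattern outside the pipeline:

```python
import os

# Read once at import time; anything other than the exact string '1' is False.
RUN_PIPELINE_ON_DATAPROC = os.environ.get('RUN_PIPELINE_ON_DATAPROC') == '1'


class FeatureFlag:
    RUN_PIPELINE_ON_DATAPROC: bool = RUN_PIPELINE_ON_DATAPROC


print(FeatureFlag.RUN_PIPELINE_ON_DATAPROC)
```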
diff --git a/v03_pipeline/lib/tasks/__init__.py b/v03_pipeline/lib/tasks/__init__.py
index 1b986f668..76832c5c5 100644
--- a/v03_pipeline/lib/tasks/__init__.py
+++ b/v03_pipeline/lib/tasks/__init__.py
@@ -11,6 +11,7 @@
 from v03_pipeline.lib.tasks.reference_data.update_variant_annotations_table_with_updated_reference_dataset import (
     UpdateVariantAnnotationsTableWithUpdatedReferenceDataset,
 )
+from v03_pipeline.lib.tasks.run_pipeline import RunPipelineTask
 from v03_pipeline.lib.tasks.update_lookup_table import (
     UpdateLookupTableTask,
 )
@@ -46,6 +47,7 @@
     'DeleteProjectTablesTask',
     'MigrateAllLookupTablesTask',
     'MigrateAllVariantAnnotationsTablesTask',
+    'RunPipelineTask',
     'UpdateProjectTableTask',
     'UpdateProjectTablesWithDeletedFamiliesTask',
     'UpdateLookupTableTask',
diff --git a/v03_pipeline/lib/tasks/dataproc/base_run_job_on_dataproc.py b/v03_pipeline/lib/tasks/dataproc/base_run_job_on_dataproc.py
index 1094d651e..b8162392e 100644
--- a/v03_pipeline/lib/tasks/dataproc/base_run_job_on_dataproc.py
+++ b/v03_pipeline/lib/tasks/dataproc/base_run_job_on_dataproc.py
@@ -33,12 +33,12 @@ def __init__(self, *args, **kwargs):
         )

     @property
-    def task_name(self):
-        return self.get_task_family().split('.')[-1]
+    def task(self):
+        raise NotImplementedError

     @property
     def job_id(self):
-        return f'{self.task_name}-{self.run_id}'
+        return f'{self.task.task_family}-{self.run_id}'

     def requires(self) -> [luigi.Task]:
         return [self.clone(CreateDataprocClusterTask)]
@@ -58,7 +58,7 @@ def complete(self) -> bool:
         except google.api_core.exceptions.NotFound:
             return False
         if job.status.state == ERROR_STATE:
-            msg = f'Job {self.task_name}-{self.run_id} entered ERROR state'
+            msg = f'Job {self.task.task_family}-{self.run_id} entered ERROR state'
             logger.error(msg)
             logger.error(job.status.details)
         return job.status.state == DONE_STATE
@@ -81,7 +81,7 @@ def run(self):
                 'pyspark_job': {
                     'main_python_file_uri': f'{SEQR_PIPELINE_RUNNER_BUILD}/bin/run_task.py',
                     'args': [
-                        self.task_name,
+                        self.task.task_family,
                         '--local-scheduler',
                         *to_kebab_str_args(self),
                     ],
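The refactor replaces the string-derived `task_name` with a `task` property that returns the Luigi task class to run, so both the Dataproc job id and the pyspark entrypoint arguments come from the wrapped task's `task_family` rather than from the wrapper's own class name. A toy version of that shape (the class names here are illustrative, not the pipeline's API):

```python
import luigi


class BaseRunJobTask(luigi.Task):
    run_id = luigi.Parameter()

    @property
    def task(self) -> type[luigi.Task]:
        # Subclasses say which task class the Dataproc job should run.
        raise NotImplementedError

    @property
    def job_id(self) -> str:
        # task_family is luigi's canonical name for a task class.
        return f'{self.task.task_family}-{self.run_id}'


class RunPipelineTask(luigi.Task):
    run_id = luigi.Parameter()


class RunPipelineOnDataprocTask(BaseRunJobTask):
    @property
    def task(self) -> type[luigi.Task]:
        return RunPipelineTask


print(RunPipelineOnDataprocTask(run_id='manual__2024-04-03').job_id)
# RunPipelineTask-manual__2024-04-03
```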
diff --git a/v03_pipeline/lib/tasks/dataproc/misc_test.py b/v03_pipeline/lib/tasks/dataproc/misc_test.py
index 335cacbf7..22729bedc 100644
--- a/v03_pipeline/lib/tasks/dataproc/misc_test.py
+++ b/v03_pipeline/lib/tasks/dataproc/misc_test.py
@@ -3,8 +3,8 @@

 from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
 from v03_pipeline.lib.tasks.dataproc.misc import to_kebab_str_args
-from v03_pipeline.lib.tasks.dataproc.write_success_file_on_dataproc import (
-    WriteSuccessFileOnDataprocTask,
+from v03_pipeline.lib.tasks.dataproc.rsync_to_seqr_app_dirs import (
+    RsyncToSeqrAppDirsTask,
 )
@@ -13,7 +13,7 @@
 )
 class MiscTest(unittest.TestCase):
     def test_to_kebab_str_args(self, _: Mock):
-        t = WriteSuccessFileOnDataprocTask(
+        t = RsyncToSeqrAppDirsTask(
             reference_genome=ReferenceGenome.GRCh38,
             dataset_type=DatasetType.SNV_INDEL,
             sample_type=SampleType.WGS,
diff --git a/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs.py b/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs.py
index 1f5e2e48c..194f27611 100644
--- a/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs.py
+++ b/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs.py
@@ -9,6 +9,9 @@
 from v03_pipeline.lib.tasks.base.base_loading_run_params import (
     BaseLoadingRunParams,
 )
+from v03_pipeline.lib.tasks.dataproc.run_pipeline_on_dataproc import (
+    RunPipelineOnDataprocTask,
+)


 def hail_search_value(value: str) -> str:
@@ -38,6 +41,9 @@ def output(self) -> None:
     def complete(self) -> bool:
         return self.done

+    def requires(self) -> luigi.Task:
+        return self.clone(RunPipelineOnDataprocTask)
+
     def run(self) -> None:
         if not (
             Env.SEQR_APP_HAIL_SEARCH_DATA_DIR and Env.SEQR_APP_REFERENCE_DATASETS_DIR
diff --git a/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs_test.py b/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs_test.py
index 225cfa93c..e4894e1db 100644
--- a/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs_test.py
+++ b/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs_test.py
@@ -13,14 +13,20 @@
 from v03_pipeline.lib.tasks.dataproc.rsync_to_seqr_app_dirs import (
     RsyncToSeqrAppDirsTask,
 )
+from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask


 class RsyncToSeqrAppDirsTaskTest(unittest.TestCase):
+    @patch(
+        'v03_pipeline.lib.tasks.dataproc.rsync_to_seqr_app_dirs.RunPipelineOnDataprocTask',
+    )
     @patch('v03_pipeline.lib.tasks.dataproc.rsync_to_seqr_app_dirs.subprocess')
     def test_rsync_to_seqr_app_dirs_no_sync(
         self,
         mock_subprocess: Mock,
+        mock_run_pipeline_task: Mock,
     ) -> None:
+        mock_run_pipeline_task.return_value = MockCompleteTask()
         worker = luigi.worker.Worker()
         task = RsyncToSeqrAppDirsTask(
             reference_genome=ReferenceGenome.GRCh38,
@@ -37,6 +43,9 @@ def test_rsync_to_seqr_app_dirs_no_sync(
         self.assertTrue(task.complete())
         mock_subprocess.call.assert_not_called()

+    @patch(
+        'v03_pipeline.lib.tasks.dataproc.rsync_to_seqr_app_dirs.RunPipelineOnDataprocTask',
+    )
     @patch('v03_pipeline.lib.tasks.dataproc.rsync_to_seqr_app_dirs.subprocess')
     @patch.object(Env, 'HAIL_SEARCH_DATA_DIR', 'gs://test-hail-search-dir')
     @patch.object(Env, 'REFERENCE_DATASETS_DIR', 'gs://test-reference-data-dir')
@@ -58,7 +67,9 @@ def test_rsync_to_seqr_app_dirs_sync(
         self,
         mock_subprocess: Mock,
+        mock_run_pipeline_task: Mock,
     ) -> None:
+        mock_run_pipeline_task.return_value = MockCompleteTask()
         worker = luigi.worker.Worker()
         task = RsyncToSeqrAppDirsTask(
             reference_genome=ReferenceGenome.GRCh38,
diff --git a/v03_pipeline/lib/tasks/dataproc/run_pipeline_on_dataproc.py b/v03_pipeline/lib/tasks/dataproc/run_pipeline_on_dataproc.py
new file mode 100644
index 000000000..15ee74fb4
--- /dev/null
+++ b/v03_pipeline/lib/tasks/dataproc/run_pipeline_on_dataproc.py
@@ -0,0 +1,16 @@
+import luigi
+
+from v03_pipeline.lib.tasks.base.base_loading_run_params import (
+    BaseLoadingRunParams,
+)
+from v03_pipeline.lib.tasks.dataproc.base_run_job_on_dataproc import (
+    BaseRunJobOnDataprocTask,
+)
+from v03_pipeline.lib.tasks.run_pipeline import RunPipelineTask
+
+
+@luigi.util.inherits(BaseLoadingRunParams)
+class RunPipelineOnDataprocTask(BaseRunJobOnDataprocTask):
+    @property
+    def task(self) -> luigi.Task:
+        return RunPipelineTask
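The new task above is what `RsyncToSeqrAppDirsTask` now requires, wired through `self.clone(...)`. A small runnable reminder of what `clone()` does with shared parameters:

```python
import luigi


class DownstreamTask(luigi.Task):
    run_id = luigi.Parameter()


class UpstreamTask(luigi.Task):
    run_id = luigi.Parameter()

    def requires(self):
        # clone() re-instantiates the target task with this task's matching
        # parameter values, so run_id does not need to be repeated by hand.
        return self.clone(DownstreamTask)


task = UpstreamTask(run_id='manual__2024-04-01')
print(task.requires().run_id)  # manual__2024-04-01
```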
diff --git a/v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc_test.py b/v03_pipeline/lib/tasks/dataproc/run_pipeline_on_dataproc_test.py
similarity index 91%
rename from v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc_test.py
rename to v03_pipeline/lib/tasks/dataproc/run_pipeline_on_dataproc_test.py
index 62cb4d678..abde6833f 100644
--- a/v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc_test.py
+++ b/v03_pipeline/lib/tasks/dataproc/run_pipeline_on_dataproc_test.py
@@ -6,8 +6,8 @@
 import luigi

 from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
-from v03_pipeline.lib.tasks.dataproc.write_success_file_on_dataproc import (
-    WriteSuccessFileOnDataprocTask,
+from v03_pipeline.lib.tasks.dataproc.run_pipeline_on_dataproc import (
+    RunPipelineOnDataprocTask,
 )
 from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask
@@ -38,7 +38,7 @@ def test_job_already_exists_failed(
             google.api_core.exceptions.AlreadyExists('job exists')
         )
         worker = luigi.worker.Worker()
-        task = WriteSuccessFileOnDataprocTask(
+        task = RunPipelineOnDataprocTask(
             reference_genome=ReferenceGenome.GRCh38,
             dataset_type=DatasetType.SNV_INDEL,
             sample_type=SampleType.WGS,
@@ -54,7 +54,7 @@ def test_job_already_exists_failed(
         mock_logger.error.assert_has_calls(
             [
                 call(
-                    'Job WriteSuccessFileOnDataprocTask-manual__2024-04-03 entered ERROR state',
+                    'Job RunPipelineTask-manual__2024-04-03 entered ERROR state',
                 ),
             ],
         )
@@ -70,7 +70,7 @@ def test_job_already_exists_success(
             status=SimpleNamespace(state='DONE'),
         )
         worker = luigi.worker.Worker()
-        task = WriteSuccessFileOnDataprocTask(
+        task = RunPipelineOnDataprocTask(
             reference_genome=ReferenceGenome.GRCh38,
             dataset_type=DatasetType.SNV_INDEL,
             sample_type=SampleType.WGS,
@@ -102,7 +102,7 @@ def test_job_failed(
             'FailedPrecondition: 400 Job failed with message',
         )
         worker = luigi.worker.Worker()
-        task = WriteSuccessFileOnDataprocTask(
+        task = RunPipelineOnDataprocTask(
             reference_genome=ReferenceGenome.GRCh38,
             dataset_type=DatasetType.SNV_INDEL,
             sample_type=SampleType.WGS,
@@ -118,7 +118,7 @@ def test_job_failed(
         mock_logger.info.assert_has_calls(
             [
                 call(
-                    'Waiting for job completion WriteSuccessFileOnDataprocTask-manual__2024-04-05',
+                    'Waiting for job completion RunPipelineTask-manual__2024-04-05',
                 ),
             ],
         )
@@ -141,7 +141,7 @@ def test_job_success(
         operation = mock_client.submit_job_as_operation.return_value
         operation.done.side_effect = [False, True]
         worker = luigi.worker.Worker()
-        task = WriteSuccessFileOnDataprocTask(
+        task = RunPipelineOnDataprocTask(
             reference_genome=ReferenceGenome.GRCh38,
             dataset_type=DatasetType.SNV_INDEL,
             sample_type=SampleType.WGS,
diff --git a/v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc.py b/v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc.py
deleted file mode 100644
index 0c48c344f..000000000
--- a/v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc.py
+++ /dev/null
@@ -1,22 +0,0 @@
-import luigi
-
-from v03_pipeline.lib.paths import pipeline_run_success_file_path
-from v03_pipeline.lib.tasks.base.base_loading_run_params import (
-    BaseLoadingRunParams,
-)
-from v03_pipeline.lib.tasks.dataproc.base_run_job_on_dataproc import (
-    BaseRunJobOnDataprocTask,
-)
-from v03_pipeline.lib.tasks.files import GCSorLocalTarget
-
-
-@luigi.util.inherits(BaseLoadingRunParams)
-class WriteSuccessFileOnDataprocTask(BaseRunJobOnDataprocTask):
-    def output(self) -> luigi.Target:
-        return GCSorLocalTarget(
-            pipeline_run_success_file_path(
-                self.reference_genome,
-                self.dataset_type,
-                self.run_id,
-            ),
-        )
diff --git a/v03_pipeline/lib/tasks/run_pipeline.py b/v03_pipeline/lib/tasks/run_pipeline.py
new file mode 100644
index 000000000..1711143d2
--- /dev/null
+++ b/v03_pipeline/lib/tasks/run_pipeline.py
@@ -0,0 +1,32 @@
+import luigi
+import luigi.util
+
+from v03_pipeline.lib.tasks.base.base_loading_run_params import (
+    BaseLoadingRunParams,
+)
+from v03_pipeline.lib.tasks.update_variant_annotations_table_with_new_samples import (
+    UpdateVariantAnnotationsTableWithNewSamplesTask,
+)
+from v03_pipeline.lib.tasks.write_metadata_for_run import WriteMetadataForRunTask
+from v03_pipeline.lib.tasks.write_project_family_tables import (
+    WriteProjectFamilyTablesTask,
+)
+
+
+@luigi.util.inherits(BaseLoadingRunParams)
+class RunPipelineTask(luigi.WrapperTask):
+    def requires(self):
+        requirements = [
+            self.clone(WriteMetadataForRunTask),
+            self.clone(UpdateVariantAnnotationsTableWithNewSamplesTask),
+        ]
+        return [
+            *requirements,
+            *[
+                self.clone(
+                    WriteProjectFamilyTablesTask,
+                    project_i=i,
+                )
+                for i in range(len(self.project_guids))
+            ],
+        ]
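`RunPipelineTask` is a `luigi.WrapperTask`: it produces no output of its own and counts as complete exactly when everything it requires is complete. A minimal sketch of that behavior with stand-in subtasks:

```python
import luigi


class PartA(luigi.Task):
    def output(self):
        return luigi.LocalTarget('/tmp/part_a.txt')

    def run(self):
        with self.output().open('w') as f:
            f.write('a')


class PartB(luigi.Task):
    def output(self):
        return luigi.LocalTarget('/tmp/part_b.txt')

    def run(self):
        with self.output().open('w') as f:
            f.write('b')


class RunAll(luigi.WrapperTask):
    # Complete only once PartA and PartB are both complete.
    def requires(self):
        return [PartA(), PartB()]


if __name__ == '__main__':
    luigi.build([RunAll()], local_scheduler=True)
```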
diff --git a/v03_pipeline/lib/tasks/write_success_file.py b/v03_pipeline/lib/tasks/write_success_file.py
index 60c90f81d..2428c6ceb 100644
--- a/v03_pipeline/lib/tasks/write_success_file.py
+++ b/v03_pipeline/lib/tasks/write_success_file.py
@@ -1,16 +1,16 @@
 import luigi
 import luigi.util

+from v03_pipeline.lib.model.feature_flag import FeatureFlag
 from v03_pipeline.lib.paths import pipeline_run_success_file_path
-from v03_pipeline.lib.tasks import WriteProjectFamilyTablesTask
 from v03_pipeline.lib.tasks.base.base_loading_run_params import (
     BaseLoadingRunParams,
 )
-from v03_pipeline.lib.tasks.files import GCSorLocalTarget
-from v03_pipeline.lib.tasks.update_variant_annotations_table_with_new_samples import (
-    UpdateVariantAnnotationsTableWithNewSamplesTask,
+from v03_pipeline.lib.tasks.dataproc.rsync_to_seqr_app_dirs import (
+    RsyncToSeqrAppDirsTask,
 )
-from v03_pipeline.lib.tasks.write_metadata_for_run import WriteMetadataForRunTask
+from v03_pipeline.lib.tasks.files import GCSorLocalTarget
+from v03_pipeline.lib.tasks.run_pipeline import RunPipelineTask


 @luigi.util.inherits(BaseLoadingRunParams)
@@ -24,21 +24,12 @@ def output(self) -> luigi.Target:
             ),
         )

-    def requires(self):
-        requirements = [
-            self.clone(WriteMetadataForRunTask),
-            self.clone(UpdateVariantAnnotationsTableWithNewSamplesTask),
-        ]
-        return [
-            *requirements,
-            *[
-                self.clone(
-                    WriteProjectFamilyTablesTask,
-                    project_i=i,
-                )
-                for i in range(len(self.project_guids))
-            ],
-        ]
+    def requires(self) -> luigi.Task:
+        return (
+            self.clone(RsyncToSeqrAppDirsTask)
+            if FeatureFlag.RUN_PIPELINE_ON_DATAPROC
+            else self.clone(RunPipelineTask)
+        )

     def run(self):
         with self.output().open('w') as f:
diff --git a/v03_pipeline/lib/tasks/write_success_file_test.py b/v03_pipeline/lib/tasks/write_success_file_test.py
index 18df69e86..4949481a6 100644
--- a/v03_pipeline/lib/tasks/write_success_file_test.py
+++ b/v03_pipeline/lib/tasks/write_success_file_test.py
@@ -10,24 +10,13 @@

 class WriteSuccessFileTaskTest(MockedDatarootTestCase):
     @mock.patch(
-        'v03_pipeline.lib.tasks.write_success_file.WriteMetadataForRunTask',
-    )
-    @mock.patch(
-        'v03_pipeline.lib.tasks.write_success_file.WriteProjectFamilyTablesTask',
-    )
-    @mock.patch(
-        'v03_pipeline.lib.tasks.write_success_file.UpdateVariantAnnotationsTableWithNewSamplesTask',
+        'v03_pipeline.lib.tasks.write_success_file.RunPipelineTask',
     )
     def test_write_success_file_task(
         self,
-        mock_update_variant_annotations_task,
-        mock_write_project_fam_tables,
-        mock_write_metadata_for_run_task,
+        mock_run_pipeline_task: mock.Mock,
     ) -> None:
-        mock_write_metadata_for_run_task.return_value = MockCompleteTask()
-        mock_update_variant_annotations_task.return_value = MockCompleteTask()
-        mock_write_project_fam_tables.return_value = MockCompleteTask()
-
+        mock_run_pipeline_task.return_value = MockCompleteTask()
         worker = luigi.worker.Worker()
         write_success_file = WriteSuccessFileTask(
             reference_genome=ReferenceGenome.GRCh38,
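With the dependency tree collapsed behind `RunPipelineTask`, the test only needs to mock one thing. `MockCompleteTask` lives in `v03_pipeline/lib/test/mock_complete_task.py` and is presumably little more than a task whose `complete()` always returns True, along these lines:

```python
import luigi


class MockCompleteTask(luigi.Task):
    # Reports itself complete immediately, so the scheduler never runs it
    # and the task under test can proceed as if its dependency succeeded.
    def complete(self) -> bool:
        return True
```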
From b9df3b68734a679839f3965a96a681d7a49d4352 Mon Sep 17 00:00:00 2001
From: Benjamin Blankenmeister
Date: Mon, 13 Jan 2025 15:40:15 -0500
Subject: [PATCH 10/10] feat: Misaligned ploidy samples (#1013)

* Misaligned ploidy samples
* regex
* Make sure s is a key in tests, update validation errors to support body
* unnecessary key
* Update v03_pipeline/lib/misc/validation.py

Co-authored-by: Daniel Marten <78616802+matren395@users.noreply.github.com>

* Update validation_test.py
* validation
* fix test

---------

Co-authored-by: Daniel Marten <78616802+matren395@users.noreply.github.com>
---
 v03_pipeline/lib/misc/validation.py           |  14 +-
 v03_pipeline/lib/misc/validation_test.py      | 418 ++++++++++--------
 .../lib/tasks/validate_callset_test.py        |   1 -
 .../tasks/write_validation_errors_for_run.py  |   8 +-
 4 files changed, 237 insertions(+), 204 deletions(-)

diff --git a/v03_pipeline/lib/misc/validation.py b/v03_pipeline/lib/misc/validation.py
index 113e79b53..5d23397be 100644
--- a/v03_pipeline/lib/misc/validation.py
+++ b/v03_pipeline/lib/misc/validation.py
@@ -151,10 +151,16 @@ def validate_imputed_sex_ploidy(
             )
         ),
     )
-    discrepant_rate = mt.aggregate_cols(hl.agg.fraction(mt.discrepant))
-    if discrepant_rate:
-        msg = f'{discrepant_rate:.2%} of samples have misaligned ploidy with their provided imputed sex.'
-        raise SeqrValidationError(msg)
+    discrepant_samples = mt.aggregate_cols(
+        hl.agg.filter(mt.discrepant, hl.agg.collect_as_set(mt.s)),
+    )
+    if discrepant_samples:
+        sorted_discrepant_samples = sorted(discrepant_samples)
+        msg = f'Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : {sorted_discrepant_samples[:10]}'
+        raise SeqrValidationError(
+            msg,
+            {'imputed_sex_ploidy_failures': sorted_discrepant_samples},
+        )


 def validate_sample_type(
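The aggregation change swaps an anonymous fraction for the actual offending sample ids: `hl.agg.filter` restricts the aggregation to discrepant columns, and `collect_as_set` gathers their `s` values, which is what makes the new actionable error message possible. A hedged, self-contained demonstration of that aggregation shape on a toy MatrixTable (requires a local Hail installation; this is not the pipeline's real ploidy logic):

```python
import hail as hl

# Four columns; mark columns 0 and 2 as "discrepant" for demonstration.
mt = hl.utils.range_matrix_table(n_rows=1, n_cols=4)
mt = mt.annotate_cols(
    discrepant=mt.col_idx % 2 == 0,
    s='s' + hl.str(mt.col_idx),
)
discrepant_samples = mt.aggregate_cols(
    hl.agg.filter(mt.discrepant, hl.agg.collect_as_set(mt.s)),
)
print(sorted(discrepant_samples))  # ['s0', 's2']
```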
diff --git a/v03_pipeline/lib/misc/validation_test.py b/v03_pipeline/lib/misc/validation_test.py
index f32900a99..b1057e1e2 100644
--- a/v03_pipeline/lib/misc/validation_test.py
+++ b/v03_pipeline/lib/misc/validation_test.py
@@ -18,60 +18,68 @@


 def _mt_from_contigs(contigs):
-    return hl.MatrixTable.from_parts(
-        rows={
-            'locus': [
-                hl.Locus(
-                    contig=contig,
-                    position=1,
-                    reference_genome='GRCh38',
-                )
-                for contig in contigs
-            ],
-        },
-        cols={'s': ['sample_1']},
-        entries={'HL': [[0.0] for _ in range(len(contigs))]},
-    ).key_rows_by('locus')
+    return (
+        hl.MatrixTable.from_parts(
+            rows={
+                'locus': [
+                    hl.Locus(
+                        contig=contig,
+                        position=1,
+                        reference_genome='GRCh38',
+                    )
+                    for contig in contigs
+                ],
+            },
+            cols={'s': ['sample_1']},
+            entries={'HL': [[0.0] for _ in range(len(contigs))]},
+        )
+        .key_rows_by('locus')
+        .key_cols_by('s')
+    )


 class ValidationTest(unittest.TestCase):
     def test_validate_allele_type(self) -> None:
-        mt = hl.MatrixTable.from_parts(
-            rows={
-                'locus': [
-                    hl.Locus(
-                        contig='chr1',
-                        position=1,
-                        reference_genome='GRCh38',
-                    ),
-                    hl.Locus(
-                        contig='chr1',
-                        position=2,
-                        reference_genome='GRCh38',
-                    ),
-                    hl.Locus(
-                        contig='chr1',
-                        position=3,
-                        reference_genome='GRCh38',
-                    ),
-                    hl.Locus(
-                        contig='chr1',
-                        position=4,
-                        reference_genome='GRCh38',
-                    ),
-                ],
-                'alleles': [
-                    ['A', 'T'],
-                    # NB: star alleles should pass through this validation just fine,
-                    # but are eventually filtered out upstream.
-                    ['A', '*'],
-                    ['A', '-'],
-                    ['A', ''],
-                ],
-            },
-            cols={'s': ['sample_1']},
-            entries={'HL': [[0.0], [0.0], [0.0], [0.0]]},
-        ).key_rows_by('locus', 'alleles')
+        mt = (
+            hl.MatrixTable.from_parts(
+                rows={
+                    'locus': [
+                        hl.Locus(
+                            contig='chr1',
+                            position=1,
+                            reference_genome='GRCh38',
+                        ),
+                        hl.Locus(
+                            contig='chr1',
+                            position=2,
+                            reference_genome='GRCh38',
+                        ),
+                        hl.Locus(
+                            contig='chr1',
+                            position=3,
+                            reference_genome='GRCh38',
+                        ),
+                        hl.Locus(
+                            contig='chr1',
+                            position=4,
+                            reference_genome='GRCh38',
+                        ),
+                    ],
+                    'alleles': [
+                        ['A', 'T'],
+                        # NB: star alleles should pass through this validation just fine,
+                        # but are eventually filtered out upstream.
+                        ['A', '*'],
+                        ['A', '-'],
+                        ['A', ''],
+                    ],
+                },
+                cols={'s': ['sample_1']},
+                entries={'HL': [[0.0], [0.0], [0.0], [0.0]]},
+            )
+            .key_rows_by('locus', 'alleles')
+            .key_cols_by('s')
+        )
         self.assertRaisesRegex(
             SeqrValidationError,
             "Alleles with invalid AlleleType are present in the callset: \\[\\('A', '-'\\), \\('A', ''\\)\\]",
@@ -80,28 +88,32 @@ def test_validate_allele_type(self) -> None:
             DatasetType.SNV_INDEL,
         )

-        mt = hl.MatrixTable.from_parts(
-            rows={
-                'locus': [
-                    hl.Locus(
-                        contig='chr1',
-                        position=1,
-                        reference_genome='GRCh38',
-                    ),
-                    hl.Locus(
-                        contig='chr1',
-                        position=2,
-                        reference_genome='GRCh38',
-                    ),
-                ],
-                'alleles': [
-                    ['C', ''],
-                    ['A', ''],
-                ],
-            },
-            cols={'s': ['sample_1']},
-            entries={'HL': [[0.0], [0.0]]},
-        ).key_rows_by('locus', 'alleles')
+        mt = (
+            hl.MatrixTable.from_parts(
+                rows={
+                    'locus': [
+                        hl.Locus(
+                            contig='chr1',
+                            position=1,
+                            reference_genome='GRCh38',
+                        ),
+                        hl.Locus(
+                            contig='chr1',
+                            position=2,
+                            reference_genome='GRCh38',
+                        ),
+                    ],
+                    'alleles': [
+                        ['C', ''],
+                        ['A', ''],
+                    ],
+                },
+                cols={'s': ['sample_1']},
+                entries={'HL': [[0.0], [0.0]]},
+            )
+            .key_rows_by('locus', 'alleles')
+            .key_cols_by('s')
+        )
         self.assertRaisesRegex(
             SeqrValidationError,
             'Alleles with invalid allele are present in the callset. This appears to be a GVCF containing records for sites with no variants.',
@@ -122,116 +134,128 @@ def test_validate_imputed_sex_ploidy(self) -> None:
         sex_check_ht = hl.read_table(TEST_SEX_CHECK_1)

         # All calls on X chromosome are valid
-        mt = hl.MatrixTable.from_parts(
-            rows={
-                'locus': [
-                    hl.Locus(
-                        contig='chrX',
-                        position=1,
-                        reference_genome='GRCh38',
-                    ),
-                ],
-            },
-            cols={
-                's': [
-                    female_sample,
-                    male_sample_1,
-                    x0_sample,
-                    xxy_sample,
-                    xyy_sample,
-                    xxx_sample,
-                ],
-            },
-            entries={
-                'GT': [
-                    [
-                        hl.Call(alleles=[0, 0], phased=False),
-                        hl.Call(alleles=[0], phased=False),
-                        hl.Call(alleles=[0, 0], phased=False),  # X0
-                        hl.Call(alleles=[0, 0], phased=False),  # XXY
-                        hl.Call(alleles=[0, 0], phased=False),  # XYY
-                        hl.Call(alleles=[0, 0], phased=False),  # XXX
-                    ],
-                ],
-            },
-        ).key_rows_by('locus')
+        mt = (
+            hl.MatrixTable.from_parts(
+                rows={
+                    'locus': [
+                        hl.Locus(
+                            contig='chrX',
+                            position=1,
+                            reference_genome='GRCh38',
+                        ),
+                    ],
+                },
+                cols={
+                    's': [
+                        female_sample,
+                        male_sample_1,
+                        x0_sample,
+                        xxy_sample,
+                        xyy_sample,
+                        xxx_sample,
+                    ],
+                },
+                entries={
+                    'GT': [
+                        [
+                            hl.Call(alleles=[0, 0], phased=False),
+                            hl.Call(alleles=[0], phased=False),
+                            hl.Call(alleles=[0, 0], phased=False),  # X0
+                            hl.Call(alleles=[0, 0], phased=False),  # XXY
+                            hl.Call(alleles=[0, 0], phased=False),  # XYY
+                            hl.Call(alleles=[0, 0], phased=False),  # XXX
+                        ],
+                    ],
+                },
+            )
+            .key_rows_by('locus')
+            .key_cols_by('s')
+        )
         validate_imputed_sex_ploidy(mt, sex_check_ht)

         # All calls on Y chromosome are valid
-        mt = hl.MatrixTable.from_parts(
-            rows={
-                'locus': [
-                    hl.Locus(
-                        contig='chrY',
-                        position=1,
-                        reference_genome='GRCh38',
-                    ),
-                ],
-            },
-            cols={
-                's': [
-                    female_sample,
-                    male_sample_1,
-                    x0_sample,
-                    xxy_sample,
-                    xyy_sample,
-                    xxx_sample,
-                ],
-            },
-            entries={
-                'GT': [
-                    [
-                        hl.missing(hl.tcall),
-                        hl.Call(alleles=[0], phased=False),
-                        hl.missing(hl.tcall),  # X0
-                        hl.Call(alleles=[0, 0], phased=False),  # XXY
-                        hl.Call(alleles=[0, 0], phased=False),  # XYY
-                        hl.missing(hl.tcall),  # XXX
-                    ],
-                ],
-            },
-        ).key_rows_by('locus')
+        mt = (
+            hl.MatrixTable.from_parts(
+                rows={
+                    'locus': [
+                        hl.Locus(
+                            contig='chrY',
+                            position=1,
+                            reference_genome='GRCh38',
+                        ),
+                    ],
+                },
+                cols={
+                    's': [
+                        female_sample,
+                        male_sample_1,
+                        x0_sample,
+                        xxy_sample,
+                        xyy_sample,
+                        xxx_sample,
+                    ],
+                },
+                entries={
+                    'GT': [
+                        [
+                            hl.missing(hl.tcall),
+                            hl.Call(alleles=[0], phased=False),
+                            hl.missing(hl.tcall),  # X0
+                            hl.Call(alleles=[0, 0], phased=False),  # XXY
+                            hl.Call(alleles=[0, 0], phased=False),  # XYY
+                            hl.missing(hl.tcall),  # XXX
+                        ],
+                    ],
+                },
+            )
+            .key_rows_by('locus')
+            .key_cols_by('s')
+        )
         validate_imputed_sex_ploidy(mt, sex_check_ht)

         # Invalid X chromosome case
-        mt = hl.MatrixTable.from_parts(
-            rows={
-                'locus': [
-                    hl.Locus(
-                        contig='chrX',
-                        position=1,
-                        reference_genome='GRCh38',
-                    ),
-                ],
-            },
-            cols={
-                's': [
-                    female_sample,
-                    male_sample_1,
-                    male_sample_2,
-                    x0_sample,
-                    xxy_sample,
-                    xyy_sample,
-                    xxx_sample,
-                ],
-            },
-            entries={
-                'GT': [
-                    [
-                        hl.Call(alleles=[0], phased=False),  # invalid Female call
-                        hl.Call(alleles=[0], phased=False),  # valid Male call
-                        hl.missing(hl.tcall),  # invalid Male call
-                        hl.Call(alleles=[0], phased=False),  # invalid X0 call
-                        hl.Call(alleles=[0], phased=False),  # invalid XXY call
-                        hl.missing(hl.tcall),  # valid XYY call
-                        hl.Call(alleles=[0, 0], phased=False),  # valid XXX call
-                    ],
-                ],
-            },
-        ).key_rows_by('locus')
+        mt = (
+            hl.MatrixTable.from_parts(
+                rows={
+                    'locus': [
+                        hl.Locus(
+                            contig='chrX',
+                            position=1,
+                            reference_genome='GRCh38',
+                        ),
+                    ],
+                },
+                cols={
+                    's': [
+                        female_sample,
+                        male_sample_1,
+                        male_sample_2,
+                        x0_sample,
+                        xxy_sample,
+                        xyy_sample,
+                        xxx_sample,
+                    ],
+                },
+                entries={
+                    'GT': [
+                        [
+                            hl.Call(alleles=[0], phased=False),  # invalid Female call
+                            hl.Call(alleles=[0], phased=False),  # valid Male call
+                            hl.missing(hl.tcall),  # invalid Male call
+                            hl.Call(alleles=[0], phased=False),  # invalid X0 call
+                            hl.Call(alleles=[0], phased=False),  # invalid XXY call
+                            hl.missing(hl.tcall),  # valid XYY call
+                            hl.Call(alleles=[0, 0], phased=False),  # valid XXX call
+                        ],
+                    ],
+                },
+            )
+            .key_rows_by('locus')
+            .key_cols_by('s')
+        )
         self.assertRaisesRegex(
             SeqrValidationError,
-            '57.14% of samples have misaligned ploidy',
+            "Found samples with misaligned ploidy with their provided imputed sex \\(first 10, if applicable\\) : \\['HG00731_1', 'HG00732_1', 'NA20889_1', 'NA20899_1'\\].*",
             validate_imputed_sex_ploidy,
             mt,
             sex_check_ht,
@@ -253,34 +277,38 @@ def test_validate_imported_field_types(self) -> None:
         )

     def test_validate_no_duplicate_variants(self) -> None:
-        mt = hl.MatrixTable.from_parts(
-            rows={
-                'locus': [
-                    hl.Locus(
-                        contig='chr1',
-                        position=1,
-                        reference_genome='GRCh38',
-                    ),
-                    hl.Locus(
-                        contig='chr1',
-                        position=2,
-                        reference_genome='GRCh38',
-                    ),
-                    hl.Locus(
-                        contig='chr1',
-                        position=2,
-                        reference_genome='GRCh38',
-                    ),
-                ],
-                'alleles': [
-                    ['A', 'C'],
-                    ['A', 'C'],
-                    ['A', 'C'],
-                ],
-            },
-            cols={'s': ['sample_1']},
-            entries={'HL': [[0.0], [0.0], [0.0]]},
-        ).key_rows_by('locus', 'alleles')
+        mt = (
+            hl.MatrixTable.from_parts(
+                rows={
+                    'locus': [
+                        hl.Locus(
+                            contig='chr1',
+                            position=1,
+                            reference_genome='GRCh38',
+                        ),
+                        hl.Locus(
+                            contig='chr1',
+                            position=2,
+                            reference_genome='GRCh38',
+                        ),
+                        hl.Locus(
+                            contig='chr1',
+                            position=2,
+                            reference_genome='GRCh38',
+                        ),
+                    ],
+                    'alleles': [
+                        ['A', 'C'],
+                        ['A', 'C'],
+                        ['A', 'C'],
+                    ],
+                },
+                cols={'s': ['sample_1']},
+                entries={'HL': [[0.0], [0.0], [0.0]]},
+            )
+            .key_rows_by('locus', 'alleles')
+            .key_cols_by('s')
+        )
         self.assertRaisesRegex(
             SeqrValidationError,
             "Variants are present multiple times in the callset: \\['1-2-A-C'\\]",
diff --git a/v03_pipeline/lib/tasks/validate_callset_test.py b/v03_pipeline/lib/tasks/validate_callset_test.py
index a6d2b0377..8f3638376 100644
--- a/v03_pipeline/lib/tasks/validate_callset_test.py
+++ b/v03_pipeline/lib/tasks/validate_callset_test.py
@@ -83,6 +83,5 @@ def test_validate_callset_multiple_exceptions(
                     'Missing the following expected contigs:chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr20, chr21, chr22, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX',
                     'Sample type validation error: dataset sample-type is specified as WES but appears to be WGS because it contains many common non-coding variants',
                 ],
-                'failed_family_samples': {},
             },
         )
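With the `failed_family_samples` key gone, `SeqrValidationError` now carries a generic body dict as its second argument, and the decorator below forwards `e.args[1]` into the luigi `DictParameter` wholesale. A small sketch of producing and consuming that two-argument convention (the guard for a missing body is my addition, not the pipeline's):

```python
class SeqrValidationError(Exception):
    pass


try:
    raise SeqrValidationError(
        'Found samples with misaligned ploidy with their provided imputed sex',
        {'imputed_sex_ploidy_failures': ['HG00731_1', 'HG00732_1']},
    )
except SeqrValidationError as e:
    message = str(e.args[0])
    # The pipeline accesses e.args[1] directly; a defensive caller might
    # fall back to an empty body instead.
    error_body = e.args[1] if len(e.args) > 1 else {}
    print(message, error_body)
```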
diff --git a/v03_pipeline/lib/tasks/write_validation_errors_for_run.py b/v03_pipeline/lib/tasks/write_validation_errors_for_run.py
index d99800f6f..c59fce7e0 100644
--- a/v03_pipeline/lib/tasks/write_validation_errors_for_run.py
+++ b/v03_pipeline/lib/tasks/write_validation_errors_for_run.py
@@ -15,7 +15,7 @@ class WriteValidationErrorsForRunTask(luigi.Task):
     project_guids = luigi.ListParameter()
     error_messages = luigi.ListParameter(default=[])
-    failed_family_samples = luigi.DictParameter(default={})
+    error_body = luigi.DictParameter(default={})

     def to_single_error_message(self) -> str:
         with self.output().open('r') as f:
@@ -37,8 +37,8 @@ def run(self) -> None:
         validation_errors_json = {
             'project_guids': self.project_guids,
             'error_messages': self.error_messages,
-            'failed_family_samples': luigi.freezing.recursively_unfreeze(
-                self.failed_family_samples,
+            **luigi.freezing.recursively_unfreeze(
+                self.error_body,
             ),
         }
         with self.output().open('w') as f:
@@ -57,7 +57,7 @@ def wrapper(self: luigi.Task):
             write_validation_errors_for_run_task = self.clone(
                 WriteValidationErrorsForRunTask,
                 error_messages=[str(e.args[0])],
-                failed_family_samples=e.args[1]['failed_family_samples'],
+                error_body=e.args[1],
             )
         else:
             write_validation_errors_for_run_task = self.clone(