From 533d1d4569bc69a958d6d7ec489aa8a36a9c1279 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 13 Dec 2024 09:41:27 -0500 Subject: [PATCH 01/10] Add service account credentialing (#997) * Add service account credentialing * ruff --- v03_pipeline/lib/misc/gcp.py | 36 +++++++++++++++++++ .../tasks/dataproc/create_dataproc_cluster.py | 5 +++ .../dataproc/create_dataproc_cluster_test.py | 18 +++++++++- 3 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 v03_pipeline/lib/misc/gcp.py diff --git a/v03_pipeline/lib/misc/gcp.py b/v03_pipeline/lib/misc/gcp.py new file mode 100644 index 000000000..c22113bf1 --- /dev/null +++ b/v03_pipeline/lib/misc/gcp.py @@ -0,0 +1,36 @@ +import datetime + +import google.auth +import google.auth.transport.requests +import google.oauth2.credentials +import pytz + +SERVICE_ACCOUNT_CREDENTIALS = None +SOCIAL_AUTH_GOOGLE_OAUTH2_SCOPE = [ + 'https://www.googleapis.com/auth/userinfo.profile', + 'https://www.googleapis.com/auth/userinfo.email', + 'openid', +] +ONE_MINUTE_S = 60 + + +def get_service_account_credentials() -> google.oauth2.credentials.Credentials: + global SERVICE_ACCOUNT_CREDENTIALS + if not SERVICE_ACCOUNT_CREDENTIALS: + SERVICE_ACCOUNT_CREDENTIALS, _ = google.auth.default( + scopes=SOCIAL_AUTH_GOOGLE_OAUTH2_SCOPE, + ) + tz = pytz.UTC + if ( + SERVICE_ACCOUNT_CREDENTIALS.token + and ( + tz.localize(SERVICE_ACCOUNT_CREDENTIALS.expiry) + - datetime.datetime.now(tz=tz) + ).total_seconds() + > ONE_MINUTE_S + ): + return SERVICE_ACCOUNT_CREDENTIALS + SERVICE_ACCOUNT_CREDENTIALS.refresh( + request=google.auth.transport.requests.Request(), + ) + return SERVICE_ACCOUNT_CREDENTIALS diff --git a/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster.py b/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster.py index 3a1d6c9da..f7d0bc21d 100644 --- a/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster.py +++ b/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster.py @@ -6,6 +6,7 @@ from pip._internal.operations import freeze as pip_freeze from v03_pipeline.lib.logger import get_logger +from v03_pipeline.lib.misc.gcp import get_service_account_credentials from v03_pipeline.lib.model import Env, ReferenceGenome from v03_pipeline.lib.tasks.base.base_loading_pipeline_params import ( BaseLoadingPipelineParams, @@ -22,9 +23,11 @@ def get_cluster_config(reference_genome: ReferenceGenome, run_id: str): + service_account_credentials = get_service_account_credentials() return { 'project_id': Env.GCLOUD_PROJECT, 'cluster_name': f'{CLUSTER_NAME_PREFIX}-{reference_genome.value.lower()}-{run_id}', + # Schema found at https://cloud.google.com/dataproc/docs/reference/rest/v1/ClusterConfig 'config': { 'gce_cluster_config': { 'zone_uri': Env.GCLOUD_ZONE, @@ -35,6 +38,8 @@ def get_cluster_config(reference_genome: ReferenceGenome, run_id: str): 'REFERENCE_GENOME': reference_genome.value, 'PIPELINE_RUNNER_APP_VERSION': Env.PIPELINE_RUNNER_APP_VERSION, }, + 'service_account': service_account_credentials.service_account_email, + 'service_account_scopes': service_account_credentials.scopes, }, 'master_config': { 'num_instances': 1, diff --git a/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster_test.py b/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster_test.py index bb447e0d6..c7f4f2958 100644 --- a/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster_test.py +++ b/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster_test.py @@ -5,17 +5,29 @@ import google.api_core.exceptions import luigi +from v03_pipeline.lib.misc.gcp import 
SOCIAL_AUTH_GOOGLE_OAUTH2_SCOPE from v03_pipeline.lib.model import DatasetType, ReferenceGenome from v03_pipeline.lib.tasks.dataproc.create_dataproc_cluster import ( CreateDataprocClusterTask, ) +@patch( + 'v03_pipeline.lib.tasks.dataproc.create_dataproc_cluster.get_service_account_credentials', + return_value=SimpleNamespace( + service_account_email='test@serviceaccount.com', + scopes=SOCIAL_AUTH_GOOGLE_OAUTH2_SCOPE, + ), +) @patch( 'v03_pipeline.lib.tasks.dataproc.create_dataproc_cluster.dataproc.ClusterControllerClient', ) class CreateDataprocClusterTaskTest(unittest.TestCase): - def test_dataset_type_unsupported(self, mock_cluster_controller: Mock) -> None: + def test_dataset_type_unsupported( + self, + mock_cluster_controller: Mock, + _: Mock, + ) -> None: worker = luigi.worker.Worker() task = CreateDataprocClusterTask( reference_genome=ReferenceGenome.GRCh38, @@ -29,6 +41,7 @@ def test_dataset_type_unsupported(self, mock_cluster_controller: Mock) -> None: def test_spinup_cluster_already_exists_failed( self, mock_cluster_controller: Mock, + _: Mock, ) -> None: mock_client = mock_cluster_controller.return_value mock_client.get_cluster.return_value = SimpleNamespace( @@ -50,6 +63,7 @@ def test_spinup_cluster_already_exists_failed( def test_spinup_cluster_already_exists_success( self, mock_cluster_controller: Mock, + _: Mock, ) -> None: mock_client = mock_cluster_controller.return_value mock_client.get_cluster.return_value = SimpleNamespace( @@ -73,6 +87,7 @@ def test_spinup_cluster_doesnt_exist_failed( self, mock_logger: Mock, mock_cluster_controller: Mock, + _: Mock, ) -> None: mock_client = mock_cluster_controller.return_value mock_client.get_cluster.side_effect = google.api_core.exceptions.NotFound( @@ -98,6 +113,7 @@ def test_spinup_cluster_doesnt_exist_success( self, mock_logger: Mock, mock_cluster_controller: Mock, + _: Mock, ) -> None: mock_client = mock_cluster_controller.return_value mock_client.get_cluster.side_effect = google.api_core.exceptions.NotFound( From b98fe664c8733e8edb0faf39ff042827a362a93d Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 13 Dec 2024 11:13:20 -0500 Subject: [PATCH 02/10] feat: Handle parsing empty predicted sex into Unknown (#1000) --- v03_pipeline/lib/misc/io.py | 6 +++++- v03_pipeline/lib/misc/io_test.py | 1 + v03_pipeline/lib/model/definitions.py | 10 +++++----- v03_pipeline/var/test/sex_check/test_imputed_sex.tsv | 1 + 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/v03_pipeline/lib/misc/io.py b/v03_pipeline/lib/misc/io.py index 865cdf5b3..cc31ffba1 100644 --- a/v03_pipeline/lib/misc/io.py +++ b/v03_pipeline/lib/misc/io.py @@ -219,7 +219,11 @@ def select_relevant_fields( def import_imputed_sex(imputed_sex_path: str) -> hl.Table: ht = hl.import_table(imputed_sex_path) imputed_sex_lookup = hl.dict( - {s.imputed_sex_value: s.value for s in Sex}, + { + imputed_sex_value: s.value + for s in Sex + for imputed_sex_value in s.imputed_sex_values + }, ) ht = ht.select( s=ht.collaborator_sample_id, diff --git a/v03_pipeline/lib/misc/io_test.py b/v03_pipeline/lib/misc/io_test.py index f392c5c76..c9b4f3bce 100644 --- a/v03_pipeline/lib/misc/io_test.py +++ b/v03_pipeline/lib/misc/io_test.py @@ -47,6 +47,7 @@ def test_import_imputed_sex(self) -> None: hl.Struct(s='abc_2', predicted_sex='F'), hl.Struct(s='abc_3', predicted_sex='U'), hl.Struct(s='abc_4', predicted_sex='XYY'), + hl.Struct(s='abc_5', predicted_sex='U'), ], ) diff --git a/v03_pipeline/lib/model/definitions.py b/v03_pipeline/lib/model/definitions.py index 
a87437c8f..09b10fcbd 100644 --- a/v03_pipeline/lib/model/definitions.py +++ b/v03_pipeline/lib/model/definitions.py @@ -18,12 +18,12 @@ class Sex(str, Enum): XYY = 'XYY' @property - def imputed_sex_value(self): + def imputed_sex_values(self) -> list[str]: return { - Sex.MALE: 'Male', - Sex.FEMALE: 'Female', - Sex.UNKNOWN: 'Unknown', - }.get(self, self.name) + Sex.MALE: ['Male'], + Sex.FEMALE: ['Female'], + Sex.UNKNOWN: ['', 'Unknown'], + }.get(self, [self.name]) class PipelineVersion(str, Enum): diff --git a/v03_pipeline/var/test/sex_check/test_imputed_sex.tsv b/v03_pipeline/var/test/sex_check/test_imputed_sex.tsv index a22b0e26a..f0ae98e6c 100644 --- a/v03_pipeline/var/test/sex_check/test_imputed_sex.tsv +++ b/v03_pipeline/var/test/sex_check/test_imputed_sex.tsv @@ -3,3 +3,4 @@ SM-DM66X abc_1 abc_1 0E+00 gs://datarepo-9cafeffd-bucket/f511b131-3f0d-4eb7-a7f0 SM-DM69X abc_2 abc_2 0E+00 gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/c4c07edf-7735-4aa7-9283-7cb2607b60a2/GLE-5774-3-3.qc-coverage-region-1_coverage_metrics.csv gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/dcd4c271-0249-47f1-8e91-81f74735c5a1/GLE-5774-3-3.cram.crai gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/ec41ec06-673f-4fe2-a063-23dc5fe1dcce/GLE-5774-3-3.cram.md5sum gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/aad0e270-2ad5-4f39-b968-9b4beafeb5cc/GLE-5774-3-3.cram a4b04a39-9234-4028-a155-442c4acf12a0 07.021.604.3.7.8 ce74d94c-c33d-49d7-85c9-5f3cbd08aff7 2024-04-17T15:02:46 99.800000000 gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/c3a9e6f2-4c68-410b-823d-46ca406e5061/GLE-5774-3-3.mapping_metrics.csv DNA:DNA Genomic 35.300000000 Whole Blood:Whole Blood PT-24OHM Pass PDO-32755 96.320000000 97.340000000 Female P-WG-0139 2017-04-12 04:00:00 Female RP-3061 gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/c71cd2a1-c789-4715-9ebc-dbfc40d9f2e2/GLE-5774-3-3.vcf.gz.tbi gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/957a99cb-c9a9-4fc5-a0ec-53f9e461469e/GLE-5774-3-3.vcf.gz.md5sum gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/df520949-5f2b-4976-9d46-80d1cc299813/GLE-5774-3-3.vcf.gz 133253714921.000000000 gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/2e98e51b-9394-4e64-977f-e9010a4e16dc/GLE-5774-3-3.vc_metrics.csv SM-DPB5G abc_3 abc_3 0E+00 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/432f8354-77e0-4381-9bb5-dfdc0633b5b2/PIE_OGI1433_002628_1.qc-coverage-region-1_coverage_metrics.csv gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/3dc623fa-2a45-4b3d-a0f8-fcdec09f9418/PIE_OGI1433_002628_1.cram.crai gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/895966ef-c705-4c18-952d-03863243a184/PIE_OGI1433_002628_1.cram.md5sum gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/96ca6d5f-fb23-4102-bb5e-c7bbfd194e1c/PIE_OGI1433_002628_1.cram ffb50687-165e-425a-a545-c3797d3a28d4 07.021.604.3.7.8 55729ba9-3ce4-47b3-9c3b-1148737ae40f 2024-04-17T15:07:57 99.670000000 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/30f8e208-5d2d-4ce8-b835-695b5ed673f4/PIE_OGI1433_002628_1.mapping_metrics.csv DNA:DNA Genomic 41.910000000 Whole Blood:Whole Blood PT-25BR5 Pass PDO-32756 92.920000000 97.990000000 Unknown P-WG-0139 2017-05-19 04:00:00 Unknown RP-3062 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/1641d1b2-1035-4cc3-9c8b-0c8cb430f56b/PIE_OGI1433_002628_1.vcf.gz.tbi 
gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/f5ba2708-899e-42e8-b287-fdf72c2e404d/PIE_OGI1433_002628_1.vcf.gz.md5sum gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/e925ee5d-a75e-471f-adfd-2756c8690069/PIE_OGI1433_002628_1.vcf.gz 156149580126.000000000 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/df076bc5-9db8-44f0-a3fe-f693370634cc/PIE_OGI1433_002628_1.vc_metrics.csv SM-DPB5G abc_4 abc_4 0E+00 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/432f8354-77e0-4381-9bb5-dfdc0633b5b2/PIE_OGI1433_002628_1.qc-coverage-region-1_coverage_metrics.csv gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/3dc623fa-2a45-4b3d-a0f8-fcdec09f9418/PIE_OGI1433_002628_1.cram.crai gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/895966ef-c705-4c18-952d-03863243a184/PIE_OGI1433_002628_1.cram.md5sum gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/96ca6d5f-fb23-4102-bb5e-c7bbfd194e1c/PIE_OGI1433_002628_1.cram ffb50687-165e-425a-a545-c3797d3a28d4 07.021.604.3.7.8 55729ba9-3ce4-47b3-9c3b-1148737ae40f 2024-04-17T15:07:57 99.670000000 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/30f8e208-5d2d-4ce8-b835-695b5ed673f4/PIE_OGI1433_002628_1.mapping_metrics.csv DNA:DNA Genomic 41.910000000 Whole Blood:Whole Blood PT-25BR5 Pass PDO-32756 92.920000000 97.990000000 XYY P-WG-0139 2017-05-19 04:00:00 XYY RP-3062 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/1641d1b2-1035-4cc3-9c8b-0c8cb430f56b/PIE_OGI1433_002628_1.vcf.gz.tbi gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/f5ba2708-899e-42e8-b287-fdf72c2e404d/PIE_OGI1433_002628_1.vcf.gz.md5sum gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/e925ee5d-a75e-471f-adfd-2756c8690069/PIE_OGI1433_002628_1.vcf.gz 156149580126.000000000 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/df076bc5-9db8-44f0-a3fe-f693370634cc/PIE_OGI1433_002628_1.vc_metrics.csv +SM-DPB5G abc_5 abc_5 0E+00 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/432f8354-77e0-4381-9bb5-dfdc0633b5b2/PIE_OGI1433_002628_1.qc-coverage-region-1_coverage_metrics.csv gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/3dc623fa-2a45-4b3d-a0f8-fcdec09f9418/PIE_OGI1433_002628_1.cram.crai gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/895966ef-c705-4c18-952d-03863243a184/PIE_OGI1433_002628_1.cram.md5sum gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/96ca6d5f-fb23-4102-bb5e-c7bbfd194e1c/PIE_OGI1433_002628_1.cram ffb50687-165e-425a-a545-c3797d3a28d4 07.021.604.3.7.8 55729ba9-3ce4-47b3-9c3b-1148737ae40f 2024-04-17T15:07:57 99.670000000 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/30f8e208-5d2d-4ce8-b835-695b5ed673f4/PIE_OGI1433_002628_1.mapping_metrics.csv DNA:DNA Genomic 41.910000000 Whole Blood:Whole Blood PT-25BR5 Pass PDO-32756 92.920000000 97.990000000 P-WG-0139 2017-05-19 04:00:00 RP-3062 gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/1641d1b2-1035-4cc3-9c8b-0c8cb430f56b/PIE_OGI1433_002628_1.vcf.gz.tbi gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/f5ba2708-899e-42e8-b287-fdf72c2e404d/PIE_OGI1433_002628_1.vcf.gz.md5sum gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/e925ee5d-a75e-471f-adfd-2756c8690069/PIE_OGI1433_002628_1.vcf.gz 156149580126.000000000 
gs://datarepo-c41dc160-bucket/907593be-8862-4945-9e70-f758b6448b8d/df076bc5-9db8-44f0-a3fe-f693370634cc/PIE_OGI1433_002628_1.vc_metrics.csv From 8b58c01aa462f9b6ad35bc29954f124c03c26e67 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 13 Dec 2024 11:48:37 -0500 Subject: [PATCH 03/10] Add helper functions for querying `Terra Data Repository` (#998) * Add service account credentialing * ruff * First pass * tests passing * add coverage of bigquery test * change function names * use generators everywhere * bq requirement * resolver * Update sample id name * Build Sex Check Table from TDR Metrics (#999) --- requirements.in | 1 + requirements.txt | 22 +- v03_pipeline/lib/misc/allele_registry.py | 10 +- v03_pipeline/lib/misc/requests.py | 14 + .../lib/misc/terra_data_repository.py | 68 ++++ .../lib/misc/terra_data_repository_test.py | 303 ++++++++++++++++++ v03_pipeline/lib/model/dataset_type.py | 8 + v03_pipeline/lib/model/environment.py | 2 + v03_pipeline/lib/paths.py | 17 +- v03_pipeline/lib/paths_test.py | 10 +- .../lib/tasks/base/base_loading_run_params.py | 4 + .../lib/tasks/write_imported_callset.py | 12 + .../lib/tasks/write_sex_check_table.py | 24 +- .../lib/tasks/write_sex_check_table_test.py | 96 ++++++ .../lib/tasks/write_tdr_metrics_file.py | 35 ++ .../lib/tasks/write_tdr_metrics_files.py | 28 ++ 16 files changed, 623 insertions(+), 31 deletions(-) create mode 100644 v03_pipeline/lib/misc/requests.py create mode 100644 v03_pipeline/lib/misc/terra_data_repository.py create mode 100644 v03_pipeline/lib/misc/terra_data_repository_test.py create mode 100644 v03_pipeline/lib/tasks/write_sex_check_table_test.py create mode 100644 v03_pipeline/lib/tasks/write_tdr_metrics_file.py create mode 100644 v03_pipeline/lib/tasks/write_tdr_metrics_files.py diff --git a/requirements.in b/requirements.in index 6d5abb8b9..717740a88 100644 --- a/requirements.in +++ b/requirements.in @@ -4,3 +4,4 @@ gnomad==0.6.4 aiofiles==24.1.0 pydantic==2.8.2 google-cloud-dataproc==5.14.0 +google-cloud-bigquery==3.27.0 diff --git a/requirements.txt b/requirements.txt index 9762fbca9..86e9f058b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -97,17 +97,30 @@ frozenlist==1.5.0 gnomad==0.6.4 # via -r requirements.in google-api-core[grpc]==2.22.0 - # via google-cloud-dataproc + # via + # google-cloud-bigquery + # google-cloud-core + # google-cloud-dataproc google-auth==2.35.0 # via # google-api-core # google-auth-oauthlib + # google-cloud-bigquery + # google-cloud-core # google-cloud-dataproc # hail google-auth-oauthlib==0.8.0 # via hail +google-cloud-bigquery==3.27.0 + # via -r requirements.in +google-cloud-core==2.4.1 + # via google-cloud-bigquery google-cloud-dataproc==5.14.0 # via -r requirements.in +google-crc32c==1.6.0 + # via google-resumable-media +google-resumable-media==2.7.2 + # via google-cloud-bigquery googleapis-common-protos[grpc]==1.65.0 # via # google-api-core @@ -197,6 +210,7 @@ orjson==3.10.10 packaging==24.1 # via # bokeh + # google-cloud-bigquery # plotly pandas==2.2.3 # via @@ -256,9 +270,7 @@ pygments==2.18.0 # ipython # rich pyjwt[crypto]==2.9.0 - # via - # msal - # pyjwt + # via msal pyspark==3.5.3 # via hail python-daemon==3.1.0 @@ -266,6 +278,7 @@ python-daemon==3.1.0 python-dateutil==2.9.0.post0 # via # botocore + # google-cloud-bigquery # luigi # pandas python-json-logger==2.0.7 @@ -282,6 +295,7 @@ requests==2.32.3 # via # azure-core # google-api-core + # google-cloud-bigquery # hail # msal # msrest diff --git a/v03_pipeline/lib/misc/allele_registry.py 
b/v03_pipeline/lib/misc/allele_registry.py index 77023a9ba..6129d7e50 100644 --- a/v03_pipeline/lib/misc/allele_registry.py +++ b/v03_pipeline/lib/misc/allele_registry.py @@ -8,9 +8,9 @@ import hailtop.fs as hfs import requests from requests import HTTPError -from requests.adapters import HTTPAdapter, Retry from v03_pipeline.lib.logger import get_logger +from v03_pipeline.lib.misc.requests import requests_retry_session from v03_pipeline.lib.model import Env, ReferenceGenome MAX_VARIANTS_PER_REQUEST = 1000000 @@ -96,13 +96,7 @@ def register_alleles( logger.info('Calling the ClinGen Allele Registry') with hfs.open(formatted_vcf_file_name, 'r') as vcf_in: data = vcf_in.read() - s = requests.Session() - retries = Retry( - total=5, - backoff_factor=1, - status_forcelist=[500, 502, 503, 504], - ) - s.mount('https://', HTTPAdapter(max_retries=retries)) + s = requests_retry_session() res = s.put( url=build_url(base_url, reference_genome), data=data, diff --git a/v03_pipeline/lib/misc/requests.py b/v03_pipeline/lib/misc/requests.py new file mode 100644 index 000000000..e0ee66f36 --- /dev/null +++ b/v03_pipeline/lib/misc/requests.py @@ -0,0 +1,14 @@ +import requests +from requests.adapters import HTTPAdapter, Retry + + +def requests_retry_session(): + s = requests.Session() + retries = Retry( + total=5, + backoff_factor=1, + status_forcelist=[500, 502, 503, 504], + ) + s.mount('http://', HTTPAdapter(max_retries=retries)) + s.mount('https://', HTTPAdapter(max_retries=retries)) + return s diff --git a/v03_pipeline/lib/misc/terra_data_repository.py b/v03_pipeline/lib/misc/terra_data_repository.py new file mode 100644 index 000000000..d55b3ff37 --- /dev/null +++ b/v03_pipeline/lib/misc/terra_data_repository.py @@ -0,0 +1,68 @@ +import os +import re +from collections.abc import Generator +from concurrent.futures import ThreadPoolExecutor, as_completed + +import google.cloud.bigquery +from google.cloud import bigquery + +from v03_pipeline.lib.misc.gcp import get_service_account_credentials +from v03_pipeline.lib.misc.requests import requests_retry_session + +BIGQUERY_METRICS = [ + 'collaborator_sample_id', + 'predicted_sex', +] +BIGQUERY_RESOURCE = 'bigquery' +TABLE_NAME_VALIDATION_REGEX = r'datarepo-\w+.datarepo_\w+' +TDR_ROOT_URL = 'https://data.terra.bio/api/repository/v1/' + + +def _tdr_request(resource: str) -> dict: + service_account_token = get_service_account_credentials().token + s = requests_retry_session() + res = s.get( + url=os.path.join(TDR_ROOT_URL, resource), + headers={'Authorization': f'Bearer {service_account_token}'}, + timeout=10, + ) + res.raise_for_status() + return res.json() + + +def _get_dataset_ids() -> list[str]: + res_body = _tdr_request('datasets') + items = res_body['items'] + for item in items: + if not any(x['cloudResource'] == BIGQUERY_RESOURCE for x in item['storage']): + # Hard failure on purpose to prompt manual investigation. 
+ msg = 'Datasets without bigquery sources are unsupported' + raise ValueError(msg) + return [x['id'] for x in items] + + +def gen_bq_table_names() -> Generator[str]: + with ThreadPoolExecutor(max_workers=5) as executor: + futures = [ + executor.submit( + _tdr_request, + f'datasets/{dataset_id}?include=ACCESS_INFORMATION', + ) + for dataset_id in _get_dataset_ids() + ] + for future in as_completed(futures): + result = future.result() + yield f"{result['accessInformation']['bigQuery']['projectId']}.{result['accessInformation']['bigQuery']['datasetName']}" + + +def bq_metrics_query(bq_table_name: str) -> google.cloud.bigquery.table.RowIterator: + if not re.match(TABLE_NAME_VALIDATION_REGEX, bq_table_name): + msg = f'{bq_table_name} does not match expected pattern' + raise ValueError(msg) + client = bigquery.Client() + return client.query_and_wait( + f""" + SELECT {','.join(BIGQUERY_METRICS)} + FROM `{bq_table_name}.sample` + """, # noqa: S608 + ) diff --git a/v03_pipeline/lib/misc/terra_data_repository_test.py b/v03_pipeline/lib/misc/terra_data_repository_test.py new file mode 100644 index 000000000..54646e193 --- /dev/null +++ b/v03_pipeline/lib/misc/terra_data_repository_test.py @@ -0,0 +1,303 @@ +import json +import os +import unittest +from types import SimpleNamespace +from unittest.mock import Mock, patch + +import responses + +from v03_pipeline.lib.misc.terra_data_repository import ( + TDR_ROOT_URL, + _get_dataset_ids, + gen_bq_table_names, +) + +TDR_DATASETS = [ + { + 'id': '2dc51ee0-a037-499d-a915-a7c20a0b216d', + 'name': 'RP_3053', + 'description': 'TGG_Ware_DRAGEN_hg38. Dataset automatically created and linked to: RP-3053', + 'defaultProfileId': '0a164b9a-2b8b-45d2-859e-a4e369b9cb4f', + 'createdDate': '2023-10-05T13:15:27.649760Z', + 'storage': [ + { + 'region': 'us-central1', + 'cloudResource': 'bigquery', + 'cloudPlatform': 'gcp', + }, + { + 'region': 'us-east4', + 'cloudResource': 'firestore', + 'cloudPlatform': 'gcp', + }, + { + 'region': 'us-central1', + 'cloudResource': 'bucket', + 'cloudPlatform': 'gcp', + }, + ], + 'secureMonitoringEnabled': False, + 'cloudPlatform': 'gcp', + 'dataProject': 'datarepo-7242affb', + 'storageAccount': None, + 'phsId': None, + 'selfHosted': False, + 'predictableFileIds': False, + 'tags': [], + 'resourceLocks': { + 'exclusive': None, + 'shared': [], + }, + }, + { + 'id': 'beef77e8-575b-40e5-9340-a6f10e0bec67', + 'name': 'RP_3056', + 'description': 'RGP_HMB_DRAGEN_hg38. Dataset automatically created and linked to: RP-3056', + 'defaultProfileId': '835dd05d-c603-4b0d-926f-d9143fd24549', + 'createdDate': '2023-10-05T20:15:27.622481Z', + 'storage': [ + { + 'region': 'us-central1', + 'cloudResource': 'bigquery', + 'cloudPlatform': 'gcp', + }, + { + 'region': 'us-east4', + 'cloudResource': 'firestore', + 'cloudPlatform': 'gcp', + }, + { + 'region': 'us-central1', + 'cloudResource': 'bucket', + 'cloudPlatform': 'gcp', + }, + ], + 'secureMonitoringEnabled': False, + 'cloudPlatform': 'gcp', + 'dataProject': 'datarepo-5a72e31b', + 'storageAccount': None, + 'phsId': None, + 'selfHosted': False, + 'predictableFileIds': False, + 'tags': [], + 'resourceLocks': { + 'exclusive': None, + 'shared': [], + }, + }, + { + 'id': 'c8d74ac4-9a2e-4d3d-a6b5-1f4b433d949f', + 'name': 'RP_3055', + 'description': 'RGP_GRU_DRAGEN_hg38. 
Dataset automatically created and linked to: RP-3055', + 'defaultProfileId': '835dd05d-c603-4b0d-926f-d9143fd24549', + 'createdDate': '2023-10-05T20:15:28.255304Z', + 'storage': [ + { + 'region': 'us-central1', + 'cloudResource': 'bigquery', + 'cloudPlatform': 'gcp', + }, + { + 'region': 'us-east4', + 'cloudResource': 'firestore', + 'cloudPlatform': 'gcp', + }, + { + 'region': 'us-central1', + 'cloudResource': 'bucket', + 'cloudPlatform': 'gcp', + }, + ], + 'secureMonitoringEnabled': False, + 'cloudPlatform': 'gcp', + 'dataProject': 'datarepo-0f5be351', + 'storageAccount': None, + 'phsId': None, + 'selfHosted': False, + 'predictableFileIds': False, + 'tags': [], + 'resourceLocks': { + 'exclusive': None, + 'shared': [ + 'Gw_KTeYRS1aLhRvapMLYLg', + 'WrP-0w1aROOUbgkI8JS6Ug', + ], + }, + }, +] + + +@patch( + 'v03_pipeline.lib.misc.terra_data_repository.get_service_account_credentials', + return_value=SimpleNamespace( + token='abcdefg', # noqa: S106 + ), +) +class TerraDataRepositoryTest(unittest.TestCase): + @responses.activate + def test_get_dataset_ids(self, _: Mock) -> None: + responses.get( + os.path.join(TDR_ROOT_URL, 'datasets'), + body=json.dumps( + { + 'total': 3, + 'filteredTotal': 3, + 'items': TDR_DATASETS, + }, + ), + ) + self.assertListEqual( + _get_dataset_ids(), + [ + '2dc51ee0-a037-499d-a915-a7c20a0b216d', + 'beef77e8-575b-40e5-9340-a6f10e0bec67', + 'c8d74ac4-9a2e-4d3d-a6b5-1f4b433d949f', + ], + ) + + @responses.activate + def test_get_dataset_ids_no_bq(self, _: Mock) -> None: + responses.get( + os.path.join(TDR_ROOT_URL, 'datasets'), + body=json.dumps( + { + 'total': 1, + 'filteredTotal': 1, + 'items': [ + { + 'id': '2dc51ee0-a037-499d-a915-a7c20a0b216d', + 'name': 'RP_3053', + 'description': 'TGG_Ware_DRAGEN_hg38. Dataset automatically created and linked to: RP-3053', + 'defaultProfileId': '0a164b9a-2b8b-45d2-859e-a4e369b9cb4f', + 'createdDate': '2023-10-05T13:15:27.649760Z', + 'storage': [ + # NB: bigquery was removed from 'storage' here. + { + 'region': 'us-east4', + 'cloudResource': 'firestore', + 'cloudPlatform': 'gcp', + }, + { + 'region': 'us-central1', + 'cloudResource': 'bucket', + 'cloudPlatform': 'gcp', + }, + ], + 'secureMonitoringEnabled': False, + 'cloudPlatform': 'gcp', + 'dataProject': 'datarepo-7242affb', + 'storageAccount': None, + 'phsId': None, + 'selfHosted': False, + 'predictableFileIds': False, + 'tags': [], + 'resourceLocks': { + 'exclusive': None, + 'shared': [], + }, + }, + ], + }, + ), + ) + self.assertRaises( + ValueError, + _get_dataset_ids, + ) + + @responses.activate + def test_gen_bq_table_names(self, _: Mock) -> None: + responses.get( + os.path.join(TDR_ROOT_URL, 'datasets'), + body=json.dumps( + { + 'total': 3, + 'filteredTotal': 3, + 'items': TDR_DATASETS, + }, + ), + ) + for dataset_id, name, project_name, dataset_name in [ + ( + '2dc51ee0-a037-499d-a915-a7c20a0b216d', + 'RP_3053', + 'datarepo-7242affb', + 'datarepo_RP_3053', + ), + ( + 'beef77e8-575b-40e5-9340-a6f10e0bec67', + 'RP_3056', + 'datarepo-5a72e31b', + 'datarepo_RP_3056', + ), + ( + 'c8d74ac4-9a2e-4d3d-a6b5-1f4b433d949f', + 'RP_3059', + 'datarepo-aada2e3b', + 'datarepo_RP_3059', + ), + ]: + responses.get( + os.path.join( + TDR_ROOT_URL, + f'datasets/{dataset_id}?include=ACCESS_INFORMATION', + ), + body=json.dumps( + { + 'id': dataset_id, + 'name': name, + 'description': 'TGG_Ware_DRAGEN_hg38. 
Dataset automatically created and linked to: RP-3053', + 'defaultProfileId': None, + 'dataProject': None, + 'defaultSnapshotId': None, + 'schema': None, + 'createdDate': '2023-10-05T13:15:27.649760Z', + 'storage': None, + 'secureMonitoringEnabled': False, + 'phsId': None, + 'accessInformation': { + 'bigQuery': { + 'datasetName': dataset_name, + 'datasetId': f'{project_name}:{dataset_name}', + 'projectId': project_name, + 'link': 'https://console.cloud.google.com/bigquery?project=datarepo-7242affb&ws=!datarepo_RP_3053&d=datarepo_RP_3053&p=datarepo-7242affb&page=dataset', + 'tables': [ + { + 'name': 'jointcallset', + 'id': f'{project_name}.{dataset_name}.jointcallset', + 'qualifiedName': 'datarepo-7242affb.datarepo_RP_3053.jointcallset', + 'link': 'https://console.cloud.google.com/bigquery?project=datarepo-7242affb&ws=!datarepo_RP_3053&d=datarepo_RP_3053&p=datarepo-7242affb&page=table&t=jointcallset', + 'sampleQuery': 'SELECT * FROM `datarepo-7242affb.datarepo_RP_3053.jointcallset`', + }, + { + 'name': 'sample', + 'id': f'{project_name}.{dataset_name}.sample', + 'qualifiedName': 'datarepo-7242affb.datarepo_RP_3053.sample', + 'link': 'https://console.cloud.google.com/bigquery?project=datarepo-7242affb&ws=!datarepo_RP_3053&d=datarepo_RP_3053&p=datarepo-7242affb&page=table&t=sample', + 'sampleQuery': 'SELECT * FROM `datarepo-7242affb.datarepo_RP_3053.sample`', + }, + ], + }, + 'parquet': None, + }, + 'cloudPlatform': None, + 'selfHosted': False, + 'properties': None, + 'ingestServiceAccount': None, + 'predictableFileIds': False, + 'tags': [], + 'resourceLocks': { + 'exclusive': None, + 'shared': [], + }, + }, + ), + ) + self.assertCountEqual( + list(gen_bq_table_names()), + [ + 'datarepo-7242affb.datarepo_RP_3053', + 'datarepo-5a72e31b.datarepo_RP_3056', + 'datarepo-aada2e3b.datarepo_RP_3059', + ], + ) diff --git a/v03_pipeline/lib/model/dataset_type.py b/v03_pipeline/lib/model/dataset_type.py index 281cffd9b..e7e8a28d3 100644 --- a/v03_pipeline/lib/model/dataset_type.py +++ b/v03_pipeline/lib/model/dataset_type.py @@ -186,6 +186,14 @@ def expect_filters( ) -> bool: return self == DatasetType.SNV_INDEL and sample_type == SampleType.WES + def expect_tdr_metrics( + self, + reference_genome: ReferenceGenome, + ) -> bool: + return ( + self == DatasetType.SNV_INDEL and reference_genome == ReferenceGenome.GRCh38 + ) + @property def has_gencode_gene_symbol_to_gene_id_mapping(self) -> bool: return self == DatasetType.SV diff --git a/v03_pipeline/lib/model/environment.py b/v03_pipeline/lib/model/environment.py index d95d7a27c..e307b0226 100644 --- a/v03_pipeline/lib/model/environment.py +++ b/v03_pipeline/lib/model/environment.py @@ -56,6 +56,7 @@ os.environ.get('ACCESS_PRIVATE_REFERENCE_DATASETS') == '1' ) CHECK_SEX_AND_RELATEDNESS = os.environ.get('CHECK_SEX_AND_RELATEDNESS') == '1' +EXPECT_TDR_METRICS = os.environ.get('EXPECT_TDR_METRICS') == '1' EXPECT_WES_FILTERS = os.environ.get('EXPECT_WES_FILTERS') == '1' INCLUDE_PIPELINE_VERSION_IN_PREFIX = ( os.environ.get('INCLUDE_PIPELINE_VERSION_IN_PREFIX') == '1' @@ -72,6 +73,7 @@ class Env: CLINGEN_ALLELE_REGISTRY_LOGIN: str | None = CLINGEN_ALLELE_REGISTRY_LOGIN CLINGEN_ALLELE_REGISTRY_PASSWORD: str | None = CLINGEN_ALLELE_REGISTRY_PASSWORD DEPLOYMENT_TYPE: Literal['dev', 'prod'] = DEPLOYMENT_TYPE + EXPECT_TDR_METRICS: bool = EXPECT_TDR_METRICS EXPECT_WES_FILTERS: bool = EXPECT_WES_FILTERS GCLOUD_DATAPROC_SECONDARY_WORKERS: str = GCLOUD_DATAPROC_SECONDARY_WORKERS GCLOUD_PROJECT: str | None = GCLOUD_PROJECT diff --git a/v03_pipeline/lib/paths.py 
b/v03_pipeline/lib/paths.py index 68d27bba2..5a54880fd 100644 --- a/v03_pipeline/lib/paths.py +++ b/v03_pipeline/lib/paths.py @@ -98,10 +98,9 @@ def family_table_path( ) -def imputed_sex_path( +def tdr_metrics_dir( reference_genome: ReferenceGenome, dataset_type: DatasetType, - callset_path: str, ) -> str: return os.path.join( _pipeline_prefix( @@ -109,8 +108,18 @@ def imputed_sex_path( reference_genome, dataset_type, ), - 'imputed_sex', - f'{hashlib.sha256(callset_path.encode("utf8")).hexdigest()}.tsv', + 'tdr_metrics', + ) + + +def tdr_metrics_path( + reference_genome: ReferenceGenome, + dataset_type: DatasetType, + bq_table_name: str, +) -> str: + return os.path.join( + tdr_metrics_dir(reference_genome, dataset_type), + f'{bq_table_name}.tsv', ) diff --git a/v03_pipeline/lib/paths_test.py b/v03_pipeline/lib/paths_test.py index f49e62b61..8c5f097f4 100644 --- a/v03_pipeline/lib/paths_test.py +++ b/v03_pipeline/lib/paths_test.py @@ -9,7 +9,6 @@ from v03_pipeline.lib.paths import ( family_table_path, imported_callset_path, - imputed_sex_path, lookup_table_path, metadata_for_run_path, new_variants_table_path, @@ -19,6 +18,7 @@ relatedness_check_table_path, remapped_and_subsetted_callset_path, sex_check_table_path, + tdr_metrics_path, valid_filters_path, validation_errors_for_run_path, variant_annotations_table_path, @@ -177,14 +177,14 @@ def test_imported_callset_path(self) -> None: '/var/seqr/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/imported_callsets/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.mt', ) - def test_imputed_sex_path(self) -> None: + def test_tdr_metrics_path(self) -> None: self.assertEqual( - imputed_sex_path( + tdr_metrics_path( ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, - 'gs://abc.efg/callset.vcf.gz', + 'datarepo-7242affb.datarepo_RP_3053', ), - '/var/seqr/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/imputed_sex/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.tsv', + '/var/seqr/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/tdr_metrics/datarepo-7242affb.datarepo_RP_3053.tsv', ) def test_new_variants_table_path(self) -> None: diff --git a/v03_pipeline/lib/tasks/base/base_loading_run_params.py b/v03_pipeline/lib/tasks/base/base_loading_run_params.py index 7c79b00d6..48a70ec6f 100644 --- a/v03_pipeline/lib/tasks/base/base_loading_run_params.py +++ b/v03_pipeline/lib/tasks/base/base_loading_run_params.py @@ -34,6 +34,10 @@ class BaseLoadingRunParams(luigi.Task): default=False, parsing=luigi.BoolParameter.EXPLICIT_PARSING, ) + skip_expect_tdr_metrics = luigi.BoolParameter( + default=False, + parsing=luigi.BoolParameter.EXPLICIT_PARSING, + ) skip_validation = luigi.BoolParameter( default=False, parsing=luigi.BoolParameter.EXPLICIT_PARSING, diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index 90bba9ebf..9caa71e58 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -22,6 +22,7 @@ from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget +from v03_pipeline.lib.tasks.write_tdr_metrics_files import WriteTDRMetricsFilesTask from v03_pipeline.lib.tasks.write_validation_errors_for_run import ( WriteValidationErrorsForRunTask, ) @@ -60,6 +61,17 @@ def requires(self) -> list[luigi.Task]: ), ), ] + if ( + Env.EXPECT_TDR_METRICS + and not self.skip_expect_tdr_metrics + and 
self.dataset_type.expect_tdr_metrics( + self.reference_genome, + ) + ): + requirements = [ + *requirements, + self.clone(WriteTDRMetricsFilesTask), + ] return [ *requirements, CallsetTask(self.callset_path), diff --git a/v03_pipeline/lib/tasks/write_sex_check_table.py b/v03_pipeline/lib/tasks/write_sex_check_table.py index b8b4bb9e7..f87233507 100644 --- a/v03_pipeline/lib/tasks/write_sex_check_table.py +++ b/v03_pipeline/lib/tasks/write_sex_check_table.py @@ -1,10 +1,12 @@ import hail as hl +import hailtop.fs as hfs import luigi from v03_pipeline.lib.misc.io import import_imputed_sex -from v03_pipeline.lib.paths import imputed_sex_path, sex_check_table_path +from v03_pipeline.lib.paths import sex_check_table_path, tdr_metrics_dir from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask -from v03_pipeline.lib.tasks.files import GCSorLocalTarget, RawFileTask +from v03_pipeline.lib.tasks.files import GCSorLocalTarget +from v03_pipeline.lib.tasks.write_tdr_metrics_files import WriteTDRMetricsFilesTask class WriteSexCheckTableTask(BaseWriteTask): @@ -20,13 +22,15 @@ def output(self) -> luigi.Target: ) def requires(self) -> luigi.Task: - return RawFileTask( - imputed_sex_path( - self.reference_genome, - self.dataset_type, - self.callset_path, - ), - ) + return self.clone(WriteTDRMetricsFilesTask) def create_table(self) -> hl.Table: - return import_imputed_sex(self.input().path) + ht = None + for tdr_metrics_file in hfs.ls( + tdr_metrics_dir(self.reference_genome, self.dataset_type), + ): + if not ht: + ht = import_imputed_sex(tdr_metrics_file.path) + continue + ht = ht.union(import_imputed_sex(tdr_metrics_file.path)) + return ht diff --git a/v03_pipeline/lib/tasks/write_sex_check_table_test.py b/v03_pipeline/lib/tasks/write_sex_check_table_test.py new file mode 100644 index 000000000..2c935f44c --- /dev/null +++ b/v03_pipeline/lib/tasks/write_sex_check_table_test.py @@ -0,0 +1,96 @@ +from unittest.mock import Mock, patch + +import google.cloud.bigquery +import hail as hl +import luigi.worker + +from v03_pipeline.lib.model import DatasetType, ReferenceGenome +from v03_pipeline.lib.paths import sex_check_table_path, tdr_metrics_path +from v03_pipeline.lib.tasks.write_sex_check_table import ( + WriteSexCheckTableTask, +) +from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase + + +class WriteSexCheckTableTaskTest(MockedDatarootTestCase): + @patch('v03_pipeline.lib.tasks.write_tdr_metrics_files.gen_bq_table_names') + @patch('v03_pipeline.lib.tasks.write_tdr_metrics_file.bq_metrics_query') + def test_snv_sex_check_table_task( + self, + mock_bq_metrics_query: Mock, + mock_gen_bq_table_names: Mock, + ) -> None: + mock_gen_bq_table_names.return_value = [ + 'datarepo-7242affb.datarepo_RP_3053', + 'datarepo-5a72e31b.datarepo_RP_3056', + ] + mock_bq_metrics_query.side_effect = [ + iter( + [ + google.cloud.bigquery.table.Row( + ('SM-NJ8MF', 'Unknown'), + {'collaborator_sample_id': 0, 'predicted_sex': 1}, + ), + google.cloud.bigquery.table.Row( + ('SM-MWOGC', 'Female'), + {'collaborator_sample_id': 0, 'predicted_sex': 1}, + ), + google.cloud.bigquery.table.Row( + ('SM-MWKWL', 'Male'), + {'collaborator_sample_id': 0, 'predicted_sex': 1}, + ), + ], + ), + iter( + [ + google.cloud.bigquery.table.Row( + ('SM-NGE65', 'Male'), + {'collaborator_sample_id': 0, 'predicted_sex': 1}, + ), + google.cloud.bigquery.table.Row( + ('SM-NGE5G', 'Male'), + {'collaborator_sample_id': 0, 'predicted_sex': 1}, + ), + google.cloud.bigquery.table.Row( + ('SM-NC6LM', 'Male'), + 
{'collaborator_sample_id': 0, 'predicted_sex': 1}, + ), + ], + ), + ] + worker = luigi.worker.Worker() + write_sex_check_table = WriteSexCheckTableTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + callset_path='na', + ) + worker.add(write_sex_check_table) + worker.run() + self.assertTrue(write_sex_check_table.complete()) + sex_check_ht = hl.read_table( + sex_check_table_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + 'na', + ), + ) + self.assertEqual( + sex_check_ht.collect(), + [ + hl.Struct(s='SM-MWKWL', predicted_sex='M'), + hl.Struct(s='SM-MWOGC', predicted_sex='F'), + hl.Struct(s='SM-NC6LM', predicted_sex='M'), + hl.Struct(s='SM-NGE5G', predicted_sex='M'), + hl.Struct(s='SM-NGE65', predicted_sex='M'), + hl.Struct(s='SM-NJ8MF', predicted_sex='U'), + ], + ) + # Check underlying tdr metrics file. + with open( + tdr_metrics_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + 'datarepo-5a72e31b.datarepo_RP_3056', + ), + ) as f: + self.assertTrue('collaborator_sample_id' in f.read()) diff --git a/v03_pipeline/lib/tasks/write_tdr_metrics_file.py b/v03_pipeline/lib/tasks/write_tdr_metrics_file.py new file mode 100644 index 000000000..b1c7ad9c0 --- /dev/null +++ b/v03_pipeline/lib/tasks/write_tdr_metrics_file.py @@ -0,0 +1,35 @@ +import csv + +import luigi +import luigi.util + +from v03_pipeline.lib.misc.terra_data_repository import ( + BIGQUERY_METRICS, + bq_metrics_query, +) +from v03_pipeline.lib.paths import tdr_metrics_path +from v03_pipeline.lib.tasks.base.base_loading_pipeline_params import ( + BaseLoadingPipelineParams, +) +from v03_pipeline.lib.tasks.files import GCSorLocalTarget + + +@luigi.util.inherits(BaseLoadingPipelineParams) +class WriteTDRMetricsFileTask(luigi.Task): + bq_table_name = luigi.Parameter() + + def output(self) -> luigi.Target: + return GCSorLocalTarget( + tdr_metrics_path( + self.reference_genome, + self.dataset_type, + self.bq_table_name, + ), + ) + + def run(self): + with self.output().open('w') as f: + writer = csv.DictWriter(f, fieldnames=BIGQUERY_METRICS, delimiter='\t') + writer.writeheader() + for row in bq_metrics_query(self.bq_table_name): + writer.writerow(row) diff --git a/v03_pipeline/lib/tasks/write_tdr_metrics_files.py b/v03_pipeline/lib/tasks/write_tdr_metrics_files.py new file mode 100644 index 000000000..c2f0d773e --- /dev/null +++ b/v03_pipeline/lib/tasks/write_tdr_metrics_files.py @@ -0,0 +1,28 @@ +import luigi +import luigi.util + +from v03_pipeline.lib.misc.terra_data_repository import gen_bq_table_names +from v03_pipeline.lib.tasks.base.base_loading_pipeline_params import ( + BaseLoadingPipelineParams, +) +from v03_pipeline.lib.tasks.write_tdr_metrics_file import WriteTDRMetricsFileTask + + +@luigi.util.inherits(BaseLoadingPipelineParams) +class WriteTDRMetricsFilesTask(luigi.Task): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.dynamic_write_tdr_metrics_file_task = set() + + def complete(self) -> bool: + return len(self.dynamic_write_tdr_metrics_file_task) >= 1 and all( + write_tdr_metrics_file_task.complete() + for write_tdr_metrics_file_task in self.dynamic_write_tdr_metrics_file_task + ) + + def run(self): + for bq_table_name in gen_bq_table_names(): + self.dynamic_write_tdr_metrics_file_task.add( + self.clone(WriteTDRMetricsFileTask, bq_table_name=bq_table_name), + ) + yield self.dynamic_write_tdr_metrics_file_task From 0eef8976a97d36570922fbf146533e2e77a2ef60 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 13 Dec 2024 17:20:58 
-0500 Subject: [PATCH 04/10] refactor: Move feature flags to FeatureFlag enum. (#1002) * refactor: Move feature flags out of environment to their own dataclass * lint: ruff * ruff --- v03_pipeline/bin/pipeline_worker.py | 4 +-- v03_pipeline/lib/model/__init__.py | 2 ++ v03_pipeline/lib/model/environment.py | 20 -------------- v03_pipeline/lib/model/feature_flag.py | 26 +++++++++++++++++++ v03_pipeline/lib/paths.py | 9 ++++--- v03_pipeline/lib/paths_test.py | 10 ++++--- .../reference_datasets/reference_dataset.py | 9 +++++-- .../tasks/dataproc/create_dataproc_cluster.py | 13 ++++++---- ...annotations_table_with_new_samples_test.py | 6 ++--- v03_pipeline/lib/tasks/validate_callset.py | 6 ++--- .../lib/tasks/write_imported_callset.py | 8 +++--- .../write_remapped_and_subsetted_callset.py | 6 ++--- ...ite_remapped_and_subsetted_callset_test.py | 6 ++--- 13 files changed, 72 insertions(+), 53 deletions(-) create mode 100644 v03_pipeline/lib/model/feature_flag.py diff --git a/v03_pipeline/bin/pipeline_worker.py b/v03_pipeline/bin/pipeline_worker.py index 7586fe8f6..7be97a9b0 100755 --- a/v03_pipeline/bin/pipeline_worker.py +++ b/v03_pipeline/bin/pipeline_worker.py @@ -7,7 +7,7 @@ from v03_pipeline.api.model import LoadingPipelineRequest from v03_pipeline.lib.logger import get_logger -from v03_pipeline.lib.model import Env +from v03_pipeline.lib.model import FeatureFlag from v03_pipeline.lib.paths import ( loading_pipeline_queue_path, project_pedigree_path, @@ -54,7 +54,7 @@ def main(): 'run_id': run_id, **{k: v for k, v in lpr.model_dump().items() if k != 'projects_to_run'}, } - if Env.SHOULD_TRIGGER_HAIL_BACKEND_RELOAD: + if FeatureFlag.SHOULD_TRIGGER_HAIL_BACKEND_RELOAD: tasks = [ TriggerHailBackendReload(**loading_run_task_params), ] diff --git a/v03_pipeline/lib/model/__init__.py b/v03_pipeline/lib/model/__init__.py index bb4325d47..45847f4f5 100644 --- a/v03_pipeline/lib/model/__init__.py +++ b/v03_pipeline/lib/model/__init__.py @@ -7,11 +7,13 @@ Sex, ) from v03_pipeline.lib.model.environment import Env +from v03_pipeline.lib.model.feature_flag import FeatureFlag __all__ = [ 'AccessControl', 'DatasetType', 'Env', + 'FeatureFlag', 'Sex', 'PipelineVersion', 'ReferenceGenome', diff --git a/v03_pipeline/lib/model/environment.py b/v03_pipeline/lib/model/environment.py index e307b0226..42ae4c658 100644 --- a/v03_pipeline/lib/model/environment.py +++ b/v03_pipeline/lib/model/environment.py @@ -51,30 +51,12 @@ GCLOUD_REGION = os.environ.get('GCLOUD_REGION') PIPELINE_RUNNER_APP_VERSION = os.environ.get('PIPELINE_RUNNER_APP_VERSION', 'latest') -# Feature Flags -ACCESS_PRIVATE_REFERENCE_DATASETS = ( - os.environ.get('ACCESS_PRIVATE_REFERENCE_DATASETS') == '1' -) -CHECK_SEX_AND_RELATEDNESS = os.environ.get('CHECK_SEX_AND_RELATEDNESS') == '1' -EXPECT_TDR_METRICS = os.environ.get('EXPECT_TDR_METRICS') == '1' -EXPECT_WES_FILTERS = os.environ.get('EXPECT_WES_FILTERS') == '1' -INCLUDE_PIPELINE_VERSION_IN_PREFIX = ( - os.environ.get('INCLUDE_PIPELINE_VERSION_IN_PREFIX') == '1' -) -SHOULD_TRIGGER_HAIL_BACKEND_RELOAD = ( - os.environ.get('SHOULD_TRIGGER_HAIL_BACKEND_RELOAD') == '1' -) - @dataclass class Env: - ACCESS_PRIVATE_REFERENCE_DATASETS: bool = ACCESS_PRIVATE_REFERENCE_DATASETS - CHECK_SEX_AND_RELATEDNESS: bool = CHECK_SEX_AND_RELATEDNESS CLINGEN_ALLELE_REGISTRY_LOGIN: str | None = CLINGEN_ALLELE_REGISTRY_LOGIN CLINGEN_ALLELE_REGISTRY_PASSWORD: str | None = CLINGEN_ALLELE_REGISTRY_PASSWORD DEPLOYMENT_TYPE: Literal['dev', 'prod'] = DEPLOYMENT_TYPE - EXPECT_TDR_METRICS: bool = EXPECT_TDR_METRICS - 
EXPECT_WES_FILTERS: bool = EXPECT_WES_FILTERS GCLOUD_DATAPROC_SECONDARY_WORKERS: str = GCLOUD_DATAPROC_SECONDARY_WORKERS GCLOUD_PROJECT: str | None = GCLOUD_PROJECT GCLOUD_ZONE: str | None = GCLOUD_ZONE @@ -85,10 +67,8 @@ class Env: HAIL_BACKEND_SERVICE_PORT: int = HAIL_BACKEND_SERVICE_PORT HAIL_TMP_DIR: str = HAIL_TMP_DIR HAIL_SEARCH_DATA_DIR: str = HAIL_SEARCH_DATA_DIR - INCLUDE_PIPELINE_VERSION_IN_PREFIX: bool = INCLUDE_PIPELINE_VERSION_IN_PREFIX LOADING_DATASETS_DIR: str = LOADING_DATASETS_DIR PIPELINE_RUNNER_APP_VERSION: str = PIPELINE_RUNNER_APP_VERSION PRIVATE_REFERENCE_DATASETS_DIR: str = PRIVATE_REFERENCE_DATASETS_DIR REFERENCE_DATASETS_DIR: str = REFERENCE_DATASETS_DIR - SHOULD_TRIGGER_HAIL_BACKEND_RELOAD: bool = SHOULD_TRIGGER_HAIL_BACKEND_RELOAD VEP_REFERENCE_DATASETS_DIR: str = VEP_REFERENCE_DATASETS_DIR diff --git a/v03_pipeline/lib/model/feature_flag.py b/v03_pipeline/lib/model/feature_flag.py new file mode 100644 index 000000000..0b57e8643 --- /dev/null +++ b/v03_pipeline/lib/model/feature_flag.py @@ -0,0 +1,26 @@ +import os +from dataclasses import dataclass + +# Feature Flags +ACCESS_PRIVATE_REFERENCE_DATASETS = ( + os.environ.get('ACCESS_PRIVATE_REFERENCE_DATASETS') == '1' +) +CHECK_SEX_AND_RELATEDNESS = os.environ.get('CHECK_SEX_AND_RELATEDNESS') == '1' +EXPECT_TDR_METRICS = os.environ.get('EXPECT_TDR_METRICS') == '1' +EXPECT_WES_FILTERS = os.environ.get('EXPECT_WES_FILTERS') == '1' +INCLUDE_PIPELINE_VERSION_IN_PREFIX = ( + os.environ.get('INCLUDE_PIPELINE_VERSION_IN_PREFIX') == '1' +) +SHOULD_TRIGGER_HAIL_BACKEND_RELOAD = ( + os.environ.get('SHOULD_TRIGGER_HAIL_BACKEND_RELOAD') == '1' +) + + +@dataclass +class FeatureFlag: + ACCESS_PRIVATE_REFERENCE_DATASETS: bool = ACCESS_PRIVATE_REFERENCE_DATASETS + CHECK_SEX_AND_RELATEDNESS: bool = CHECK_SEX_AND_RELATEDNESS + EXPECT_TDR_METRICS: bool = EXPECT_TDR_METRICS + EXPECT_WES_FILTERS: bool = EXPECT_WES_FILTERS + INCLUDE_PIPELINE_VERSION_IN_PREFIX: bool = INCLUDE_PIPELINE_VERSION_IN_PREFIX + SHOULD_TRIGGER_HAIL_BACKEND_RELOAD: bool = SHOULD_TRIGGER_HAIL_BACKEND_RELOAD diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py index 5a54880fd..2942b1dae 100644 --- a/v03_pipeline/lib/paths.py +++ b/v03_pipeline/lib/paths.py @@ -6,6 +6,7 @@ AccessControl, DatasetType, Env, + FeatureFlag, PipelineVersion, ReferenceGenome, SampleType, @@ -21,7 +22,7 @@ def _pipeline_prefix( reference_genome: ReferenceGenome, dataset_type: DatasetType, ) -> str: - if Env.INCLUDE_PIPELINE_VERSION_IN_PREFIX: + if FeatureFlag.INCLUDE_PIPELINE_VERSION_IN_PREFIX: return os.path.join( root, PipelineVersion.V3_1.value, @@ -45,7 +46,7 @@ def _v03_reference_data_prefix( if access_control == AccessControl.PRIVATE else Env.REFERENCE_DATASETS_DIR ) - if Env.INCLUDE_PIPELINE_VERSION_IN_PREFIX: + if FeatureFlag.INCLUDE_PIPELINE_VERSION_IN_PREFIX: return os.path.join( root, PipelineVersion.V03.value, @@ -68,7 +69,7 @@ def _v03_reference_dataset_prefix( if access_control == AccessControl.PRIVATE else Env.REFERENCE_DATASETS_DIR ) - if Env.INCLUDE_PIPELINE_VERSION_IN_PREFIX: + if FeatureFlag.INCLUDE_PIPELINE_VERSION_IN_PREFIX: return os.path.join( root, PipelineVersion.V3_1.value, @@ -287,7 +288,7 @@ def valid_filters_path( callset_path: str, ) -> str | None: if ( - not Env.EXPECT_WES_FILTERS + not FeatureFlag.EXPECT_WES_FILTERS or not dataset_type.expect_filters(sample_type) or 'part_one_outputs' not in callset_path ): diff --git a/v03_pipeline/lib/paths_test.py b/v03_pipeline/lib/paths_test.py index 8c5f097f4..eac76232f 100644 --- 
a/v03_pipeline/lib/paths_test.py +++ b/v03_pipeline/lib/paths_test.py @@ -36,7 +36,9 @@ def test_family_table_path(self) -> None: ), '/var/seqr/seqr-hail-search-data/v3.1/GRCh37/SNV_INDEL/families/WES/franklin.ht', ) - with patch('v03_pipeline.lib.paths.Env') as mock_env: + with patch('v03_pipeline.lib.paths.Env') as mock_env, patch( + 'v03_pipeline.lib.paths.FeatureFlag', + ) as mock_ff: mock_env.HAIL_SEARCH_DATA_DIR = 'gs://seqr-datasets/' self.assertEqual( family_table_path( @@ -47,7 +49,7 @@ def test_family_table_path(self) -> None: ), 'gs://seqr-datasets/v3.1/GRCh37/SNV_INDEL/families/WES/franklin.ht', ) - mock_env.INCLUDE_PIPELINE_VERSION_IN_PREFIX = False + mock_ff.INCLUDE_PIPELINE_VERSION_IN_PREFIX = False self.assertEqual( family_table_path( ReferenceGenome.GRCh37, @@ -67,8 +69,8 @@ def test_valid_filters_path(self) -> None: ), None, ) - with patch('v03_pipeline.lib.paths.Env') as mock_env: - mock_env.EXPECT_WES_FILTERS = True + with patch('v03_pipeline.lib.paths.FeatureFlag') as mock_ff: + mock_ff.EXPECT_WES_FILTERS = True self.assertEqual( valid_filters_path( DatasetType.SNV_INDEL, diff --git a/v03_pipeline/lib/reference_datasets/reference_dataset.py b/v03_pipeline/lib/reference_datasets/reference_dataset.py index 47480e723..445e191aa 100644 --- a/v03_pipeline/lib/reference_datasets/reference_dataset.py +++ b/v03_pipeline/lib/reference_datasets/reference_dataset.py @@ -10,7 +10,12 @@ validate_allele_type, validate_no_duplicate_variants, ) -from v03_pipeline.lib.model import AccessControl, DatasetType, Env, ReferenceGenome +from v03_pipeline.lib.model import ( + AccessControl, + DatasetType, + FeatureFlag, + ReferenceGenome, +) from v03_pipeline.lib.reference_datasets import clinvar, dbnsfp from v03_pipeline.lib.reference_datasets.misc import ( compress_floats, @@ -41,7 +46,7 @@ def for_reference_genome_dataset_type( for dataset, config in CONFIG.items() if dataset_type in config.get(reference_genome, {}).get(DATASET_TYPES, []) ] - if not Env.ACCESS_PRIVATE_REFERENCE_DATASETS: + if not FeatureFlag.ACCESS_PRIVATE_REFERENCE_DATASETS: return { dataset for dataset in reference_datasets diff --git a/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster.py b/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster.py index f7d0bc21d..27693c5d3 100644 --- a/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster.py +++ b/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster.py @@ -7,7 +7,7 @@ from v03_pipeline.lib.logger import get_logger from v03_pipeline.lib.misc.gcp import get_service_account_credentials -from v03_pipeline.lib.model import Env, ReferenceGenome +from v03_pipeline.lib.model import Env, FeatureFlag, ReferenceGenome from v03_pipeline.lib.tasks.base.base_loading_pipeline_params import ( BaseLoadingPipelineParams, ) @@ -88,18 +88,21 @@ def get_cluster_config(reference_genome: ReferenceGenome, run_id: str): 'spark:spark.executorEnv.HAIL_WORKER_OFF_HEAP_MEMORY_PER_CORE_MB': '6323', 'spark:spark.speculation': 'true', 'spark-env:ACCESS_PRIVATE_REFERENCE_DATASETS': '1' - if Env.ACCESS_PRIVATE_REFERENCE_DATASETS + if FeatureFlag.ACCESS_PRIVATE_REFERENCE_DATASETS else '0', 'spark-env:CHECK_SEX_AND_RELATEDNESS': '1' - if Env.CHECK_SEX_AND_RELATEDNESS + if FeatureFlag.CHECK_SEX_AND_RELATEDNESS + else '0', + 'spark-env:EXPECT_TDR_METRICS': '1' + if FeatureFlag.EXPECT_TDR_METRICS else '0', 'spark-env:EXPECT_WES_FILTERS': '1' - if Env.EXPECT_WES_FILTERS + if FeatureFlag.EXPECT_WES_FILTERS else '0', 'spark-env:HAIL_SEARCH_DATA_DIR': Env.HAIL_SEARCH_DATA_DIR, 
'spark-env:HAIL_TMP_DIR': Env.HAIL_TMP_DIR, 'spark-env:INCLUDE_PIPELINE_VERSION_IN_PREFIX': '1' - if Env.INCLUDE_PIPELINE_VERSION_IN_PREFIX + if FeatureFlag.INCLUDE_PIPELINE_VERSION_IN_PREFIX else '0', 'spark-env:LOADING_DATASETS_DIR': Env.LOADING_DATASETS_DIR, 'spark-env:PRIVATE_REFERENCE_DATASETS_DIR': Env.PRIVATE_REFERENCE_DATASETS_DIR, diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index d29b58ff6..47511c695 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -712,7 +712,7 @@ def test_update_vat_grch37( @patch( 'v03_pipeline.lib.tasks.write_new_variants_table.UpdateVariantAnnotationsTableWithUpdatedReferenceDataset', ) - @patch('v03_pipeline.lib.reference_datasets.reference_dataset.Env') + @patch('v03_pipeline.lib.reference_datasets.reference_dataset.FeatureFlag') @patch('v03_pipeline.lib.vep.hl.vep') @patch( 'v03_pipeline.lib.tasks.write_new_variants_table.load_gencode_ensembl_to_refseq_id', @@ -721,7 +721,7 @@ def test_update_vat_without_accessing_private_datasets( self, mock_load_gencode_ensembl_to_refseq_id: Mock, mock_vep: Mock, - mock_rd_env: Mock, + mock_rd_ff: Mock, mock_update_vat_with_rd_task: Mock, mock_register_alleles: Mock, ) -> None: @@ -740,7 +740,7 @@ def test_update_vat_without_accessing_private_datasets( ReferenceDataset.hgmd, ), ) - mock_rd_env.ACCESS_PRIVATE_REFERENCE_DATASETS = False + mock_rd_ff.ACCESS_PRIVATE_REFERENCE_DATASETS = False mock_vep.side_effect = lambda ht, **_: ht.annotate(vep=MOCK_38_VEP_DATA) mock_register_alleles.side_effect = None diff --git a/v03_pipeline/lib/tasks/validate_callset.py b/v03_pipeline/lib/tasks/validate_callset.py index e5601875b..3d0d979db 100644 --- a/v03_pipeline/lib/tasks/validate_callset.py +++ b/v03_pipeline/lib/tasks/validate_callset.py @@ -10,7 +10,7 @@ validate_no_duplicate_variants, validate_sample_type, ) -from v03_pipeline.lib.model.environment import Env +from v03_pipeline.lib.model.feature_flag import FeatureFlag from v03_pipeline.lib.paths import ( imported_callset_path, sex_check_table_path, @@ -41,7 +41,7 @@ def get_validation_dependencies(self) -> dict[str, hl.Table]: ), ) if ( - Env.CHECK_SEX_AND_RELATEDNESS + FeatureFlag.CHECK_SEX_AND_RELATEDNESS and self.dataset_type.check_sex_and_relatedness and not self.skip_check_sex_and_relatedness ): @@ -86,7 +86,7 @@ def requires(self) -> list[luigi.Task]: ), ] if ( - Env.CHECK_SEX_AND_RELATEDNESS + FeatureFlag.CHECK_SEX_AND_RELATEDNESS and self.dataset_type.check_sex_and_relatedness and not self.skip_check_sex_and_relatedness ): diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index 9caa71e58..cc827df52 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -14,7 +14,7 @@ validate_imported_field_types, ) from v03_pipeline.lib.misc.vets import annotate_vets -from v03_pipeline.lib.model.environment import Env +from v03_pipeline.lib.model.feature_flag import FeatureFlag from v03_pipeline.lib.paths import ( imported_callset_path, valid_filters_path, @@ -45,7 +45,7 @@ def output(self) -> luigi.Target: def requires(self) -> list[luigi.Task]: requirements = [] if ( - Env.EXPECT_WES_FILTERS + FeatureFlag.EXPECT_WES_FILTERS and not self.skip_expect_filters and self.dataset_type.expect_filters( 
self.sample_type, @@ -62,7 +62,7 @@ def requires(self) -> list[luigi.Task]: ), ] if ( - Env.EXPECT_TDR_METRICS + FeatureFlag.EXPECT_TDR_METRICS and not self.skip_expect_tdr_metrics and self.dataset_type.expect_tdr_metrics( self.reference_genome, @@ -87,7 +87,7 @@ def create_table(self) -> hl.MatrixTable: ) filters_path = None if ( - Env.EXPECT_WES_FILTERS + FeatureFlag.EXPECT_WES_FILTERS and not self.skip_expect_filters and self.dataset_type.expect_filters( self.sample_type, diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py index f4c934662..41b4c148e 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py @@ -16,7 +16,7 @@ ) from v03_pipeline.lib.misc.pedigree import parse_pedigree_ht_to_families from v03_pipeline.lib.misc.sample_ids import remap_sample_ids, subset_samples -from v03_pipeline.lib.model.environment import Env +from v03_pipeline.lib.model.feature_flag import FeatureFlag from v03_pipeline.lib.paths import ( relatedness_check_table_path, remapped_and_subsetted_callset_path, @@ -62,7 +62,7 @@ def requires(self) -> list[luigi.Task]: RawFileTask(self.project_pedigree_paths[self.project_i]), ] if ( - Env.CHECK_SEX_AND_RELATEDNESS + FeatureFlag.CHECK_SEX_AND_RELATEDNESS and self.dataset_type.check_sex_and_relatedness and not self.skip_check_sex_and_relatedness ): @@ -98,7 +98,7 @@ def create_table(self) -> hl.MatrixTable: families_failed_relatedness_check = {} families_failed_sex_check = {} if ( - Env.CHECK_SEX_AND_RELATEDNESS + FeatureFlag.CHECK_SEX_AND_RELATEDNESS and self.dataset_type.check_sex_and_relatedness and not self.skip_check_sex_and_relatedness ): diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py index 4a0c84660..5ac04c278 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py @@ -115,12 +115,12 @@ def test_write_remapped_and_subsetted_callset_task( ], ) - @patch('v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset.Env') + @patch('v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset.FeatureFlag') def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family( self, - mock_env: Mock, + mock_ff: Mock, ) -> None: - mock_env.CHECK_SEX_AND_RELATEDNESS = True + mock_ff.CHECK_SEX_AND_RELATEDNESS = True worker = luigi.worker.Worker() wrsc_task = WriteRemappedAndSubsettedCallsetTask( reference_genome=ReferenceGenome.GRCh38, From 2e8dbcfef31ee03a43e157d915287f6a7cf7349e Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 18 Dec 2024 14:30:22 -0800 Subject: [PATCH 05/10] bugfix: exclude samples from relationship checking that are not present in the expected loadable samples (#1003) * bugfix: exclude samples from relationship checking that are not present in the expected loadable samples * cleanup --- .../lib/misc/family_loading_failures.py | 115 +++++++----------- .../lib/misc/family_loading_failures_test.py | 50 +++++++- v03_pipeline/lib/misc/pedigree.py | 8 +- 3 files changed, 97 insertions(+), 76 deletions(-) diff --git a/v03_pipeline/lib/misc/family_loading_failures.py b/v03_pipeline/lib/misc/family_loading_failures.py index e3b1b59db..438b1706e 100644 --- a/v03_pipeline/lib/misc/family_loading_failures.py +++ b/v03_pipeline/lib/misc/family_loading_failures.py @@ 
-16,7 +16,8 @@ def passes_relatedness_check( relatedness_check_lookup: dict[tuple[str, str], list], sample_id: str, other_id: str, - relation: Relation, + expected_relation: Relation, + additional_allowed_relation: Relation | None, ) -> tuple[bool, str | None]: # No relationship to check, return true if other_id is None: @@ -24,85 +25,62 @@ def passes_relatedness_check( coefficients = relatedness_check_lookup.get( (min(sample_id, other_id), max(sample_id, other_id)), ) - if not coefficients or not np.allclose( - coefficients, - relation.coefficients, - atol=RELATEDNESS_TOLERANCE, + if not coefficients or not any( + np.allclose( + coefficients, + relation.coefficients, + atol=RELATEDNESS_TOLERANCE, + ) + for relation in ( + [expected_relation, additional_allowed_relation] + if additional_allowed_relation + else [expected_relation] + ) ): return ( False, - f'Sample {sample_id} has expected relation "{relation.value}" to {other_id} but has coefficients {coefficients or []}', + f'Sample {sample_id} has expected relation "{expected_relation.value}" to {other_id} but has coefficients {coefficients or []}', ) return True, None -def all_relatedness_checks( # noqa: C901 +def all_relatedness_checks( relatedness_check_lookup: dict[tuple[str, str], list], + family: Family, sample: Sample, ) -> list[str]: failure_reasons = [] - for parent_id in [sample.mother, sample.father]: - success, reason = passes_relatedness_check( - relatedness_check_lookup, - sample.sample_id, - parent_id, - Relation.PARENT, - ) - if not success: - failure_reasons.append(reason) - - for grandparent_id in [ - sample.maternal_grandmother, - sample.maternal_grandfather, - sample.paternal_grandmother, - sample.paternal_grandfather, + for relationship_set, relation, additional_allowed_relation in [ + ([sample.mother, sample.father], Relation.PARENT_CHILD, None), + ( + [ + sample.maternal_grandmother, + sample.maternal_grandfather, + sample.paternal_grandmother, + sample.paternal_grandfather, + ], + Relation.GRANDPARENT_GRANDCHILD, + None, + ), + (sample.siblings, Relation.SIBLING, None), + (sample.half_siblings, Relation.HALF_SIBLING, Relation.SIBLING), + (sample.aunt_nephews, Relation.AUNT_NEPHEW, None), ]: - success, reason = passes_relatedness_check( - relatedness_check_lookup, - sample.sample_id, - grandparent_id, - Relation.GRANDPARENT, - ) - if not success: - failure_reasons.append(reason) - - for sibling_id in sample.siblings: - success, reason = passes_relatedness_check( - relatedness_check_lookup, - sample.sample_id, - sibling_id, - Relation.SIBLING, - ) - if not success: - failure_reasons.append(reason) - - for half_sibling_id in sample.half_siblings: - # NB: A "half sibling" parsed from the pedigree may actually be a sibling, so we allow those - # through as well. - success1, _ = passes_relatedness_check( - relatedness_check_lookup, - sample.sample_id, - half_sibling_id, - Relation.SIBLING, - ) - success2, reason = passes_relatedness_check( - relatedness_check_lookup, - sample.sample_id, - half_sibling_id, - Relation.HALF_SIBLING, - ) - if not success1 and not success2: - failure_reasons.append(reason) - - for aunt_nephew_id in sample.aunt_nephews: - success, reason = passes_relatedness_check( - relatedness_check_lookup, - sample.sample_id, - aunt_nephew_id, - Relation.AUNT_NEPHEW, - ) - if not success: - failure_reasons.append(reason) + for other_id in relationship_set: + # Handle case where relation is identified in the + # pedigree as a "dummy" but is not included in + # the list of samples to load. 
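
# Aside: a minimal, standalone sketch of the coefficient comparison used by
# passes_relatedness_check above. Observed IBD coefficients
# [Z0, Z1, Z2, PI_HAT] must be close to the expected vector of at least one
# allowed relation. The vectors below mirror Relation.coefficients in this
# patch; the tolerance value is assumed for illustration only.
import numpy as np

EXPECTED_COEFFICIENTS = {
    'parent_child': [0, 1, 0, 0.5],
    'sibling': [0.25, 0.5, 0.25, 0.5],
    'half_sibling': [0.5, 0.5, 0, 0.25],
}

def passes(observed: list[float], allowed_relations: list[str]) -> bool:
    # True when the observed coefficients match any allowed relation.
    return any(
        np.allclose(observed, EXPECTED_COEFFICIENTS[relation], atol=0.1)
        for relation in allowed_relations
    )

# A pedigree "half sibling" may truly be a full sibling, so both are allowed:
assert passes([0.24, 0.52, 0.24, 0.5], ['half_sibling', 'sibling'])
# --- end sketch ---
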
+ if other_id not in family.samples: + continue + success, reason = passes_relatedness_check( + relatedness_check_lookup, + sample.sample_id, + other_id, + relation, + additional_allowed_relation, + ) + if not success: + failure_reasons.append(reason) return failure_reasons @@ -162,6 +140,7 @@ def get_families_failed_relatedness_check( for sample in family.samples.values(): failure_reasons = all_relatedness_checks( relatedness_check_lookup, + family, sample, ) if failure_reasons: diff --git a/v03_pipeline/lib/misc/family_loading_failures_test.py b/v03_pipeline/lib/misc/family_loading_failures_test.py index bcd783f5f..c9b1858b2 100644 --- a/v03_pipeline/lib/misc/family_loading_failures_test.py +++ b/v03_pipeline/lib/misc/family_loading_failures_test.py @@ -9,7 +9,7 @@ get_families_failed_sex_check, ) from v03_pipeline.lib.misc.io import import_pedigree -from v03_pipeline.lib.misc.pedigree import Sample, parse_pedigree_ht_to_families +from v03_pipeline.lib.misc.pedigree import Family, Sample, parse_pedigree_ht_to_families from v03_pipeline.lib.model import Sex TEST_PEDIGREE_6 = 'v03_pipeline/var/test/pedigrees/test_pedigree_6.tsv' @@ -104,7 +104,21 @@ def test_all_relatedness_checks(self): paternal_grandfather='sample_3', half_siblings=['sample_4'], ) - failure_reasons = all_relatedness_checks(relatedness_check_lookup, sample) + family = Family( + family_guid='family_1a', + samples={ + 'sample_1': sample, + 'sample_2': Sample(sex=Sex.MALE, sample_id='sample_2'), + 'sample_3': Sample(sex=Sex.MALE, sample_id='sample_3'), + 'sample_4': Sample(sex=Sex.MALE, sample_id='sample_4'), + 'sample_5': Sample(sex=Sex.MALE, sample_id='sample_5'), + }, + ) + failure_reasons = all_relatedness_checks( + relatedness_check_lookup, + family, + sample, + ) self.assertListEqual(failure_reasons, []) # Defined grandparent missing in relatedness table @@ -117,12 +131,13 @@ def test_all_relatedness_checks(self): ) failure_reasons = all_relatedness_checks( relatedness_check_lookup, + family, sample, ) self.assertListEqual( failure_reasons, [ - 'Sample sample_1 has expected relation "grandparent" to sample_5 but has coefficients []', + 'Sample sample_1 has expected relation "grandparent_grandchild" to sample_5 but has coefficients []', ], ) @@ -140,6 +155,7 @@ def test_all_relatedness_checks(self): ) failure_reasons = all_relatedness_checks( relatedness_check_lookup, + family, sample, ) self.assertListEqual( @@ -167,16 +183,42 @@ def test_all_relatedness_checks(self): ) failure_reasons = all_relatedness_checks( relatedness_check_lookup, + family, sample, ) self.assertListEqual( failure_reasons, [ - 'Sample sample_1 has expected relation "parent" to sample_2 but has coefficients [0.5, 0.5, 0.5, 0.5]', + 'Sample sample_1 has expected relation "parent_child" to sample_2 but has coefficients [0.5, 0.5, 0.5, 0.5]', 'Sample sample_1 has expected relation "sibling" to sample_4 but has coefficients [0.5, 0.5, 0, 0.25]', ], ) + # Some samples will include relationships with + # samples that are not expected to be included + # in the callset. These should not trigger relatedness + # failures. 
+ sample = Sample( + sex=Sex.FEMALE, + sample_id='sample_1', + mother='sample_2', + ) + family = Family( + family_guid='family_1a', + samples={ + 'sample_1': sample, + }, + ) + failure_reasons = all_relatedness_checks( + {}, + family, + sample, + ) + self.assertListEqual( + failure_reasons, + [], + ) + def test_get_families_failed_sex_check(self): sex_check_ht = hl.Table.parallelize( [ diff --git a/v03_pipeline/lib/misc/pedigree.py b/v03_pipeline/lib/misc/pedigree.py index ee2d86521..2e17f27f0 100644 --- a/v03_pipeline/lib/misc/pedigree.py +++ b/v03_pipeline/lib/misc/pedigree.py @@ -8,8 +8,8 @@ class Relation(Enum): - PARENT = 'parent' - GRANDPARENT = 'grandparent' + PARENT_CHILD = 'parent_child' + GRANDPARENT_GRANDCHILD = 'grandparent_grandchild' SIBLING = 'sibling' HALF_SIBLING = 'half_sibling' AUNT_NEPHEW = 'aunt_nephew' @@ -17,8 +17,8 @@ class Relation(Enum): @property def coefficients(self): return { - Relation.PARENT: [0, 1, 0, 0.5], - Relation.GRANDPARENT: [0.5, 0.5, 0, 0.25], + Relation.PARENT_CHILD: [0, 1, 0, 0.5], + Relation.GRANDPARENT_GRANDCHILD: [0.5, 0.5, 0, 0.25], Relation.SIBLING: [0.25, 0.5, 0.25, 0.5], Relation.HALF_SIBLING: [0.5, 0.5, 0, 0.25], Relation.AUNT_NEPHEW: [0.5, 0.5, 0, 0.25], From ddf867ac1edcb4a2aaff65532ae5d8a00a91288e Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 2 Jan 2025 10:48:11 -0500 Subject: [PATCH 06/10] =?UTF-8?q?feat:=20add=20remap=20and=20family=20load?= =?UTF-8?q?ing=20failures=20as=20validation=20exceptions=20=E2=80=A6=20(#1?= =?UTF-8?q?005)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: add remap and family loading failures as validation exceptions rather than runtime errors * move on * Update write_remapped_and_subsetted_callset_test.py * ruff --- v03_pipeline/lib/misc/sample_ids.py | 22 +-- v03_pipeline/lib/misc/sample_ids_test.py | 10 +- .../lib/tasks/validate_callset_test.py | 1 + .../lib/tasks/write_imported_callset.py | 108 ++++++------ .../write_remapped_and_subsetted_callset.py | 61 ++++--- ...ite_remapped_and_subsetted_callset_test.py | 161 ++++++++++++++++++ .../tasks/write_validation_errors_for_run.py | 34 ++++ .../var/test/pedigrees/test_pedigree_7.tsv | 14 ++ 8 files changed, 299 insertions(+), 112 deletions(-) create mode 100644 v03_pipeline/var/test/pedigrees/test_pedigree_7.tsv diff --git a/v03_pipeline/lib/misc/sample_ids.py b/v03_pipeline/lib/misc/sample_ids.py index 77f173ffc..7a1c2bc0b 100644 --- a/v03_pipeline/lib/misc/sample_ids.py +++ b/v03_pipeline/lib/misc/sample_ids.py @@ -3,28 +3,16 @@ import hail as hl from v03_pipeline.lib.logger import get_logger +from v03_pipeline.lib.misc.validation import SeqrValidationError logger = get_logger(__name__) -class MatrixTableSampleSetError(Exception): - def __init__(self, message, missing_samples): - super().__init__(message) - self.missing_samples = missing_samples - - -def vcf_remap(mt: hl.MatrixTable) -> hl.MatrixTable: - # TODO: add logic from Mike to remap vcf samples delivered from Broad WGS - return mt - - def remap_sample_ids( mt: hl.MatrixTable, project_remap_ht: hl.Table, ignore_missing_samples_when_remapping: bool, ) -> hl.MatrixTable: - mt = vcf_remap(mt) - collected_remap = project_remap_ht.collect() s_dups = [k for k, v in Counter([r.s for r in collected_remap]).items() if v > 1] seqr_dups = [ @@ -33,7 +21,7 @@ def remap_sample_ids( if len(s_dups) > 0 or len(seqr_dups) > 0: msg = f'Duplicate s or seqr_id entries in remap file were found. Duplicate s:{s_dups}. Duplicate seqr_id:{seqr_dups}.' 
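
# Aside: the duplicate detection in remap_sample_ids above reduces to a
# single Counter pass over the collected remap rows. In isolation (sample
# values hypothetical):
from collections import Counter

remap_s = ['s1', 's2', 's2', 's3']
s_dups = [k for k, v in Counter(remap_s).items() if v > 1]
assert s_dups == ['s2']
# --- end sketch ---
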
- raise ValueError(msg) + raise SeqrValidationError(msg) missing_samples = project_remap_ht.anti_join(mt.cols()).collect() remap_count = len(collected_remap) @@ -48,7 +36,7 @@ def remap_sample_ids( if ignore_missing_samples_when_remapping: logger.info(message) else: - raise MatrixTableSampleSetError(message, missing_samples) + raise SeqrValidationError(message) mt = mt.annotate_cols(**project_remap_ht[mt.s]) remap_expr = hl.if_else(hl.is_missing(mt.seqr_id), mt.s, mt.seqr_id) @@ -67,7 +55,7 @@ def subset_samples( anti_join_ht_count = anti_join_ht.count() if subset_count == 0: message = '0 sample ids found the subset HT, something is probably wrong.' - raise MatrixTableSampleSetError(message, []) + raise SeqrValidationError(message) if anti_join_ht_count != 0: missing_samples = anti_join_ht.s.collect() @@ -77,7 +65,7 @@ def subset_samples( f"IDs that aren't in the callset: {missing_samples}\n" f'All callset sample IDs:{mt.s.collect()}' ) - raise MatrixTableSampleSetError(message, missing_samples) + raise SeqrValidationError(message) logger.info(f'Subsetted to {subset_count} sample ids') mt = mt.semi_join_cols(sample_subset_ht) return mt.filter_rows(hl.agg.any(hl.is_defined(mt.GT))) diff --git a/v03_pipeline/lib/misc/sample_ids_test.py b/v03_pipeline/lib/misc/sample_ids_test.py index db264a1f9..770f5451e 100644 --- a/v03_pipeline/lib/misc/sample_ids_test.py +++ b/v03_pipeline/lib/misc/sample_ids_test.py @@ -3,10 +3,10 @@ import hail as hl from v03_pipeline.lib.misc.sample_ids import ( - MatrixTableSampleSetError, remap_sample_ids, subset_samples, ) +from v03_pipeline.lib.misc.validation import SeqrValidationError CALLSET_MT = hl.MatrixTable.from_parts( rows={'variants': [1, 2]}, @@ -76,7 +76,7 @@ def test_remap_sample_ids_remap_has_duplicate(self) -> None: key='s', ) - with self.assertRaises(ValueError): + with self.assertRaises(SeqrValidationError): remap_sample_ids( CALLSET_MT, project_remap_ht, @@ -99,7 +99,7 @@ def test_remap_sample_ids_remap_has_missing_samples(self) -> None: key='s', ) - with self.assertRaises(MatrixTableSampleSetError): + with self.assertRaises(SeqrValidationError): remap_sample_ids( CALLSET_MT, project_remap_ht, @@ -114,7 +114,7 @@ def test_subset_samples_zero_samples(self): key='s', ) - with self.assertRaises(MatrixTableSampleSetError): + with self.assertRaises(SeqrValidationError): subset_samples( CALLSET_MT, sample_subset_ht, @@ -132,7 +132,7 @@ def test_subset_samples_missing_samples(self): key='s', ) - with self.assertRaises(MatrixTableSampleSetError): + with self.assertRaises(SeqrValidationError): subset_samples( CALLSET_MT, sample_subset_ht, diff --git a/v03_pipeline/lib/tasks/validate_callset_test.py b/v03_pipeline/lib/tasks/validate_callset_test.py index 8f3638376..a6d2b0377 100644 --- a/v03_pipeline/lib/tasks/validate_callset_test.py +++ b/v03_pipeline/lib/tasks/validate_callset_test.py @@ -83,5 +83,6 @@ def test_validate_callset_multiple_exceptions( 'Missing the following expected contigs:chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr20, chr21, chr22, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX', 'Sample type validation error: dataset sample-type is specified as WES but appears to be WGS because it contains many common non-coding variants', ], + 'failed_family_samples': {}, }, ) diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index cc827df52..c544b4666 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ 
b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -10,7 +10,6 @@ split_multi_hts, ) from v03_pipeline.lib.misc.validation import ( - SeqrValidationError, validate_imported_field_types, ) from v03_pipeline.lib.misc.vets import annotate_vets @@ -24,7 +23,7 @@ from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget from v03_pipeline.lib.tasks.write_tdr_metrics_files import WriteTDRMetricsFilesTask from v03_pipeline.lib.tasks.write_validation_errors_for_run import ( - WriteValidationErrorsForRunTask, + with_persisted_validation_errors, ) @@ -77,65 +76,56 @@ def requires(self) -> list[luigi.Task]: CallsetTask(self.callset_path), ] + @with_persisted_validation_errors def create_table(self) -> hl.MatrixTable: - try: - # NB: throws SeqrValidationError - mt = import_callset( - self.callset_path, - self.reference_genome, - self.dataset_type, - ) - filters_path = None - if ( - FeatureFlag.EXPECT_WES_FILTERS - and not self.skip_expect_filters - and self.dataset_type.expect_filters( - self.sample_type, - ) - ): - filters_path = valid_filters_path( - self.dataset_type, - self.sample_type, - self.callset_path, - ) - filters_ht = import_vcf(filters_path, self.reference_genome).rows() - mt = mt.annotate_rows(filters=filters_ht[mt.row_key].filters) - additional_row_fields = get_additional_row_fields( - mt, - self.dataset_type, - self.skip_check_sex_and_relatedness, + # NB: throws SeqrValidationError + mt = import_callset( + self.callset_path, + self.reference_genome, + self.dataset_type, + ) + filters_path = None + if ( + FeatureFlag.EXPECT_WES_FILTERS + and not self.skip_expect_filters + and self.dataset_type.expect_filters( + self.sample_type, ) - # NB: throws SeqrValidationError - mt = select_relevant_fields( - mt, + ): + filters_path = valid_filters_path( self.dataset_type, - additional_row_fields, + self.sample_type, + self.callset_path, ) - # This validation isn't override-able by the skip option. - # If a field is the wrong type, the pipeline will likely hard-fail downstream. + filters_ht = import_vcf(filters_path, self.reference_genome).rows() + mt = mt.annotate_rows(filters=filters_ht[mt.row_key].filters) + additional_row_fields = get_additional_row_fields( + mt, + self.dataset_type, + self.skip_check_sex_and_relatedness, + ) + # NB: throws SeqrValidationError + mt = select_relevant_fields( + mt, + self.dataset_type, + additional_row_fields, + ) + # This validation isn't override-able by the skip option. + # If a field is the wrong type, the pipeline will likely hard-fail downstream. + # NB: throws SeqrValidationError + validate_imported_field_types( + mt, + self.dataset_type, + additional_row_fields, + ) + if self.dataset_type.has_multi_allelic_variants: # NB: throws SeqrValidationError - validate_imported_field_types( - mt, - self.dataset_type, - additional_row_fields, - ) - if self.dataset_type.has_multi_allelic_variants: - # NB: throws SeqrValidationError - mt = split_multi_hts(mt, self.skip_validation) - # Special handling of variant-level filter annotation for VETs filters. - # The annotations are present on the sample-level FT field but are - # expected upstream on "filters". 
- mt = annotate_vets(mt) - return mt.select_globals( - callset_path=self.callset_path, - filters_path=filters_path or hl.missing(hl.tstr), - ) - except SeqrValidationError as e: - write_validation_errors_for_run_task = self.clone( - WriteValidationErrorsForRunTask, - error_messages=[str(e)], - ) - write_validation_errors_for_run_task.run() - raise SeqrValidationError( - write_validation_errors_for_run_task.to_single_error_message(), - ) from e + mt = split_multi_hts(mt, self.skip_validation) + # Special handling of variant-level filter annotation for VETs filters. + # The annotations are present on the sample-level FT field but are + # expected upstream on "filters". + mt = annotate_vets(mt) + return mt.select_globals( + callset_path=self.callset_path, + filters_path=filters_path or hl.missing(hl.tstr), + ) diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py index 41b4c148e..f099dcfcc 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py @@ -2,7 +2,6 @@ import luigi import luigi.util -from v03_pipeline.lib.logger import get_logger from v03_pipeline.lib.misc.family_loading_failures import ( get_families_failed_missing_samples, get_families_failed_relatedness_check, @@ -16,6 +15,7 @@ ) from v03_pipeline.lib.misc.pedigree import parse_pedigree_ht_to_families from v03_pipeline.lib.misc.sample_ids import remap_sample_ids, subset_samples +from v03_pipeline.lib.misc.validation import SeqrValidationError from v03_pipeline.lib.model.feature_flag import FeatureFlag from v03_pipeline.lib.paths import ( relatedness_check_table_path, @@ -29,8 +29,19 @@ WriteRelatednessCheckTsvTask, ) from v03_pipeline.lib.tasks.write_sex_check_table import WriteSexCheckTableTask +from v03_pipeline.lib.tasks.write_validation_errors_for_run import ( + with_persisted_validation_errors, +) + -logger = get_logger(__name__) +def format_failures(failed_families): + return { + f.family_guid: { + 'samples': sorted(f.samples.keys()), + 'reasons': reasons, + } + for f, reasons in failed_families.items() + } @luigi.util.inherits(BaseLoadingRunParams) @@ -73,6 +84,7 @@ def requires(self) -> list[luigi.Task]: ] return requirements + @with_persisted_validation_errors def create_table(self) -> hl.MatrixTable: callset_mt = hl.read_matrix_table(self.input()[0].path) pedigree_ht = import_pedigree(self.input()[1].path) @@ -130,16 +142,21 @@ def create_table(self) -> hl.MatrixTable: - families_failed_sex_check.keys() ) if not len(loadable_families): - msg = ( - f'families_failed_missing_samples: {families_failed_missing_samples}\n' - f'families_failed_relatedness_check: {families_failed_relatedness_check}\n' - f'families_failed_sex_check: {families_failed_sex_check}' - ) - logger.info( + msg = 'All families failed validation checks' + raise SeqrValidationError( msg, + { + 'failed_family_samples': { + 'missing_samples': format_failures( + families_failed_missing_samples, + ), + 'relatedness_check': format_failures( + families_failed_relatedness_check, + ), + 'sex_check': format_failures(families_failed_sex_check), + }, + }, ) - msg = 'All families failed checks' - raise RuntimeError(msg) mt = subset_samples( callset_mt, @@ -172,33 +189,15 @@ def create_table(self) -> hl.MatrixTable: ), failed_family_samples=hl.Struct( missing_samples=( - { - f.family_guid: { - 'samples': sorted(f.samples.keys()), - 'reasons': reasons, - } - for f, reasons in 
families_failed_missing_samples.items() - } + format_failures(families_failed_missing_samples) or hl.empty_dict(hl.tstr, hl.tdict(hl.tstr, hl.tarray(hl.tstr))) ), relatedness_check=( - { - f.family_guid: { - 'samples': sorted(f.samples.keys()), - 'reasons': reasons, - } - for f, reasons in families_failed_relatedness_check.items() - } + format_failures(families_failed_relatedness_check) or hl.empty_dict(hl.tstr, hl.tdict(hl.tstr, hl.tarray(hl.tstr))) ), sex_check=( - { - f.family_guid: { - 'samples': sorted(f.samples.keys()), - 'reasons': reasons, - } - for f, reasons in families_failed_sex_check.items() - } + format_failures(families_failed_sex_check) or hl.empty_dict(hl.tstr, hl.tdict(hl.tstr, hl.tarray(hl.tstr))) ), ), diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py index 5ac04c278..a5ed24799 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py @@ -1,3 +1,4 @@ +import json import shutil from unittest.mock import Mock, patch @@ -10,12 +11,16 @@ from v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset import ( WriteRemappedAndSubsettedCallsetTask, ) +from v03_pipeline.lib.tasks.write_validation_errors_for_run import ( + WriteValidationErrorsForRunTask, +) from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' TEST_REMAP = 'v03_pipeline/var/test/remaps/test_remap_1.tsv' TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv' TEST_PEDIGREE_4 = 'v03_pipeline/var/test/pedigrees/test_pedigree_4.tsv' +TEST_PEDIGREE_7 = 'v03_pipeline/var/test/pedigrees/test_pedigree_7.tsv' TEST_SEX_CHECK_1 = 'v03_pipeline/var/test/sex_check/test_sex_check_1.ht' TEST_RELATEDNESS_CHECK_1 = ( 'v03_pipeline/var/test/relatedness_check/test_relatedness_check_1.ht' @@ -179,3 +184,159 @@ def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family( ), ], ) + + @patch('v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset.FeatureFlag') + def test_write_remapped_and_subsetted_callset_task_all_families_failed( + self, + mock_ff: Mock, + ) -> None: + mock_ff.CHECK_SEX_AND_RELATEDNESS = True + worker = luigi.worker.Worker() + wrsc_task = WriteRemappedAndSubsettedCallsetTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + run_id=TEST_RUN_ID, + sample_type=SampleType.WGS, + callset_path=TEST_VCF, + project_guids=['R0114_project4'], + project_remap_paths=[TEST_REMAP], + project_pedigree_paths=[TEST_PEDIGREE_7], + project_i=0, + skip_validation=True, + ) + worker.add(wrsc_task) + worker.run() + self.assertFalse(wrsc_task.complete()) + write_validation_errors_task = WriteValidationErrorsForRunTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WES, + callset_path=TEST_VCF, + project_guids=['R0114_project4'], + skip_validation=True, + run_id=TEST_RUN_ID, + ) + self.assertTrue(write_validation_errors_task.complete()) + with write_validation_errors_task.output().open('r') as f: + self.assertDictEqual( + json.load(f), + { + 'project_guids': [ + 'R0114_project4', + ], + 'error_messages': [ + 'All families failed validation checks', + ], + 'failed_family_samples': { + 'missing_samples': { + 'efg_1': { + 'samples': [ + 'NA99999_1', + ], + 'reasons': [ + "Missing samples: {'NA99999_1'}", + ], + }, + }, + 
'relatedness_check': {}, + 'sex_check': { + '789_1': { + 'samples': [ + 'NA20875_1', + ], + 'reasons': [ + 'Sample NA20875_1 has pedigree sex M but imputed sex F', + ], + }, + '456_1': { + 'samples': [ + 'NA20870_1', + ], + 'reasons': [ + 'Sample NA20870_1 has pedigree sex M but imputed sex F', + ], + }, + '123_1': { + 'samples': [ + 'NA19675_1', + ], + 'reasons': [ + 'Sample NA19675_1 has pedigree sex M but imputed sex F', + ], + }, + 'cde_1': { + 'samples': [ + 'NA20881_1', + ], + 'reasons': [ + 'Sample NA20881_1 has pedigree sex F but imputed sex M', + ], + }, + '901_1': { + 'samples': [ + 'NA20877_1', + ], + 'reasons': [ + 'Sample NA20877_1 has pedigree sex M but imputed sex F', + ], + }, + '678_1': { + 'samples': [ + 'NA20874_1', + ], + 'reasons': [ + 'Sample NA20874_1 has pedigree sex M but imputed sex F', + ], + }, + '345_1': { + 'samples': [ + 'NA19679_1', + ], + 'reasons': [ + 'Sample NA19679_1 has pedigree sex M but imputed sex F', + ], + }, + '890_1': { + 'samples': [ + 'NA20876_1', + ], + 'reasons': [ + 'Sample NA20876_1 has pedigree sex M but imputed sex F', + ], + }, + 'def_1': { + 'samples': [ + 'NA20885_1', + ], + 'reasons': [ + 'Sample NA20885_1 has pedigree sex M but imputed sex F', + ], + }, + '234_1': { + 'samples': [ + 'NA19678_1', + ], + 'reasons': [ + 'Sample NA19678_1 has pedigree sex F but imputed sex M', + ], + }, + 'bcd_1': { + 'samples': [ + 'NA20878_1', + ], + 'reasons': [ + 'Sample NA20878_1 has pedigree sex F but imputed sex M', + ], + }, + '567_1': { + 'samples': [ + 'NA20872_1', + ], + 'reasons': [ + 'Sample NA20872_1 has pedigree sex F but imputed sex M', + ], + }, + }, + }, + }, + ) diff --git a/v03_pipeline/lib/tasks/write_validation_errors_for_run.py b/v03_pipeline/lib/tasks/write_validation_errors_for_run.py index 9149f6158..d99800f6f 100644 --- a/v03_pipeline/lib/tasks/write_validation_errors_for_run.py +++ b/v03_pipeline/lib/tasks/write_validation_errors_for_run.py @@ -1,8 +1,11 @@ import json +from collections.abc import Callable import luigi +import luigi.freezing import luigi.util +from v03_pipeline.lib.misc.validation import SeqrValidationError from v03_pipeline.lib.paths import validation_errors_for_run_path from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.files import GCSorLocalTarget @@ -12,6 +15,7 @@ class WriteValidationErrorsForRunTask(luigi.Task): project_guids = luigi.ListParameter() error_messages = luigi.ListParameter(default=[]) + failed_family_samples = luigi.DictParameter(default={}) def to_single_error_message(self) -> str: with self.output().open('r') as f: @@ -33,6 +37,36 @@ def run(self) -> None: validation_errors_json = { 'project_guids': self.project_guids, 'error_messages': self.error_messages, + 'failed_family_samples': luigi.freezing.recursively_unfreeze( + self.failed_family_samples, + ), } with self.output().open('w') as f: json.dump(validation_errors_json, f) + + +def with_persisted_validation_errors(f: Callable) -> Callable[[Callable], Callable]: + def wrapper(self: luigi.Task): + try: + return f(self) + except SeqrValidationError as e: + if isinstance( + e.args[1], + object, + ): # TODO: improve type checking with a pydantic model/typed dict + write_validation_errors_for_run_task = self.clone( + WriteValidationErrorsForRunTask, + error_messages=[str(e.args[0])], + failed_family_samples=e.args[1]['failed_family_samples'], + ) + else: + write_validation_errors_for_run_task = self.clone( + WriteValidationErrorsForRunTask, + error_messages=[str(e)], + ) + 
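
# Aside: with_persisted_validation_errors follows the common
# catch/persist/re-raise decorator shape: domain errors are written
# somewhere durable, then the failure is re-raised for the caller. A
# stripped-down standalone version of the pattern (the names and JSON
# layout here are hypothetical, not the pipeline's API):
import functools
import json

class ValidationError(Exception):
    pass

def with_persisted_errors(path: str):
    def decorator(f):
        @functools.wraps(f)
        def wrapper(*args, **kwargs):
            try:
                return f(*args, **kwargs)
            except ValidationError as e:
                # Persist the failure somewhere durable...
                with open(path, 'w') as out:
                    json.dump({'error_messages': [str(e)]}, out)
                # ...then re-raise so the caller still sees it.
                raise
        return wrapper
    return decorator
# --- end sketch ---
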
write_validation_errors_for_run_task.run() + raise SeqrValidationError( + write_validation_errors_for_run_task.to_single_error_message(), + ) from None + + return wrapper diff --git a/v03_pipeline/var/test/pedigrees/test_pedigree_7.tsv b/v03_pipeline/var/test/pedigrees/test_pedigree_7.tsv new file mode 100644 index 000000000..fee3a7458 --- /dev/null +++ b/v03_pipeline/var/test/pedigrees/test_pedigree_7.tsv @@ -0,0 +1,14 @@ +Project_GUID Family_GUID Family_ID Individual_ID Paternal_ID Maternal_ID Sex +R0114_project4 123_1 123 NA19675_1 M +R0114_project4 234_1 234 NA19678_1 F +R0114_project4 345_1 345 NA19679_1 M +R0114_project4 456_1 456 NA20870_1 M +R0114_project4 567_1 567 NA20872_1 F +R0114_project4 678_1 678 NA20874_1 M +R0114_project4 789_1 789 NA20875_1 M +R0114_project4 890_1 890 NA20876_1 M +R0114_project4 901_1 901 NA20877_1 M +R0114_project4 bcd_1 bcd NA20878_1 F +R0114_project4 cde_1 cde NA20881_1 F +R0114_project4 def_1 def NA20885_1 M +R0114_project4 efg_1 efg NA99999_1 F From 25db277c99c4d9631d44c7b5229f613a1d24d26c Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Mon, 6 Jan 2025 13:57:30 -0500 Subject: [PATCH 07/10] feat: Add ability to run tasks dataproc. (#948) * Support gcs dirs in rsync * ws * Add create dataproc cluster task * add dataproc * ruff * requirements * still struggling * Gencode refactor to remove gcs * bump reqs * Run dataproc job * lib * running * merge requirements * Flip'em * Better exception handling * Cleaner approach if less generalizable * write a test * Fix tests * lint * Add test for success * refactor to use a base class... better for adding support for multiple jobs * cleanup * ruff * Fix missing mock * Fix flapping test * pr comments --- requirements.txt | 2 +- .../dataproc/base_run_job_on_dataproc.py | 106 ++++++++++++ .../tasks/dataproc/create_dataproc_cluster.py | 36 ++-- .../dataproc/create_dataproc_cluster_test.py | 12 +- v03_pipeline/lib/tasks/dataproc/misc.py | 21 +++ v03_pipeline/lib/tasks/dataproc/misc_test.py | 58 +++++++ .../write_success_file_on_dataproc.py | 22 +++ .../write_success_file_on_dataproc_test.py | 156 ++++++++++++++++++ 8 files changed, 392 insertions(+), 21 deletions(-) create mode 100644 v03_pipeline/lib/tasks/dataproc/base_run_job_on_dataproc.py create mode 100644 v03_pipeline/lib/tasks/dataproc/misc.py create mode 100644 v03_pipeline/lib/tasks/dataproc/misc_test.py create mode 100644 v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc.py create mode 100644 v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc_test.py diff --git a/requirements.txt b/requirements.txt index 86e9f058b..96f6fb725 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --resolver=backtracking requirements.in +# pip-compile requirements.in # aiodns==2.0.0 # via hail diff --git a/v03_pipeline/lib/tasks/dataproc/base_run_job_on_dataproc.py b/v03_pipeline/lib/tasks/dataproc/base_run_job_on_dataproc.py new file mode 100644 index 000000000..1094d651e --- /dev/null +++ b/v03_pipeline/lib/tasks/dataproc/base_run_job_on_dataproc.py @@ -0,0 +1,106 @@ +import time + +import google.api_core.exceptions +import luigi +from google.cloud import dataproc_v1 as dataproc + +from v03_pipeline.lib.logger import get_logger +from v03_pipeline.lib.model import Env +from v03_pipeline.lib.tasks.base.base_loading_pipeline_params import ( + BaseLoadingPipelineParams, +) +from 
v03_pipeline.lib.tasks.dataproc.create_dataproc_cluster import ( + CreateDataprocClusterTask, +) +from v03_pipeline.lib.tasks.dataproc.misc import get_cluster_name, to_kebab_str_args + +DONE_STATE = 'DONE' +ERROR_STATE = 'ERROR' +SEQR_PIPELINE_RUNNER_BUILD = f'gs://seqr-pipeline-runner-builds/{Env.DEPLOYMENT_TYPE}/{Env.PIPELINE_RUNNER_APP_VERSION}' +TIMEOUT_S = 172800 # 2 days + +logger = get_logger(__name__) + + +@luigi.util.inherits(BaseLoadingPipelineParams) +class BaseRunJobOnDataprocTask(luigi.Task): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.client = dataproc.JobControllerClient( + client_options={ + 'api_endpoint': f'{Env.GCLOUD_REGION}-dataproc.googleapis.com:443', + }, + ) + + @property + def task_name(self): + return self.get_task_family().split('.')[-1] + + @property + def job_id(self): + return f'{self.task_name}-{self.run_id}' + + def requires(self) -> [luigi.Task]: + return [self.clone(CreateDataprocClusterTask)] + + def complete(self) -> bool: + if not self.dataset_type.requires_dataproc: + msg = f'{self.dataset_type} should not require a dataproc job' + raise RuntimeError(msg) + try: + job = self.client.get_job( + request={ + 'project_id': Env.GCLOUD_PROJECT, + 'region': Env.GCLOUD_REGION, + 'job_id': self.job_id, + }, + ) + except google.api_core.exceptions.NotFound: + return False + if job.status.state == ERROR_STATE: + msg = f'Job {self.task_name}-{self.run_id} entered ERROR state' + logger.error(msg) + logger.error(job.status.details) + return job.status.state == DONE_STATE + + def run(self): + operation = self.client.submit_job_as_operation( + request={ + 'project_id': Env.GCLOUD_PROJECT, + 'region': Env.GCLOUD_REGION, + 'job': { + 'reference': { + 'job_id': self.job_id, + }, + 'placement': { + 'cluster_name': get_cluster_name( + self.reference_genome, + self.run_id, + ), + }, + 'pyspark_job': { + 'main_python_file_uri': f'{SEQR_PIPELINE_RUNNER_BUILD}/bin/run_task.py', + 'args': [ + self.task_name, + '--local-scheduler', + *to_kebab_str_args(self), + ], + 'python_file_uris': [ + f'{SEQR_PIPELINE_RUNNER_BUILD}/pyscripts.zip', + ], + }, + }, + }, + ) + wait_s = 0 + while wait_s < TIMEOUT_S: + if operation.done(): + operation.result() # Will throw on failure! 
+ msg = f'Finished {self.job_id}' + logger.info(msg) + break + logger.info( + f'Waiting for job completion {self.job_id}', + ) + time.sleep(3) + wait_s += 3 diff --git a/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster.py b/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster.py index 27693c5d3..6fce572a0 100644 --- a/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster.py +++ b/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster.py @@ -1,5 +1,6 @@ import time +import google.api_core.exceptions import hail as hl import luigi from google.cloud import dataproc_v1 as dataproc @@ -11,13 +12,15 @@ from v03_pipeline.lib.tasks.base.base_loading_pipeline_params import ( BaseLoadingPipelineParams, ) +from v03_pipeline.lib.tasks.dataproc.misc import get_cluster_name -CLUSTER_NAME_PREFIX = 'pipeline-runner' DEBIAN_IMAGE = '2.1.33-debian11' +ERROR_STATE = 'ERROR' HAIL_VERSION = hl.version().split('-')[0] INSTANCE_TYPE = 'n1-highmem-8' PKGS = '|'.join(pip_freeze.freeze()) -SUCCESS_STATE = 'RUNNING' +RUNNING_STATE = 'RUNNING' +TIMEOUT_S = 900 logger = get_logger(__name__) @@ -26,7 +29,7 @@ def get_cluster_config(reference_genome: ReferenceGenome, run_id: str): service_account_credentials = get_service_account_credentials() return { 'project_id': Env.GCLOUD_PROJECT, - 'cluster_name': f'{CLUSTER_NAME_PREFIX}-{reference_genome.value.lower()}-{run_id}', + 'cluster_name': get_cluster_name(reference_genome, run_id), # Schema found at https://cloud.google.com/dataproc/docs/reference/rest/v1/ClusterConfig 'config': { 'gce_cluster_config': { @@ -136,27 +139,32 @@ def __init__(self, *args, **kwargs): # https://cloud.google.com/dataproc/docs/tutorials/python-library-example self.client = dataproc.ClusterControllerClient( client_options={ - 'api_endpoint': f'{Env.GCLOUD_REGION}-dataproc.googleapis.com:443'.format( - Env.GCLOUD_REGION, - ), + 'api_endpoint': f'{Env.GCLOUD_REGION}-dataproc.googleapis.com:443', }, ) def complete(self) -> bool: if not self.dataset_type.requires_dataproc: - return True + msg = f'{self.dataset_type} should not require a dataproc cluster' + raise RuntimeError(msg) try: - client = self.client.get_cluster( + cluster = self.client.get_cluster( request={ 'project_id': Env.GCLOUD_PROJECT, 'region': Env.GCLOUD_REGION, - 'cluster_name': f'{CLUSTER_NAME_PREFIX}-{self.reference_genome.value.lower()}', + 'cluster_name': get_cluster_name( + self.reference_genome, + self.run_id, + ), }, ) - except Exception: # noqa: BLE001 + except google.api_core.exceptions.NotFound: return False - else: - return client.status.state == SUCCESS_STATE + if cluster.status.state == ERROR_STATE: + msg = f'Cluster {cluster.cluster_name} entered ERROR state' + logger.error(msg) + # This will return False when the cluster is "CREATING" + return cluster.status.state == RUNNING_STATE def run(self): operation = self.client.create_cluster( @@ -166,7 +174,8 @@ def run(self): 'cluster': get_cluster_config(self.reference_genome, self.run_id), }, ) - while True: + wait_s = 0 + while wait_s < TIMEOUT_S: if operation.done(): result = operation.result() # Will throw on failure! 
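
# Aside: both run() implementations poll a long-running Google Cloud
# operation with a bounded wait: check done(), sleep a fixed interval,
# give up after a timeout. The same shape in isolation (interval and
# timeout defaults are illustrative; unlike the tasks above, this sketch
# raises on timeout rather than simply stopping):
import time

def wait_for(operation, timeout_s: int = 900, interval_s: int = 3):
    waited_s = 0
    while waited_s < timeout_s:
        if operation.done():
            return operation.result()  # raises if the operation failed
        time.sleep(interval_s)
        waited_s += interval_s
    msg = f'Operation did not complete within {timeout_s}s'
    raise TimeoutError(msg)
# --- end sketch ---
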
msg = f'Created cluster {result.cluster_name} with cluster uuid: {result.cluster_uuid}' @@ -174,3 +183,4 @@ def run(self): break logger.info('Waiting for cluster spinup') time.sleep(3) + wait_s += 3 diff --git a/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster_test.py b/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster_test.py index c7f4f2958..e3fa8f3fa 100644 --- a/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster_test.py +++ b/v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster_test.py @@ -28,15 +28,12 @@ def test_dataset_type_unsupported( mock_cluster_controller: Mock, _: Mock, ) -> None: - worker = luigi.worker.Worker() task = CreateDataprocClusterTask( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.MITO, run_id='1', ) - worker.add(task) - worker.run() - self.assertTrue(task.complete()) + self.assertRaises(RuntimeError, task.complete) def test_spinup_cluster_already_exists_failed( self, @@ -45,7 +42,8 @@ def test_spinup_cluster_already_exists_failed( ) -> None: mock_client = mock_cluster_controller.return_value mock_client.get_cluster.return_value = SimpleNamespace( - status=SimpleNamespace(state='FAILED'), + status=SimpleNamespace(state='ERROR'), + cluster_name='abc', ) mock_client.create_cluster.side_effect = ( google.api_core.exceptions.AlreadyExists('cluster exists') @@ -122,7 +120,7 @@ def test_spinup_cluster_doesnt_exist_success( operation = mock_client.create_cluster.return_value operation.done.side_effect = [False, True] operation.result.return_value = SimpleNamespace( - cluster_name='dataproc-cluster-1', + cluster_name='dataproc-cluster-5', cluster_uuid='12345', ) worker = luigi.worker.Worker() @@ -136,6 +134,6 @@ def test_spinup_cluster_doesnt_exist_success( mock_logger.info.assert_has_calls( [ call('Waiting for cluster spinup'), - call('Created cluster dataproc-cluster-1 with cluster uuid: 12345'), + call('Created cluster dataproc-cluster-5 with cluster uuid: 12345'), ], ) diff --git a/v03_pipeline/lib/tasks/dataproc/misc.py b/v03_pipeline/lib/tasks/dataproc/misc.py new file mode 100644 index 000000000..672f86dcc --- /dev/null +++ b/v03_pipeline/lib/tasks/dataproc/misc.py @@ -0,0 +1,21 @@ +import re + +import luigi + +from v03_pipeline.lib.model import ReferenceGenome + +CLUSTER_NAME_PREFIX = 'pipeline-runner' + + +def get_cluster_name(reference_genome: ReferenceGenome, run_id: str): + return f'{CLUSTER_NAME_PREFIX}-{reference_genome.value.lower()}-{run_id}' + + +def snake_to_kebab_arg(snake_string: str) -> str: + return '--' + re.sub(r'\_', '-', snake_string).lower() + + +def to_kebab_str_args(task: luigi.Task): + return [ + e for k, v in task.to_str_params().items() for e in (snake_to_kebab_arg(k), v) + ] diff --git a/v03_pipeline/lib/tasks/dataproc/misc_test.py b/v03_pipeline/lib/tasks/dataproc/misc_test.py new file mode 100644 index 000000000..335cacbf7 --- /dev/null +++ b/v03_pipeline/lib/tasks/dataproc/misc_test.py @@ -0,0 +1,58 @@ +import unittest +from unittest.mock import Mock, patch + +from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.tasks.dataproc.misc import to_kebab_str_args +from v03_pipeline.lib.tasks.dataproc.write_success_file_on_dataproc import ( + WriteSuccessFileOnDataprocTask, +) + + +@patch( + 'v03_pipeline.lib.tasks.dataproc.base_run_job_on_dataproc.dataproc.JobControllerClient', +) +class MiscTest(unittest.TestCase): + def test_to_kebab_str_args(self, _: Mock): + t = WriteSuccessFileOnDataprocTask( + reference_genome=ReferenceGenome.GRCh38, + 
dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path='test_callset', + project_guids=['R0113_test_project'], + project_remap_paths=['test_remap'], + project_pedigree_paths=['test_pedigree'], + run_id='a_misc_run', + ) + self.assertListEqual( + to_kebab_str_args(t), + [ + '--reference-genome', + 'GRCh38', + '--dataset-type', + 'SNV_INDEL', + '--run-id', + 'a_misc_run', + '--sample-type', + 'WGS', + '--callset-path', + 'test_callset', + '--project-guids', + '["R0113_test_project"]', + '--project-remap-paths', + '["test_remap"]', + '--project-pedigree-paths', + '["test_pedigree"]', + '--ignore-missing-samples-when-remapping', + 'False', + '--skip-check-sex-and-relatedness', + 'False', + '--skip-expect-filters', + 'False', + '--skip-expect-tdr-metrics', + 'False', + '--skip-validation', + 'False', + '--is-new-gcnv-joint-call', + 'False', + ], + ) diff --git a/v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc.py b/v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc.py new file mode 100644 index 000000000..0c48c344f --- /dev/null +++ b/v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc.py @@ -0,0 +1,22 @@ +import luigi + +from v03_pipeline.lib.paths import pipeline_run_success_file_path +from v03_pipeline.lib.tasks.base.base_loading_run_params import ( + BaseLoadingRunParams, +) +from v03_pipeline.lib.tasks.dataproc.base_run_job_on_dataproc import ( + BaseRunJobOnDataprocTask, +) +from v03_pipeline.lib.tasks.files import GCSorLocalTarget + + +@luigi.util.inherits(BaseLoadingRunParams) +class WriteSuccessFileOnDataprocTask(BaseRunJobOnDataprocTask): + def output(self) -> luigi.Target: + return GCSorLocalTarget( + pipeline_run_success_file_path( + self.reference_genome, + self.dataset_type, + self.run_id, + ), + ) diff --git a/v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc_test.py b/v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc_test.py new file mode 100644 index 000000000..62cb4d678 --- /dev/null +++ b/v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc_test.py @@ -0,0 +1,156 @@ +import unittest +from types import SimpleNamespace +from unittest.mock import Mock, call, patch + +import google.api_core.exceptions +import luigi + +from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.tasks.dataproc.write_success_file_on_dataproc import ( + WriteSuccessFileOnDataprocTask, +) +from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask + + +@patch( + 'v03_pipeline.lib.tasks.dataproc.base_run_job_on_dataproc.CreateDataprocClusterTask', +) +@patch( + 'v03_pipeline.lib.tasks.dataproc.base_run_job_on_dataproc.dataproc.JobControllerClient', +) +class WriteSuccessFileOnDataprocTaskTest(unittest.TestCase): + @patch('v03_pipeline.lib.tasks.dataproc.base_run_job_on_dataproc.logger') + def test_job_already_exists_failed( + self, + mock_logger: Mock, + mock_job_controller_client: Mock, + mock_create_dataproc_cluster: Mock, + ) -> None: + mock_create_dataproc_cluster.return_value = MockCompleteTask() + mock_client = mock_job_controller_client.return_value + mock_client.get_job.return_value = SimpleNamespace( + status=SimpleNamespace( + state='ERROR', + details='Google Cloud Dataproc Agent reports job failure. 
If logs are available, they can be found at...', + ), + ) + mock_client.submit_job_as_operation.side_effect = ( + google.api_core.exceptions.AlreadyExists('job exists') + ) + worker = luigi.worker.Worker() + task = WriteSuccessFileOnDataprocTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path='test_callset', + project_guids=['R0113_test_project'], + project_remap_paths=['test_remap'], + project_pedigree_paths=['test_pedigree'], + run_id='manual__2024-04-03', + ) + worker.add(task) + worker.run() + self.assertFalse(task.complete()) + mock_logger.error.assert_has_calls( + [ + call( + 'Job WriteSuccessFileOnDataprocTask-manual__2024-04-03 entered ERROR state', + ), + ], + ) + + def test_job_already_exists_success( + self, + mock_job_controller_client: Mock, + mock_create_dataproc_cluster: Mock, + ) -> None: + mock_create_dataproc_cluster.return_value = MockCompleteTask() + mock_client = mock_job_controller_client.return_value + mock_client.get_job.return_value = SimpleNamespace( + status=SimpleNamespace(state='DONE'), + ) + worker = luigi.worker.Worker() + task = WriteSuccessFileOnDataprocTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path='test_callset', + project_guids=['R0113_test_project'], + project_remap_paths=['test_remap'], + project_pedigree_paths=['test_pedigree'], + run_id='manual__2024-04-04', + ) + worker.add(task) + worker.run() + self.assertTrue(task.complete()) + + @patch('v03_pipeline.lib.tasks.dataproc.base_run_job_on_dataproc.logger') + def test_job_failed( + self, + mock_logger: Mock, + mock_job_controller_client: Mock, + mock_create_dataproc_cluster: Mock, + ) -> None: + mock_create_dataproc_cluster.return_value = MockCompleteTask() + mock_client = mock_job_controller_client.return_value + mock_client.get_job.side_effect = google.api_core.exceptions.NotFound( + 'job not found', + ) + operation = mock_client.submit_job_as_operation.return_value + operation.done.side_effect = [False, True] + operation.result.side_effect = Exception( + 'FailedPrecondition: 400 Job failed with message', + ) + worker = luigi.worker.Worker() + task = WriteSuccessFileOnDataprocTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path='test_callset', + project_guids=['R0113_test_project'], + project_remap_paths=['test_remap'], + project_pedigree_paths=['test_pedigree'], + run_id='manual__2024-04-05', + ) + worker.add(task) + worker.run() + self.assertFalse(task.complete()) + mock_logger.info.assert_has_calls( + [ + call( + 'Waiting for job completion WriteSuccessFileOnDataprocTask-manual__2024-04-05', + ), + ], + ) + + def test_job_success( + self, + mock_job_controller_client: Mock, + mock_create_dataproc_cluster: Mock, + ) -> None: + mock_create_dataproc_cluster.return_value = MockCompleteTask() + mock_client = mock_job_controller_client.return_value + mock_client.get_job.side_effect = [ + google.api_core.exceptions.NotFound( + 'job not found', + ), + SimpleNamespace( + status=SimpleNamespace(state='DONE'), + ), + ] + operation = mock_client.submit_job_as_operation.return_value + operation.done.side_effect = [False, True] + worker = luigi.worker.Worker() + task = WriteSuccessFileOnDataprocTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path='test_callset', + 
project_guids=['R0113_test_project'], + project_remap_paths=['test_remap'], + project_pedigree_paths=['test_pedigree'], + run_id='manual__2024-04-06', + ) + worker.add(task) + worker.run() + self.assertTrue(task.complete()) From db64b2d12719ddeabe40ad12a78055cc13bc31e6 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 10 Jan 2025 14:02:57 -0500 Subject: [PATCH 08/10] feat: Add rsync task between hail search data/reference data stored in gcs and on local disk. (#1006) * Support gcs dirs in rsync * ws * Add create dataproc cluster task * add dataproc * ruff * requirements * still struggling * Gencode refactor to remove gcs * bump reqs * Run dataproc job * lib * running * merge requirements * Flip'em * Better exception handling * Cleaner approach if less generalizable * write a test * Fix tests * lint * Add test for success * refactor to use a base class... better for adding support for multiple jobs * cleanup * ruff * Fix missing mock * Fix flapping test * first commit * Finish test and cleanup * Allow any order --- v03_pipeline/lib/model/environment.py | 4 + v03_pipeline/lib/paths.py | 62 ++++------- .../reference_datasets/reference_dataset.py | 15 +++ .../tasks/dataproc/rsync_to_seqr_app_dirs.py | 95 ++++++++++++++++ .../dataproc/rsync_to_seqr_app_dirs_test.py | 101 ++++++++++++++++++ 5 files changed, 237 insertions(+), 40 deletions(-) create mode 100644 v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs.py create mode 100644 v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs_test.py diff --git a/v03_pipeline/lib/model/environment.py b/v03_pipeline/lib/model/environment.py index 42ae4c658..72c8c5a45 100644 --- a/v03_pipeline/lib/model/environment.py +++ b/v03_pipeline/lib/model/environment.py @@ -50,6 +50,8 @@ GCLOUD_ZONE = os.environ.get('GCLOUD_ZONE') GCLOUD_REGION = os.environ.get('GCLOUD_REGION') PIPELINE_RUNNER_APP_VERSION = os.environ.get('PIPELINE_RUNNER_APP_VERSION', 'latest') +SEQR_APP_HAIL_SEARCH_DATA_DIR = os.environ.get('SEQR_APP_HAIL_SEARCH_DATA_DIR') +SEQR_APP_REFERENCE_DATASETS_DIR = os.environ.get('SEQR_APP_REFERENCE_DATASETS_DIR') @dataclass @@ -71,4 +73,6 @@ class Env: PIPELINE_RUNNER_APP_VERSION: str = PIPELINE_RUNNER_APP_VERSION PRIVATE_REFERENCE_DATASETS_DIR: str = PRIVATE_REFERENCE_DATASETS_DIR REFERENCE_DATASETS_DIR: str = REFERENCE_DATASETS_DIR + SEQR_APP_HAIL_SEARCH_DATA_DIR: str | None = SEQR_APP_HAIL_SEARCH_DATA_DIR + SEQR_APP_REFERENCE_DATASETS_DIR: str | None = SEQR_APP_REFERENCE_DATASETS_DIR VEP_REFERENCE_DATASETS_DIR: str = VEP_REFERENCE_DATASETS_DIR diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py index 2942b1dae..87aa024cf 100644 --- a/v03_pipeline/lib/paths.py +++ b/v03_pipeline/lib/paths.py @@ -17,7 +17,7 @@ ) -def _pipeline_prefix( +def pipeline_prefix( root: str, reference_genome: ReferenceGenome, dataset_type: DatasetType, @@ -36,38 +36,15 @@ def _pipeline_prefix( ) -def _v03_reference_data_prefix( - access_control: AccessControl, - reference_genome: ReferenceGenome, - dataset_type: DatasetType, -) -> str: - root = ( - Env.PRIVATE_REFERENCE_DATASETS_DIR - if access_control == AccessControl.PRIVATE - else Env.REFERENCE_DATASETS_DIR - ) - if FeatureFlag.INCLUDE_PIPELINE_VERSION_IN_PREFIX: - return os.path.join( - root, - PipelineVersion.V03.value, - reference_genome.value, - dataset_type.value, - ) - return os.path.join( - root, - reference_genome.value, - dataset_type.value, - ) - - def _v03_reference_dataset_prefix( + root: str, access_control: AccessControl, reference_genome: ReferenceGenome, ) -> str: root = 
( Env.PRIVATE_REFERENCE_DATASETS_DIR if access_control == AccessControl.PRIVATE - else Env.REFERENCE_DATASETS_DIR + else root ) if FeatureFlag.INCLUDE_PIPELINE_VERSION_IN_PREFIX: return os.path.join( @@ -88,7 +65,7 @@ def family_table_path( family_guid: str, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.HAIL_SEARCH_DATA_DIR, reference_genome, dataset_type, @@ -104,7 +81,7 @@ def tdr_metrics_dir( dataset_type: DatasetType, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.LOADING_DATASETS_DIR, reference_genome, dataset_type, @@ -130,7 +107,7 @@ def imported_callset_path( callset_path: str, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.LOADING_DATASETS_DIR, reference_genome, dataset_type, @@ -177,7 +154,7 @@ def project_table_path( project_guid: str, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.HAIL_SEARCH_DATA_DIR, reference_genome, dataset_type, @@ -194,7 +171,7 @@ def relatedness_check_table_path( callset_path: str, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.LOADING_DATASETS_DIR, reference_genome, dataset_type, @@ -210,7 +187,7 @@ def relatedness_check_tsv_path( callset_path: str, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.LOADING_DATASETS_DIR, reference_genome, dataset_type, @@ -227,7 +204,7 @@ def remapped_and_subsetted_callset_path( project_guid: str, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.LOADING_DATASETS_DIR, reference_genome, dataset_type, @@ -243,7 +220,7 @@ def lookup_table_path( dataset_type: DatasetType, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.HAIL_SEARCH_DATA_DIR, reference_genome, dataset_type, @@ -257,7 +234,7 @@ def runs_path( dataset_type: DatasetType, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.HAIL_SEARCH_DATA_DIR, reference_genome, dataset_type, @@ -272,7 +249,7 @@ def sex_check_table_path( callset_path: str, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.LOADING_DATASETS_DIR, reference_genome, dataset_type, @@ -306,6 +283,7 @@ def valid_reference_dataset_path( ) -> str | None: return os.path.join( _v03_reference_dataset_prefix( + Env.REFERENCE_DATASETS_DIR, reference_dataset.access_control, reference_genome, ), @@ -318,9 +296,13 @@ def valid_reference_dataset_query_path( reference_genome: ReferenceGenome, dataset_type: DatasetType, reference_dataset_query: ReferenceDatasetQuery, + root=None, ) -> str | None: + if not root: + root = Env.REFERENCE_DATASETS_DIR return os.path.join( _v03_reference_dataset_prefix( + root, reference_dataset_query.access_control, reference_genome, ), @@ -334,7 +316,7 @@ def variant_annotations_table_path( dataset_type: DatasetType, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.HAIL_SEARCH_DATA_DIR, reference_genome, dataset_type, @@ -348,7 +330,7 @@ def variant_annotations_vcf_path( dataset_type: DatasetType, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.HAIL_SEARCH_DATA_DIR, reference_genome, dataset_type, @@ -386,7 +368,7 @@ def project_remap_path( project_guid: str, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.LOADING_DATASETS_DIR, reference_genome, dataset_type, @@ -404,7 +386,7 @@ def project_pedigree_path( project_guid: str, ) -> str: return os.path.join( - _pipeline_prefix( + pipeline_prefix( Env.LOADING_DATASETS_DIR, reference_genome, dataset_type, diff --git 
a/v03_pipeline/lib/reference_datasets/reference_dataset.py b/v03_pipeline/lib/reference_datasets/reference_dataset.py index 445e191aa..0ca80f8cb 100644 --- a/v03_pipeline/lib/reference_datasets/reference_dataset.py +++ b/v03_pipeline/lib/reference_datasets/reference_dataset.py @@ -166,6 +166,21 @@ class ReferenceDatasetQuery(BaseReferenceDataset, str, Enum): clinvar_path_variants = 'clinvar_path_variants' high_af_variants = 'high_af_variants' + @classmethod + def for_reference_genome_dataset_type( + cls, + reference_genome: ReferenceGenome, + dataset_type: DatasetType, + ) -> set['ReferenceDatasetQuery']: + return { + dataset + for dataset in super().for_reference_genome_dataset_type( + reference_genome, + dataset_type, + ) + if isinstance(dataset, cls) + } + @property def requires(self) -> ReferenceDataset: return { diff --git a/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs.py b/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs.py new file mode 100644 index 000000000..1f5e2e48c --- /dev/null +++ b/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs.py @@ -0,0 +1,95 @@ +import os +import subprocess + +import luigi + +from v03_pipeline.lib.model import Env +from v03_pipeline.lib.paths import pipeline_prefix, valid_reference_dataset_query_path +from v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDatasetQuery +from v03_pipeline.lib.tasks.base.base_loading_run_params import ( + BaseLoadingRunParams, +) + + +def hail_search_value(value: str) -> str: + return value.replace('SV', 'SV_WGS').replace( + 'GCNV', + 'SV_WES', + ) + + +def rsync_command(src_path: str, dst_path: str) -> list[str]: + return [ + '/bin/bash', + '-cx', + f'mkdir -p {dst_path} && gsutil -qm rsync -rd -x .*runs.* {src_path} {dst_path} && sync {dst_path}', + ] + + +@luigi.util.inherits(BaseLoadingRunParams) +class RsyncToSeqrAppDirsTask(luigi.Task): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.done = False + + def output(self) -> None: + return None + + def complete(self) -> bool: + return self.done + + def run(self) -> None: + if not ( + Env.SEQR_APP_HAIL_SEARCH_DATA_DIR and Env.SEQR_APP_REFERENCE_DATASETS_DIR + ): + self.done = True + return + + if not ( + Env.HAIL_SEARCH_DATA_DIR.startswith('gs://') + and Env.REFERENCE_DATASETS_DIR.startswith('gs://') + ): + msg = 'Overridden HAIL_SEARCH_DATA_DIR and REFERENCE_DATASETS_DIR must be Google Cloud buckets.' 
+ raise RuntimeError(msg) + + # Sync Pipeline Tables + src_path = pipeline_prefix( + Env.HAIL_SEARCH_DATA_DIR, + self.reference_genome, + self.dataset_type, + ) + dst_path = hail_search_value( + pipeline_prefix( + Env.SEQR_APP_HAIL_SEARCH_DATA_DIR, + self.reference_genome, + self.dataset_type, + ), + ) + subprocess.call( + rsync_command(src_path, dst_path), # noqa: S603 + ) + + # Sync RDQs + for query in ReferenceDatasetQuery.for_reference_genome_dataset_type( + self.reference_genome, + self.dataset_type, + ): + src_path = valid_reference_dataset_query_path( + self.reference_genome, + self.dataset_type, + query, + ) + dst_path = os.path.join( + hail_search_value( + valid_reference_dataset_query_path( + self.reference_genome, + self.dataset_type, + query, + Env.SEQR_APP_REFERENCE_DATASETS_DIR, + ), + ), + ) + subprocess.call( + rsync_command(src_path, dst_path), # noqa: S603 + ) + self.done = True diff --git a/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs_test.py b/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs_test.py new file mode 100644 index 000000000..225cfa93c --- /dev/null +++ b/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs_test.py @@ -0,0 +1,101 @@ +import unittest +from unittest.mock import Mock, call, patch + +import luigi + +from v03_pipeline.lib.model import ( + DatasetType, + Env, + FeatureFlag, + ReferenceGenome, + SampleType, +) +from v03_pipeline.lib.tasks.dataproc.rsync_to_seqr_app_dirs import ( + RsyncToSeqrAppDirsTask, +) + + +class RsyncToSeqrAppDirsTaskTest(unittest.TestCase): + @patch('v03_pipeline.lib.tasks.dataproc.rsync_to_seqr_app_dirs.subprocess') + def test_rsync_to_seqr_app_dirs_no_sync( + self, + mock_subprocess: Mock, + ) -> None: + worker = luigi.worker.Worker() + task = RsyncToSeqrAppDirsTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path='test_callset', + project_guids=['R0113_test_project'], + project_remap_paths=['test_remap'], + project_pedigree_paths=['test_pedigree'], + run_id='manual__2024-04-01', + ) + worker.add(task) + worker.run() + self.assertTrue(task.complete()) + mock_subprocess.call.assert_not_called() + + @patch('v03_pipeline.lib.tasks.dataproc.rsync_to_seqr_app_dirs.subprocess') + @patch.object(Env, 'HAIL_SEARCH_DATA_DIR', 'gs://test-hail-search-dir') + @patch.object(Env, 'REFERENCE_DATASETS_DIR', 'gs://test-reference-data-dir') + @patch.object( + Env, + 'SEQR_APP_HAIL_SEARCH_DATA_DIR', + '/var/seqr/seqr-hail-search-data', + ) + @patch.object( + Env, + 'SEQR_APP_REFERENCE_DATASETS_DIR', + '/var/seqr/seqr-reference-data', + ) + @patch.object( + FeatureFlag, + 'INCLUDE_PIPELINE_VERSION_IN_PREFIX', + False, + ) + def test_rsync_to_seqr_app_dirs_sync( + self, + mock_subprocess: Mock, + ) -> None: + worker = luigi.worker.Worker() + task = RsyncToSeqrAppDirsTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path='test_callset', + project_guids=['R0113_test_project'], + project_remap_paths=['test_remap'], + project_pedigree_paths=['test_pedigree'], + run_id='manual__2024-04-02', + ) + worker.add(task) + worker.run() + self.assertTrue(task.complete()) + mock_subprocess.call.assert_has_calls( + [ + call( + [ + '/bin/bash', + '-cx', + 'mkdir -p /var/seqr/seqr-hail-search-data/GRCh38/SNV_INDEL && gsutil -qm rsync -rd -x .*runs.* gs://test-hail-search-dir/GRCh38/SNV_INDEL /var/seqr/seqr-hail-search-data/GRCh38/SNV_INDEL && sync 
diff --git a/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs_test.py b/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs_test.py
new file mode 100644
index 000000000..225cfa93c
--- /dev/null
+++ b/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs_test.py
@@ -0,0 +1,101 @@
+import unittest
+from unittest.mock import Mock, call, patch
+
+import luigi
+
+from v03_pipeline.lib.model import (
+    DatasetType,
+    Env,
+    FeatureFlag,
+    ReferenceGenome,
+    SampleType,
+)
+from v03_pipeline.lib.tasks.dataproc.rsync_to_seqr_app_dirs import (
+    RsyncToSeqrAppDirsTask,
+)
+
+
+class RsyncToSeqrAppDirsTaskTest(unittest.TestCase):
+    @patch('v03_pipeline.lib.tasks.dataproc.rsync_to_seqr_app_dirs.subprocess')
+    def test_rsync_to_seqr_app_dirs_no_sync(
+        self,
+        mock_subprocess: Mock,
+    ) -> None:
+        worker = luigi.worker.Worker()
+        task = RsyncToSeqrAppDirsTask(
+            reference_genome=ReferenceGenome.GRCh38,
+            dataset_type=DatasetType.SNV_INDEL,
+            sample_type=SampleType.WGS,
+            callset_path='test_callset',
+            project_guids=['R0113_test_project'],
+            project_remap_paths=['test_remap'],
+            project_pedigree_paths=['test_pedigree'],
+            run_id='manual__2024-04-01',
+        )
+        worker.add(task)
+        worker.run()
+        self.assertTrue(task.complete())
+        mock_subprocess.call.assert_not_called()
+
+    @patch('v03_pipeline.lib.tasks.dataproc.rsync_to_seqr_app_dirs.subprocess')
+    @patch.object(Env, 'HAIL_SEARCH_DATA_DIR', 'gs://test-hail-search-dir')
+    @patch.object(Env, 'REFERENCE_DATASETS_DIR', 'gs://test-reference-data-dir')
+    @patch.object(
+        Env,
+        'SEQR_APP_HAIL_SEARCH_DATA_DIR',
+        '/var/seqr/seqr-hail-search-data',
+    )
+    @patch.object(
+        Env,
+        'SEQR_APP_REFERENCE_DATASETS_DIR',
+        '/var/seqr/seqr-reference-data',
+    )
+    @patch.object(
+        FeatureFlag,
+        'INCLUDE_PIPELINE_VERSION_IN_PREFIX',
+        False,
+    )
+    def test_rsync_to_seqr_app_dirs_sync(
+        self,
+        mock_subprocess: Mock,
+    ) -> None:
+        worker = luigi.worker.Worker()
+        task = RsyncToSeqrAppDirsTask(
+            reference_genome=ReferenceGenome.GRCh38,
+            dataset_type=DatasetType.SNV_INDEL,
+            sample_type=SampleType.WGS,
+            callset_path='test_callset',
+            project_guids=['R0113_test_project'],
+            project_remap_paths=['test_remap'],
+            project_pedigree_paths=['test_pedigree'],
+            run_id='manual__2024-04-02',
+        )
+        worker.add(task)
+        worker.run()
+        self.assertTrue(task.complete())
+        mock_subprocess.call.assert_has_calls(
+            [
+                call(
+                    [
+                        '/bin/bash',
+                        '-cx',
+                        'mkdir -p /var/seqr/seqr-hail-search-data/GRCh38/SNV_INDEL && gsutil -qm rsync -rd -x .*runs.* gs://test-hail-search-dir/GRCh38/SNV_INDEL /var/seqr/seqr-hail-search-data/GRCh38/SNV_INDEL && sync /var/seqr/seqr-hail-search-data/GRCh38/SNV_INDEL',
+                    ],
+                ),
+                call(
+                    [
+                        '/bin/bash',
+                        '-cx',
+                        'mkdir -p /var/seqr/seqr-reference-data/GRCh38/SNV_INDEL/high_af_variants.ht && gsutil -qm rsync -rd -x .*runs.* gs://test-reference-data-dir/GRCh38/SNV_INDEL/high_af_variants.ht /var/seqr/seqr-reference-data/GRCh38/SNV_INDEL/high_af_variants.ht && sync /var/seqr/seqr-reference-data/GRCh38/SNV_INDEL/high_af_variants.ht',
+                    ],
+                ),
+                call(
+                    [
+                        '/bin/bash',
+                        '-cx',
+                        'mkdir -p /var/seqr/seqr-reference-data/GRCh38/SNV_INDEL/clinvar_path_variants.ht && gsutil -qm rsync -rd -x .*runs.* gs://test-reference-data-dir/GRCh38/SNV_INDEL/clinvar_path_variants.ht /var/seqr/seqr-reference-data/GRCh38/SNV_INDEL/clinvar_path_variants.ht && sync /var/seqr/seqr-reference-data/GRCh38/SNV_INDEL/clinvar_path_variants.ht',
+                    ],
+                ),
+            ],
+            any_order=True,
+        )
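The test patches class attributes on `Env` directly rather than setting environment variables, presumably because `Env` resolves `os.environ` at import time. A minimal, self-contained version of that `patch.object` pattern (`Config` here is a hypothetical stand-in for the pipeline's `Env` class):

```python
import unittest
from unittest.mock import patch


class Config:
    HAIL_SEARCH_DATA_DIR = 'gs://prod-bucket'


class ConfigPatchTest(unittest.TestCase):
    @patch.object(Config, 'HAIL_SEARCH_DATA_DIR', '/tmp/test-dir')
    def test_patched_value(self):
        # The attribute is swapped only for the duration of this test.
        self.assertEqual(Config.HAIL_SEARCH_DATA_DIR, '/tmp/test-dir')


if __name__ == '__main__':
    unittest.main()
```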
From c56aa8f8b0335a7b6f81a327faaac83feca8cc0b Mon Sep 17 00:00:00 2001
From: Benjamin Blankenmeister
Date: Fri, 10 Jan 2025 15:45:16 -0500
Subject: [PATCH 09/10] refactor + chore: shuffle end of pipeline steps to
 enable dataproc for local users (#1009)

* Support gcs dirs in rsync
* ws
* Add create dataproc cluster task
* add dataproc
* ruff
* requirements
* still struggling
* Gencode refactor to remove gcs
* bump reqs
* Run dataproc job
* lib
* running
* merge requirements
* Flip'em
* Better exception handling
* Cleaner approach if less generalizable
* write a test
* Fix tests
* lint
* Add test for success
* refactor to use a base class... better for adding support for multiple jobs
* cleanup
* ruff
* Fix missing mock
* Fix flapping test
* first commit
* Finish test and cleanup
* Allow any order
* First commit
* ruff
* ruff
* finish off
* A few minor tweaks
---
 v03_pipeline/lib/model/feature_flag.py        |  2 ++
 v03_pipeline/lib/tasks/__init__.py            |  2 ++
 .../dataproc/base_run_job_on_dataproc.py      | 10 +++---
 v03_pipeline/lib/tasks/dataproc/misc_test.py  |  6 ++--
 .../tasks/dataproc/rsync_to_seqr_app_dirs.py  |  6 ++++
 .../dataproc/rsync_to_seqr_app_dirs_test.py   | 11 +++++++
 .../dataproc/run_pipeline_on_dataproc.py      | 16 ++++++++++
 ...st.py => run_pipeline_on_dataproc_test.py} | 16 +++++-----
 .../write_success_file_on_dataproc.py         | 22 -------------
 v03_pipeline/lib/tasks/run_pipeline.py        | 32 +++++++++++++++++++
 v03_pipeline/lib/tasks/write_success_file.py  | 31 +++++++-----------
 .../lib/tasks/write_success_file_test.py      | 17 ++--------
 12 files changed, 99 insertions(+), 72 deletions(-)
 create mode 100644 v03_pipeline/lib/tasks/dataproc/run_pipeline_on_dataproc.py
 rename v03_pipeline/lib/tasks/dataproc/{write_success_file_on_dataproc_test.py => run_pipeline_on_dataproc_test.py} (91%)
 delete mode 100644 v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc.py
 create mode 100644 v03_pipeline/lib/tasks/run_pipeline.py

diff --git a/v03_pipeline/lib/model/feature_flag.py b/v03_pipeline/lib/model/feature_flag.py
index 0b57e8643..8a033f60f 100644
--- a/v03_pipeline/lib/model/feature_flag.py
+++ b/v03_pipeline/lib/model/feature_flag.py
@@ -11,6 +11,7 @@
 INCLUDE_PIPELINE_VERSION_IN_PREFIX = (
     os.environ.get('INCLUDE_PIPELINE_VERSION_IN_PREFIX') == '1'
 )
+RUN_PIPELINE_ON_DATAPROC = os.environ.get('RUN_PIPELINE_ON_DATAPROC') == '1'
 SHOULD_TRIGGER_HAIL_BACKEND_RELOAD = (
     os.environ.get('SHOULD_TRIGGER_HAIL_BACKEND_RELOAD') == '1'
 )
@@ -23,4 +24,5 @@ class FeatureFlag:
     EXPECT_TDR_METRICS: bool = EXPECT_TDR_METRICS
     EXPECT_WES_FILTERS: bool = EXPECT_WES_FILTERS
     INCLUDE_PIPELINE_VERSION_IN_PREFIX: bool = INCLUDE_PIPELINE_VERSION_IN_PREFIX
+    RUN_PIPELINE_ON_DATAPROC: bool = RUN_PIPELINE_ON_DATAPROC
     SHOULD_TRIGGER_HAIL_BACKEND_RELOAD: bool = SHOULD_TRIGGER_HAIL_BACKEND_RELOAD
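The new flag follows the module's existing convention: the environment variable is read once at import time, and only the literal string '1' enables it. A runnable sketch of the same pattern outside the pipeline:

```python
import os

# Read once at import time; anything other than the exact string '1' is False.
RUN_PIPELINE_ON_DATAPROC = os.environ.get('RUN_PIPELINE_ON_DATAPROC') == '1'


class FeatureFlag:
    RUN_PIPELINE_ON_DATAPROC: bool = RUN_PIPELINE_ON_DATAPROC


print(FeatureFlag.RUN_PIPELINE_ON_DATAPROC)
```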
diff --git a/v03_pipeline/lib/tasks/__init__.py b/v03_pipeline/lib/tasks/__init__.py
index 1b986f668..76832c5c5 100644
--- a/v03_pipeline/lib/tasks/__init__.py
+++ b/v03_pipeline/lib/tasks/__init__.py
@@ -11,6 +11,7 @@
 from v03_pipeline.lib.tasks.reference_data.update_variant_annotations_table_with_updated_reference_dataset import (
     UpdateVariantAnnotationsTableWithUpdatedReferenceDataset,
 )
+from v03_pipeline.lib.tasks.run_pipeline import RunPipelineTask
 from v03_pipeline.lib.tasks.update_lookup_table import (
     UpdateLookupTableTask,
 )
@@ -46,6 +47,7 @@
     'DeleteProjectTablesTask',
     'MigrateAllLookupTablesTask',
     'MigrateAllVariantAnnotationsTablesTask',
+    'RunPipelineTask',
     'UpdateProjectTableTask',
     'UpdateProjectTablesWithDeletedFamiliesTask',
     'UpdateLookupTableTask',
diff --git a/v03_pipeline/lib/tasks/dataproc/base_run_job_on_dataproc.py b/v03_pipeline/lib/tasks/dataproc/base_run_job_on_dataproc.py
index 1094d651e..b8162392e 100644
--- a/v03_pipeline/lib/tasks/dataproc/base_run_job_on_dataproc.py
+++ b/v03_pipeline/lib/tasks/dataproc/base_run_job_on_dataproc.py
@@ -33,12 +33,12 @@ def __init__(self, *args, **kwargs):
         )

     @property
-    def task_name(self):
-        return self.get_task_family().split('.')[-1]
+    def task(self):
+        raise NotImplementedError

     @property
     def job_id(self):
-        return f'{self.task_name}-{self.run_id}'
+        return f'{self.task.task_family}-{self.run_id}'

     def requires(self) -> [luigi.Task]:
         return [self.clone(CreateDataprocClusterTask)]
@@ -58,7 +58,7 @@ def complete(self) -> bool:
         except google.api_core.exceptions.NotFound:
             return False
         if job.status.state == ERROR_STATE:
-            msg = f'Job {self.task_name}-{self.run_id} entered ERROR state'
+            msg = f'Job {self.task.task_family}-{self.run_id} entered ERROR state'
             logger.error(msg)
             logger.error(job.status.details)
         return job.status.state == DONE_STATE
@@ -81,7 +81,7 @@ def run(self):
                 'pyspark_job': {
                     'main_python_file_uri': f'{SEQR_PIPELINE_RUNNER_BUILD}/bin/run_task.py',
                     'args': [
-                        self.task_name,
+                        self.task.task_family,
                         '--local-scheduler',
                         *to_kebab_str_args(self),
                     ],
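The refactor replaces the string-derived `task_name` with a `task` property that returns the Luigi task class to run, so both the Dataproc job id and the pyspark entrypoint arguments come from the wrapped task's `task_family` rather than from the wrapper's own class name. A toy version of that shape (the class names here are illustrative, not the pipeline's API):

```python
import luigi


class BaseRunJobTask(luigi.Task):
    run_id = luigi.Parameter()

    @property
    def task(self) -> type[luigi.Task]:
        # Subclasses say which task class the Dataproc job should run.
        raise NotImplementedError

    @property
    def job_id(self) -> str:
        # task_family is luigi's canonical name for a task class.
        return f'{self.task.task_family}-{self.run_id}'


class RunPipelineTask(luigi.Task):
    run_id = luigi.Parameter()


class RunPipelineOnDataprocTask(BaseRunJobTask):
    @property
    def task(self) -> type[luigi.Task]:
        return RunPipelineTask


print(RunPipelineOnDataprocTask(run_id='manual__2024-04-03').job_id)
# RunPipelineTask-manual__2024-04-03
```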
diff --git a/v03_pipeline/lib/tasks/dataproc/misc_test.py b/v03_pipeline/lib/tasks/dataproc/misc_test.py
index 335cacbf7..22729bedc 100644
--- a/v03_pipeline/lib/tasks/dataproc/misc_test.py
+++ b/v03_pipeline/lib/tasks/dataproc/misc_test.py
@@ -3,8 +3,8 @@

 from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
 from v03_pipeline.lib.tasks.dataproc.misc import to_kebab_str_args
-from v03_pipeline.lib.tasks.dataproc.write_success_file_on_dataproc import (
-    WriteSuccessFileOnDataprocTask,
+from v03_pipeline.lib.tasks.dataproc.rsync_to_seqr_app_dirs import (
+    RsyncToSeqrAppDirsTask,
 )
@@ -13,7 +13,7 @@
 )
 class MiscTest(unittest.TestCase):
     def test_to_kebab_str_args(self, _: Mock):
-        t = WriteSuccessFileOnDataprocTask(
+        t = RsyncToSeqrAppDirsTask(
             reference_genome=ReferenceGenome.GRCh38,
             dataset_type=DatasetType.SNV_INDEL,
             sample_type=SampleType.WGS,
diff --git a/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs.py b/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs.py
index 1f5e2e48c..194f27611 100644
--- a/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs.py
+++ b/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs.py
@@ -9,6 +9,9 @@
 from v03_pipeline.lib.tasks.base.base_loading_run_params import (
     BaseLoadingRunParams,
 )
+from v03_pipeline.lib.tasks.dataproc.run_pipeline_on_dataproc import (
+    RunPipelineOnDataprocTask,
+)


 def hail_search_value(value: str) -> str:
@@ -38,6 +41,9 @@ def output(self) -> None:
     def complete(self) -> bool:
         return self.done

+    def requires(self) -> luigi.Task:
+        return self.clone(RunPipelineOnDataprocTask)
+
     def run(self) -> None:
         if not (
             Env.SEQR_APP_HAIL_SEARCH_DATA_DIR and Env.SEQR_APP_REFERENCE_DATASETS_DIR
diff --git a/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs_test.py b/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs_test.py
index 225cfa93c..e4894e1db 100644
--- a/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs_test.py
+++ b/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs_test.py
@@ -13,14 +13,20 @@
 from v03_pipeline.lib.tasks.dataproc.rsync_to_seqr_app_dirs import (
     RsyncToSeqrAppDirsTask,
 )
+from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask


 class RsyncToSeqrAppDirsTaskTest(unittest.TestCase):
+    @patch(
+        'v03_pipeline.lib.tasks.dataproc.rsync_to_seqr_app_dirs.RunPipelineOnDataprocTask',
+    )
     @patch('v03_pipeline.lib.tasks.dataproc.rsync_to_seqr_app_dirs.subprocess')
     def test_rsync_to_seqr_app_dirs_no_sync(
         self,
         mock_subprocess: Mock,
+        mock_run_pipeline_task: Mock,
     ) -> None:
+        mock_run_pipeline_task.return_value = MockCompleteTask()
         worker = luigi.worker.Worker()
         task = RsyncToSeqrAppDirsTask(
             reference_genome=ReferenceGenome.GRCh38,
@@ -37,6 +43,9 @@ def test_rsync_to_seqr_app_dirs_no_sync(
         self.assertTrue(task.complete())
         mock_subprocess.call.assert_not_called()

+    @patch(
+        'v03_pipeline.lib.tasks.dataproc.rsync_to_seqr_app_dirs.RunPipelineOnDataprocTask',
+    )
     @patch('v03_pipeline.lib.tasks.dataproc.rsync_to_seqr_app_dirs.subprocess')
     @patch.object(Env, 'HAIL_SEARCH_DATA_DIR', 'gs://test-hail-search-dir')
     @patch.object(Env, 'REFERENCE_DATASETS_DIR', 'gs://test-reference-data-dir')
@@ -58,7 +67,9 @@ def test_rsync_to_seqr_app_dirs_sync(
         self,
         mock_subprocess: Mock,
+        mock_run_pipeline_task: Mock,
     ) -> None:
+        mock_run_pipeline_task.return_value = MockCompleteTask()
         worker = luigi.worker.Worker()
         task = RsyncToSeqrAppDirsTask(
             reference_genome=ReferenceGenome.GRCh38,
diff --git a/v03_pipeline/lib/tasks/dataproc/run_pipeline_on_dataproc.py b/v03_pipeline/lib/tasks/dataproc/run_pipeline_on_dataproc.py
new file mode 100644
index 000000000..15ee74fb4
--- /dev/null
+++ b/v03_pipeline/lib/tasks/dataproc/run_pipeline_on_dataproc.py
@@ -0,0 +1,16 @@
+import luigi
+
+from v03_pipeline.lib.tasks.base.base_loading_run_params import (
+    BaseLoadingRunParams,
+)
+from v03_pipeline.lib.tasks.dataproc.base_run_job_on_dataproc import (
+    BaseRunJobOnDataprocTask,
+)
+from v03_pipeline.lib.tasks.run_pipeline import RunPipelineTask
+
+
+@luigi.util.inherits(BaseLoadingRunParams)
+class RunPipelineOnDataprocTask(BaseRunJobOnDataprocTask):
+    @property
+    def task(self) -> luigi.Task:
+        return RunPipelineTask
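The new task above is what `RsyncToSeqrAppDirsTask` now requires, wired through `self.clone(...)`. A small runnable reminder of what `clone()` does with shared parameters:

```python
import luigi


class DownstreamTask(luigi.Task):
    run_id = luigi.Parameter()


class UpstreamTask(luigi.Task):
    run_id = luigi.Parameter()

    def requires(self):
        # clone() re-instantiates the target task with this task's matching
        # parameter values, so run_id does not need to be repeated by hand.
        return self.clone(DownstreamTask)


task = UpstreamTask(run_id='manual__2024-04-01')
print(task.requires().run_id)  # manual__2024-04-01
```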
diff --git a/v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc_test.py b/v03_pipeline/lib/tasks/dataproc/run_pipeline_on_dataproc_test.py
similarity index 91%
rename from v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc_test.py
rename to v03_pipeline/lib/tasks/dataproc/run_pipeline_on_dataproc_test.py
index 62cb4d678..abde6833f 100644
--- a/v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc_test.py
+++ b/v03_pipeline/lib/tasks/dataproc/run_pipeline_on_dataproc_test.py
@@ -6,8 +6,8 @@
 import luigi

 from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
-from v03_pipeline.lib.tasks.dataproc.write_success_file_on_dataproc import (
-    WriteSuccessFileOnDataprocTask,
+from v03_pipeline.lib.tasks.dataproc.run_pipeline_on_dataproc import (
+    RunPipelineOnDataprocTask,
 )
 from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask
@@ -38,7 +38,7 @@ def test_job_already_exists_failed(
             google.api_core.exceptions.AlreadyExists('job exists')
         )
         worker = luigi.worker.Worker()
-        task = WriteSuccessFileOnDataprocTask(
+        task = RunPipelineOnDataprocTask(
             reference_genome=ReferenceGenome.GRCh38,
             dataset_type=DatasetType.SNV_INDEL,
             sample_type=SampleType.WGS,
@@ -54,7 +54,7 @@ def test_job_already_exists_failed(
         mock_logger.error.assert_has_calls(
             [
                 call(
-                    'Job WriteSuccessFileOnDataprocTask-manual__2024-04-03 entered ERROR state',
+                    'Job RunPipelineTask-manual__2024-04-03 entered ERROR state',
                 ),
             ],
         )
@@ -70,7 +70,7 @@ def test_job_already_exists_success(
             status=SimpleNamespace(state='DONE'),
         )
         worker = luigi.worker.Worker()
-        task = WriteSuccessFileOnDataprocTask(
+        task = RunPipelineOnDataprocTask(
             reference_genome=ReferenceGenome.GRCh38,
             dataset_type=DatasetType.SNV_INDEL,
             sample_type=SampleType.WGS,
@@ -102,7 +102,7 @@ def test_job_failed(
             'FailedPrecondition: 400 Job failed with message',
         )
         worker = luigi.worker.Worker()
-        task = WriteSuccessFileOnDataprocTask(
+        task = RunPipelineOnDataprocTask(
             reference_genome=ReferenceGenome.GRCh38,
             dataset_type=DatasetType.SNV_INDEL,
             sample_type=SampleType.WGS,
@@ -118,7 +118,7 @@ def test_job_failed(
         mock_logger.info.assert_has_calls(
             [
                 call(
-                    'Waiting for job completion WriteSuccessFileOnDataprocTask-manual__2024-04-05',
+                    'Waiting for job completion RunPipelineTask-manual__2024-04-05',
                 ),
             ],
         )
@@ -141,7 +141,7 @@ def test_job_success(
         operation = mock_client.submit_job_as_operation.return_value
         operation.done.side_effect = [False, True]
         worker = luigi.worker.Worker()
-        task = WriteSuccessFileOnDataprocTask(
+        task = RunPipelineOnDataprocTask(
             reference_genome=ReferenceGenome.GRCh38,
             dataset_type=DatasetType.SNV_INDEL,
             sample_type=SampleType.WGS,
diff --git a/v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc.py b/v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc.py
deleted file mode 100644
index 0c48c344f..000000000
--- a/v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc.py
+++ /dev/null
@@ -1,22 +0,0 @@
-import luigi
-
-from v03_pipeline.lib.paths import pipeline_run_success_file_path
-from v03_pipeline.lib.tasks.base.base_loading_run_params import (
-    BaseLoadingRunParams,
-)
-from v03_pipeline.lib.tasks.dataproc.base_run_job_on_dataproc import (
-    BaseRunJobOnDataprocTask,
-)
-from v03_pipeline.lib.tasks.files import GCSorLocalTarget
-
-
-@luigi.util.inherits(BaseLoadingRunParams)
-class WriteSuccessFileOnDataprocTask(BaseRunJobOnDataprocTask):
-    def output(self) -> luigi.Target:
-        return GCSorLocalTarget(
-            pipeline_run_success_file_path(
-                self.reference_genome,
-                self.dataset_type,
-                self.run_id,
-            ),
-        )
diff --git a/v03_pipeline/lib/tasks/run_pipeline.py b/v03_pipeline/lib/tasks/run_pipeline.py
new file mode 100644
index 000000000..1711143d2
--- /dev/null
+++ b/v03_pipeline/lib/tasks/run_pipeline.py
@@ -0,0 +1,32 @@
+import luigi
+import luigi.util
+
+from v03_pipeline.lib.tasks.base.base_loading_run_params import (
+    BaseLoadingRunParams,
+)
+from v03_pipeline.lib.tasks.update_variant_annotations_table_with_new_samples import (
+    UpdateVariantAnnotationsTableWithNewSamplesTask,
+)
+from v03_pipeline.lib.tasks.write_metadata_for_run import WriteMetadataForRunTask
+from v03_pipeline.lib.tasks.write_project_family_tables import (
+    WriteProjectFamilyTablesTask,
+)
+
+
+@luigi.util.inherits(BaseLoadingRunParams)
+class RunPipelineTask(luigi.WrapperTask):
+    def requires(self):
+        requirements = [
+            self.clone(WriteMetadataForRunTask),
+            self.clone(UpdateVariantAnnotationsTableWithNewSamplesTask),
+        ]
+        return [
+            *requirements,
+            *[
+                self.clone(
+                    WriteProjectFamilyTablesTask,
+                    project_i=i,
+                )
+                for i in range(len(self.project_guids))
+            ],
+        ]
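`RunPipelineTask` is a `luigi.WrapperTask`: it produces no output of its own and counts as complete exactly when everything it requires is complete. A minimal sketch of that behavior with stand-in subtasks:

```python
import luigi


class PartA(luigi.Task):
    def output(self):
        return luigi.LocalTarget('/tmp/part_a.txt')

    def run(self):
        with self.output().open('w') as f:
            f.write('a')


class PartB(luigi.Task):
    def output(self):
        return luigi.LocalTarget('/tmp/part_b.txt')

    def run(self):
        with self.output().open('w') as f:
            f.write('b')


class RunAll(luigi.WrapperTask):
    # Complete only once PartA and PartB are both complete.
    def requires(self):
        return [PartA(), PartB()]


if __name__ == '__main__':
    luigi.build([RunAll()], local_scheduler=True)
```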
diff --git a/v03_pipeline/lib/tasks/write_success_file.py b/v03_pipeline/lib/tasks/write_success_file.py
index 60c90f81d..2428c6ceb 100644
--- a/v03_pipeline/lib/tasks/write_success_file.py
+++ b/v03_pipeline/lib/tasks/write_success_file.py
@@ -1,16 +1,16 @@
 import luigi
 import luigi.util

+from v03_pipeline.lib.model.feature_flag import FeatureFlag
 from v03_pipeline.lib.paths import pipeline_run_success_file_path
-from v03_pipeline.lib.tasks import WriteProjectFamilyTablesTask
 from v03_pipeline.lib.tasks.base.base_loading_run_params import (
     BaseLoadingRunParams,
 )
-from v03_pipeline.lib.tasks.files import GCSorLocalTarget
-from v03_pipeline.lib.tasks.update_variant_annotations_table_with_new_samples import (
-    UpdateVariantAnnotationsTableWithNewSamplesTask,
+from v03_pipeline.lib.tasks.dataproc.rsync_to_seqr_app_dirs import (
+    RsyncToSeqrAppDirsTask,
 )
-from v03_pipeline.lib.tasks.write_metadata_for_run import WriteMetadataForRunTask
+from v03_pipeline.lib.tasks.files import GCSorLocalTarget
+from v03_pipeline.lib.tasks.run_pipeline import RunPipelineTask


 @luigi.util.inherits(BaseLoadingRunParams)
@@ -24,21 +24,12 @@ def output(self) -> luigi.Target:
             ),
         )

-    def requires(self):
-        requirements = [
-            self.clone(WriteMetadataForRunTask),
-            self.clone(UpdateVariantAnnotationsTableWithNewSamplesTask),
-        ]
-        return [
-            *requirements,
-            *[
-                self.clone(
-                    WriteProjectFamilyTablesTask,
-                    project_i=i,
-                )
-                for i in range(len(self.project_guids))
-            ],
-        ]
+    def requires(self) -> luigi.Task:
+        return (
+            self.clone(RsyncToSeqrAppDirsTask)
+            if FeatureFlag.RUN_PIPELINE_ON_DATAPROC
+            else self.clone(RunPipelineTask)
+        )

     def run(self):
         with self.output().open('w') as f:
diff --git a/v03_pipeline/lib/tasks/write_success_file_test.py b/v03_pipeline/lib/tasks/write_success_file_test.py
index 18df69e86..4949481a6 100644
--- a/v03_pipeline/lib/tasks/write_success_file_test.py
+++ b/v03_pipeline/lib/tasks/write_success_file_test.py
@@ -10,24 +10,13 @@

 class WriteSuccessFileTaskTest(MockedDatarootTestCase):
     @mock.patch(
-        'v03_pipeline.lib.tasks.write_success_file.WriteMetadataForRunTask',
-    )
-    @mock.patch(
-        'v03_pipeline.lib.tasks.write_success_file.WriteProjectFamilyTablesTask',
-    )
-    @mock.patch(
-        'v03_pipeline.lib.tasks.write_success_file.UpdateVariantAnnotationsTableWithNewSamplesTask',
+        'v03_pipeline.lib.tasks.write_success_file.RunPipelineTask',
     )
     def test_write_success_file_task(
         self,
-        mock_update_variant_annotations_task,
-        mock_write_project_fam_tables,
-        mock_write_metadata_for_run_task,
+        mock_run_pipeline_task: mock.Mock,
     ) -> None:
-        mock_write_metadata_for_run_task.return_value = MockCompleteTask()
-        mock_update_variant_annotations_task.return_value = MockCompleteTask()
-        mock_write_project_fam_tables.return_value = MockCompleteTask()
-
+        mock_run_pipeline_task.return_value = MockCompleteTask()
         worker = luigi.worker.Worker()
         write_success_file = WriteSuccessFileTask(
             reference_genome=ReferenceGenome.GRCh38,
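With the dependency tree collapsed behind `RunPipelineTask`, the test only needs to mock one thing. `MockCompleteTask` lives in `v03_pipeline/lib/test/mock_complete_task.py` and is presumably little more than a task whose `complete()` always returns True, along these lines:

```python
import luigi


class MockCompleteTask(luigi.Task):
    # Reports itself complete immediately, so the scheduler never runs it
    # and the task under test can proceed as if its dependency succeeded.
    def complete(self) -> bool:
        return True
```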
From b9df3b68734a679839f3965a96a681d7a49d4352 Mon Sep 17 00:00:00 2001
From: Benjamin Blankenmeister
Date: Mon, 13 Jan 2025 15:40:15 -0500
Subject: [PATCH 10/10] feat: Misaligned ploidy samples (#1013)

* Misaligned ploidy samples
* regex
* Make sure s is a key in tests, update validation errors to support body
* unnecessary key
* Update v03_pipeline/lib/misc/validation.py

Co-authored-by: Daniel Marten <78616802+matren395@users.noreply.github.com>

* Update validation_test.py
* validation
* fix test

---------

Co-authored-by: Daniel Marten <78616802+matren395@users.noreply.github.com>
---
 v03_pipeline/lib/misc/validation.py           |  14 +-
 v03_pipeline/lib/misc/validation_test.py      | 418 ++++++++++--------
 .../lib/tasks/validate_callset_test.py        |   1 -
 .../tasks/write_validation_errors_for_run.py  |   8 +-
 4 files changed, 237 insertions(+), 204 deletions(-)

diff --git a/v03_pipeline/lib/misc/validation.py b/v03_pipeline/lib/misc/validation.py
index 113e79b53..5d23397be 100644
--- a/v03_pipeline/lib/misc/validation.py
+++ b/v03_pipeline/lib/misc/validation.py
@@ -151,10 +151,16 @@ def validate_imputed_sex_ploidy(
             )
         ),
     )
-    discrepant_rate = mt.aggregate_cols(hl.agg.fraction(mt.discrepant))
-    if discrepant_rate:
-        msg = f'{discrepant_rate:.2%} of samples have misaligned ploidy with their provided imputed sex.'
-        raise SeqrValidationError(msg)
+    discrepant_samples = mt.aggregate_cols(
+        hl.agg.filter(mt.discrepant, hl.agg.collect_as_set(mt.s)),
+    )
+    if discrepant_samples:
+        sorted_discrepant_samples = sorted(discrepant_samples)
+        msg = f'Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : {sorted_discrepant_samples[:10]}'
+        raise SeqrValidationError(
+            msg,
+            {'imputed_sex_ploidy_failures': sorted_discrepant_samples},
+        )


 def validate_sample_type(
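The aggregation change swaps an anonymous fraction for the actual offending sample ids: `hl.agg.filter` restricts the aggregation to discrepant columns, and `collect_as_set` gathers their `s` values, which is what makes the new actionable error message possible. A hedged, self-contained demonstration of that aggregation shape on a toy MatrixTable (requires a local Hail installation; this is not the pipeline's real ploidy logic):

```python
import hail as hl

# Four columns; mark columns 0 and 2 as "discrepant" for demonstration.
mt = hl.utils.range_matrix_table(n_rows=1, n_cols=4)
mt = mt.annotate_cols(
    discrepant=mt.col_idx % 2 == 0,
    s='s' + hl.str(mt.col_idx),
)
discrepant_samples = mt.aggregate_cols(
    hl.agg.filter(mt.discrepant, hl.agg.collect_as_set(mt.s)),
)
print(sorted(discrepant_samples))  # ['s0', 's2']
```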
diff --git a/v03_pipeline/lib/misc/validation_test.py b/v03_pipeline/lib/misc/validation_test.py
index f32900a99..b1057e1e2 100644
--- a/v03_pipeline/lib/misc/validation_test.py
+++ b/v03_pipeline/lib/misc/validation_test.py
@@ -18,60 +18,68 @@


 def _mt_from_contigs(contigs):
-    return hl.MatrixTable.from_parts(
-        rows={
-            'locus': [
-                hl.Locus(
-                    contig=contig,
-                    position=1,
-                    reference_genome='GRCh38',
-                )
-                for contig in contigs
-            ],
-        },
-        cols={'s': ['sample_1']},
-        entries={'HL': [[0.0] for _ in range(len(contigs))]},
-    ).key_rows_by('locus')
+    return (
+        hl.MatrixTable.from_parts(
+            rows={
+                'locus': [
+                    hl.Locus(
+                        contig=contig,
+                        position=1,
+                        reference_genome='GRCh38',
+                    )
+                    for contig in contigs
+                ],
+            },
+            cols={'s': ['sample_1']},
+            entries={'HL': [[0.0] for _ in range(len(contigs))]},
+        )
+        .key_rows_by('locus')
+        .key_cols_by('s')
+    )


 class ValidationTest(unittest.TestCase):
     def test_validate_allele_type(self) -> None:
-        mt = hl.MatrixTable.from_parts(
-            rows={
-                'locus': [
-                    hl.Locus(
-                        contig='chr1',
-                        position=1,
-                        reference_genome='GRCh38',
-                    ),
-                    hl.Locus(
-                        contig='chr1',
-                        position=2,
-                        reference_genome='GRCh38',
-                    ),
-                    hl.Locus(
-                        contig='chr1',
-                        position=3,
-                        reference_genome='GRCh38',
-                    ),
-                    hl.Locus(
-                        contig='chr1',
-                        position=4,
-                        reference_genome='GRCh38',
-                    ),
-                ],
-                'alleles': [
-                    ['A', 'T'],
-                    # NB: star alleles should pass through this validation just fine,
-                    # but are eventually filtered out upstream.
-                    ['A', '*'],
-                    ['A', '-'],
-                    ['A', ''],
-                ],
-            },
-            cols={'s': ['sample_1']},
-            entries={'HL': [[0.0], [0.0], [0.0], [0.0]]},
-        ).key_rows_by('locus', 'alleles')
+        mt = (
+            hl.MatrixTable.from_parts(
+                rows={
+                    'locus': [
+                        hl.Locus(
+                            contig='chr1',
+                            position=1,
+                            reference_genome='GRCh38',
+                        ),
+                        hl.Locus(
+                            contig='chr1',
+                            position=2,
+                            reference_genome='GRCh38',
+                        ),
+                        hl.Locus(
+                            contig='chr1',
+                            position=3,
+                            reference_genome='GRCh38',
+                        ),
+                        hl.Locus(
+                            contig='chr1',
+                            position=4,
+                            reference_genome='GRCh38',
+                        ),
+                    ],
+                    'alleles': [
+                        ['A', 'T'],
+                        # NB: star alleles should pass through this validation just fine,
+                        # but are eventually filtered out upstream.
+                        ['A', '*'],
+                        ['A', '-'],
+                        ['A', ''],
+                    ],
+                },
+                cols={'s': ['sample_1']},
+                entries={'HL': [[0.0], [0.0], [0.0], [0.0]]},
+            )
+            .key_rows_by('locus', 'alleles')
+            .key_cols_by('s')
+        )
         self.assertRaisesRegex(
             SeqrValidationError,
             "Alleles with invalid AlleleType are present in the callset: \\[\\('A', '-'\\), \\('A', ''\\)\\]",
@@ -80,28 +88,32 @@ def test_validate_allele_type(self) -> None:
             DatasetType.SNV_INDEL,
         )

-        mt = hl.MatrixTable.from_parts(
-            rows={
-                'locus': [
-                    hl.Locus(
-                        contig='chr1',
-                        position=1,
-                        reference_genome='GRCh38',
-                    ),
-                    hl.Locus(
-                        contig='chr1',
-                        position=2,
-                        reference_genome='GRCh38',
-                    ),
-                ],
-                'alleles': [
-                    ['C', ''],
-                    ['A', ''],
-                ],
-            },
-            cols={'s': ['sample_1']},
-            entries={'HL': [[0.0], [0.0]]},
-        ).key_rows_by('locus', 'alleles')
+        mt = (
+            hl.MatrixTable.from_parts(
+                rows={
+                    'locus': [
+                        hl.Locus(
+                            contig='chr1',
+                            position=1,
+                            reference_genome='GRCh38',
+                        ),
+                        hl.Locus(
+                            contig='chr1',
+                            position=2,
+                            reference_genome='GRCh38',
+                        ),
+                    ],
+                    'alleles': [
+                        ['C', ''],
+                        ['A', ''],
+                    ],
+                },
+                cols={'s': ['sample_1']},
+                entries={'HL': [[0.0], [0.0]]},
+            )
+            .key_rows_by('locus', 'alleles')
+            .key_cols_by('s')
+        )
         self.assertRaisesRegex(
             SeqrValidationError,
             'Alleles with invalid allele are present in the callset. This appears to be a GVCF containing records for sites with no variants.',
@@ -122,116 +134,128 @@ def test_validate_imputed_sex_ploidy(self) -> None:
         sex_check_ht = hl.read_table(TEST_SEX_CHECK_1)

         # All calls on X chromosome are valid
-        mt = hl.MatrixTable.from_parts(
-            rows={
-                'locus': [
-                    hl.Locus(
-                        contig='chrX',
-                        position=1,
-                        reference_genome='GRCh38',
-                    ),
-                ],
-            },
-            cols={
-                's': [
-                    female_sample,
-                    male_sample_1,
-                    x0_sample,
-                    xxy_sample,
-                    xyy_sample,
-                    xxx_sample,
-                ],
-            },
-            entries={
-                'GT': [
-                    [
-                        hl.Call(alleles=[0, 0], phased=False),
-                        hl.Call(alleles=[0], phased=False),
-                        hl.Call(alleles=[0, 0], phased=False),  # X0
-                        hl.Call(alleles=[0, 0], phased=False),  # XXY
-                        hl.Call(alleles=[0, 0], phased=False),  # XYY
-                        hl.Call(alleles=[0, 0], phased=False),  # XXX
-                    ],
-                ],
-            },
-        ).key_rows_by('locus')
+        mt = (
+            hl.MatrixTable.from_parts(
+                rows={
+                    'locus': [
+                        hl.Locus(
+                            contig='chrX',
+                            position=1,
+                            reference_genome='GRCh38',
+                        ),
+                    ],
+                },
+                cols={
+                    's': [
+                        female_sample,
+                        male_sample_1,
+                        x0_sample,
+                        xxy_sample,
+                        xyy_sample,
+                        xxx_sample,
+                    ],
+                },
+                entries={
+                    'GT': [
+                        [
+                            hl.Call(alleles=[0, 0], phased=False),
+                            hl.Call(alleles=[0], phased=False),
+                            hl.Call(alleles=[0, 0], phased=False),  # X0
+                            hl.Call(alleles=[0, 0], phased=False),  # XXY
+                            hl.Call(alleles=[0, 0], phased=False),  # XYY
+                            hl.Call(alleles=[0, 0], phased=False),  # XXX
+                        ],
+                    ],
+                },
+            )
+            .key_rows_by('locus')
+            .key_cols_by('s')
+        )
         validate_imputed_sex_ploidy(mt, sex_check_ht)

         # All calls on Y chromosome are valid
-        mt = hl.MatrixTable.from_parts(
-            rows={
-                'locus': [
-                    hl.Locus(
-                        contig='chrY',
-                        position=1,
-                        reference_genome='GRCh38',
-                    ),
-                ],
-            },
-            cols={
-                's': [
-                    female_sample,
-                    male_sample_1,
-                    x0_sample,
-                    xxy_sample,
-                    xyy_sample,
-                    xxx_sample,
-                ],
-            },
-            entries={
-                'GT': [
-                    [
-                        hl.missing(hl.tcall),
-                        hl.Call(alleles=[0], phased=False),
-                        hl.missing(hl.tcall),  # X0
-                        hl.Call(alleles=[0, 0], phased=False),  # XXY
-                        hl.Call(alleles=[0, 0], phased=False),  # XYY
-                        hl.missing(hl.tcall),  # XXX
-                    ],
-                ],
-            },
-        ).key_rows_by('locus')
+        mt = (
+            hl.MatrixTable.from_parts(
+                rows={
+                    'locus': [
+                        hl.Locus(
+                            contig='chrY',
+                            position=1,
+                            reference_genome='GRCh38',
+                        ),
+                    ],
+                },
+                cols={
+                    's': [
+                        female_sample,
+                        male_sample_1,
+                        x0_sample,
+                        xxy_sample,
+                        xyy_sample,
+                        xxx_sample,
+                    ],
+                },
+                entries={
+                    'GT': [
+                        [
+                            hl.missing(hl.tcall),
+                            hl.Call(alleles=[0], phased=False),
+                            hl.missing(hl.tcall),  # X0
+                            hl.Call(alleles=[0, 0], phased=False),  # XXY
+                            hl.Call(alleles=[0, 0], phased=False),  # XYY
+                            hl.missing(hl.tcall),  # XXX
+                        ],
+                    ],
+                },
+            )
+            .key_rows_by('locus')
+            .key_cols_by('s')
+        )
         validate_imputed_sex_ploidy(mt, sex_check_ht)

         # Invalid X chromosome case
-        mt = hl.MatrixTable.from_parts(
-            rows={
-                'locus': [
-                    hl.Locus(
-                        contig='chrX',
-                        position=1,
-                        reference_genome='GRCh38',
-                    ),
-                ],
-            },
-            cols={
-                's': [
-                    female_sample,
-                    male_sample_1,
-                    male_sample_2,
-                    x0_sample,
-                    xxy_sample,
-                    xyy_sample,
-                    xxx_sample,
-                ],
-            },
-            entries={
-                'GT': [
-                    [
-                        hl.Call(alleles=[0], phased=False),  # invalid Female call
-                        hl.Call(alleles=[0], phased=False),  # valid Male call
-                        hl.missing(hl.tcall),  # invalid Male call
-                        hl.Call(alleles=[0], phased=False),  # invalid X0 call
-                        hl.Call(alleles=[0], phased=False),  # invalid XXY call
-                        hl.missing(hl.tcall),  # valid XYY call
-                        hl.Call(alleles=[0, 0], phased=False),  # valid XXX call
-                    ],
-                ],
-            },
-        ).key_rows_by('locus')
+        mt = (
+            hl.MatrixTable.from_parts(
+                rows={
+                    'locus': [
+                        hl.Locus(
+                            contig='chrX',
+                            position=1,
+                            reference_genome='GRCh38',
+                        ),
+                    ],
+                },
+                cols={
+                    's': [
+                        female_sample,
+                        male_sample_1,
+                        male_sample_2,
+                        x0_sample,
+                        xxy_sample,
+                        xyy_sample,
+                        xxx_sample,
+                    ],
+                },
+                entries={
+                    'GT': [
+                        [
+                            hl.Call(alleles=[0], phased=False),  # invalid Female call
+                            hl.Call(alleles=[0], phased=False),  # valid Male call
+                            hl.missing(hl.tcall),  # invalid Male call
+                            hl.Call(alleles=[0], phased=False),  # invalid X0 call
+                            hl.Call(alleles=[0], phased=False),  # invalid XXY call
+                            hl.missing(hl.tcall),  # valid XYY call
+                            hl.Call(alleles=[0, 0], phased=False),  # valid XXX call
+                        ],
+                    ],
+                },
+            )
+            .key_rows_by('locus')
+            .key_cols_by('s')
+        )
         self.assertRaisesRegex(
             SeqrValidationError,
-            '57.14% of samples have misaligned ploidy',
+            "Found samples with misaligned ploidy with their provided imputed sex \\(first 10, if applicable\\) : \\['HG00731_1', 'HG00732_1', 'NA20889_1', 'NA20899_1'\\].*",
             validate_imputed_sex_ploidy,
             mt,
             sex_check_ht,
@@ -253,34 +277,38 @@ def test_validate_imported_field_types(self) -> None:
         )

     def test_validate_no_duplicate_variants(self) -> None:
-        mt = hl.MatrixTable.from_parts(
-            rows={
-                'locus': [
-                    hl.Locus(
-                        contig='chr1',
-                        position=1,
-                        reference_genome='GRCh38',
-                    ),
-                    hl.Locus(
-                        contig='chr1',
-                        position=2,
-                        reference_genome='GRCh38',
-                    ),
-                    hl.Locus(
-                        contig='chr1',
-                        position=2,
-                        reference_genome='GRCh38',
-                    ),
-                ],
-                'alleles': [
-                    ['A', 'C'],
-                    ['A', 'C'],
-                    ['A', 'C'],
-                ],
-            },
-            cols={'s': ['sample_1']},
-            entries={'HL': [[0.0], [0.0], [0.0]]},
-        ).key_rows_by('locus', 'alleles')
+        mt = (
+            hl.MatrixTable.from_parts(
+                rows={
+                    'locus': [
+                        hl.Locus(
+                            contig='chr1',
+                            position=1,
+                            reference_genome='GRCh38',
+                        ),
+                        hl.Locus(
+                            contig='chr1',
+                            position=2,
+                            reference_genome='GRCh38',
+                        ),
+                        hl.Locus(
+                            contig='chr1',
+                            position=2,
+                            reference_genome='GRCh38',
+                        ),
+                    ],
+                    'alleles': [
+                        ['A', 'C'],
+                        ['A', 'C'],
+                        ['A', 'C'],
+                    ],
+                },
+                cols={'s': ['sample_1']},
+                entries={'HL': [[0.0], [0.0], [0.0]]},
+            )
+            .key_rows_by('locus', 'alleles')
+            .key_cols_by('s')
+        )
         self.assertRaisesRegex(
             SeqrValidationError,
             "Variants are present multiple times in the callset: \\['1-2-A-C'\\]",
diff --git a/v03_pipeline/lib/tasks/validate_callset_test.py b/v03_pipeline/lib/tasks/validate_callset_test.py
index a6d2b0377..8f3638376 100644
--- a/v03_pipeline/lib/tasks/validate_callset_test.py
+++ b/v03_pipeline/lib/tasks/validate_callset_test.py
@@ -83,6 +83,5 @@ def test_validate_callset_multiple_exceptions(
                     'Missing the following expected contigs:chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr20, chr21, chr22, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX',
                     'Sample type validation error: dataset sample-type is specified as WES but appears to be WGS because it contains many common non-coding variants',
                 ],
-                'failed_family_samples': {},
             },
         )
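With the `failed_family_samples` key gone, `SeqrValidationError` now carries a generic body dict as its second argument, and the decorator below forwards `e.args[1]` into the luigi `DictParameter` wholesale. A small sketch of producing and consuming that two-argument convention (the guard for a missing body is my addition, not the pipeline's):

```python
class SeqrValidationError(Exception):
    pass


try:
    raise SeqrValidationError(
        'Found samples with misaligned ploidy with their provided imputed sex',
        {'imputed_sex_ploidy_failures': ['HG00731_1', 'HG00732_1']},
    )
except SeqrValidationError as e:
    message = str(e.args[0])
    # The pipeline accesses e.args[1] directly; a defensive caller might
    # fall back to an empty body instead.
    error_body = e.args[1] if len(e.args) > 1 else {}
    print(message, error_body)
```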
diff --git a/v03_pipeline/lib/tasks/write_validation_errors_for_run.py b/v03_pipeline/lib/tasks/write_validation_errors_for_run.py
index d99800f6f..c59fce7e0 100644
--- a/v03_pipeline/lib/tasks/write_validation_errors_for_run.py
+++ b/v03_pipeline/lib/tasks/write_validation_errors_for_run.py
@@ -15,7 +15,7 @@ class WriteValidationErrorsForRunTask(luigi.Task):
     project_guids = luigi.ListParameter()
     error_messages = luigi.ListParameter(default=[])
-    failed_family_samples = luigi.DictParameter(default={})
+    error_body = luigi.DictParameter(default={})

     def to_single_error_message(self) -> str:
         with self.output().open('r') as f:
@@ -37,8 +37,8 @@ def run(self) -> None:
         validation_errors_json = {
             'project_guids': self.project_guids,
             'error_messages': self.error_messages,
-            'failed_family_samples': luigi.freezing.recursively_unfreeze(
-                self.failed_family_samples,
+            **luigi.freezing.recursively_unfreeze(
+                self.error_body,
             ),
         }
         with self.output().open('w') as f:
@@ -57,7 +57,7 @@ def wrapper(self: luigi.Task):
             write_validation_errors_for_run_task = self.clone(
                 WriteValidationErrorsForRunTask,
                 error_messages=[str(e.args[0])],
-                failed_family_samples=e.args[1]['failed_family_samples'],
+                error_body=e.args[1],
             )
         else:
             write_validation_errors_for_run_task = self.clone(