
Commit 25db277

feat: Add ability to run tasks on dataproc. (#948)
* Support gcs dirs in rsync
* ws
* Add create dataproc cluster task
* add dataproc
* ruff
* requirements
* still struggling
* Gencode refactor to remove gcs
* bump reqs
* Run dataproc job
* lib
* running
* merge requirements
* Flip'em
* Better exception handling
* Cleaner approach if less generalizable
* write a test
* Fix tests
* lint
* Add test for success
* refactor to use a base class... better for adding support for multiple jobs
* cleanup
* ruff
* Fix missing mock
* Fix flapping test
* pr comments
1 parent ddf867a commit 25db277

8 files changed (+392, -21 lines)

requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 # This file is autogenerated by pip-compile with Python 3.10
 # by the following command:
 #
-# pip-compile --resolver=backtracking requirements.in
+# pip-compile requirements.in
 #
 aiodns==2.0.0
     # via hail
v03_pipeline/lib/tasks/dataproc/base_run_job_on_dataproc.py

Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
+import time
+
+import google.api_core.exceptions
+import luigi
+from google.cloud import dataproc_v1 as dataproc
+
+from v03_pipeline.lib.logger import get_logger
+from v03_pipeline.lib.model import Env
+from v03_pipeline.lib.tasks.base.base_loading_pipeline_params import (
+    BaseLoadingPipelineParams,
+)
+from v03_pipeline.lib.tasks.dataproc.create_dataproc_cluster import (
+    CreateDataprocClusterTask,
+)
+from v03_pipeline.lib.tasks.dataproc.misc import get_cluster_name, to_kebab_str_args
+
+DONE_STATE = 'DONE'
+ERROR_STATE = 'ERROR'
+SEQR_PIPELINE_RUNNER_BUILD = f'gs://seqr-pipeline-runner-builds/{Env.DEPLOYMENT_TYPE}/{Env.PIPELINE_RUNNER_APP_VERSION}'
+TIMEOUT_S = 172800  # 2 days
+
+logger = get_logger(__name__)
+
+
+@luigi.util.inherits(BaseLoadingPipelineParams)
+class BaseRunJobOnDataprocTask(luigi.Task):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.client = dataproc.JobControllerClient(
+            client_options={
+                'api_endpoint': f'{Env.GCLOUD_REGION}-dataproc.googleapis.com:443',
+            },
+        )
+
+    @property
+    def task_name(self):
+        return self.get_task_family().split('.')[-1]
+
+    @property
+    def job_id(self):
+        return f'{self.task_name}-{self.run_id}'
+
+    def requires(self) -> [luigi.Task]:
+        return [self.clone(CreateDataprocClusterTask)]
+
+    def complete(self) -> bool:
+        if not self.dataset_type.requires_dataproc:
+            msg = f'{self.dataset_type} should not require a dataproc job'
+            raise RuntimeError(msg)
+        try:
+            job = self.client.get_job(
+                request={
+                    'project_id': Env.GCLOUD_PROJECT,
+                    'region': Env.GCLOUD_REGION,
+                    'job_id': self.job_id,
+                },
+            )
+        except google.api_core.exceptions.NotFound:
+            return False
+        if job.status.state == ERROR_STATE:
+            msg = f'Job {self.task_name}-{self.run_id} entered ERROR state'
+            logger.error(msg)
+            logger.error(job.status.details)
+        return job.status.state == DONE_STATE
+
+    def run(self):
+        operation = self.client.submit_job_as_operation(
+            request={
+                'project_id': Env.GCLOUD_PROJECT,
+                'region': Env.GCLOUD_REGION,
+                'job': {
+                    'reference': {
+                        'job_id': self.job_id,
+                    },
+                    'placement': {
+                        'cluster_name': get_cluster_name(
+                            self.reference_genome,
+                            self.run_id,
+                        ),
+                    },
+                    'pyspark_job': {
+                        'main_python_file_uri': f'{SEQR_PIPELINE_RUNNER_BUILD}/bin/run_task.py',
+                        'args': [
+                            self.task_name,
+                            '--local-scheduler',
+                            *to_kebab_str_args(self),
+                        ],
+                        'python_file_uris': [
+                            f'{SEQR_PIPELINE_RUNNER_BUILD}/pyscripts.zip',
+                        ],
+                    },
+                },
+            },
+        )
+        wait_s = 0
+        while wait_s < TIMEOUT_S:
+            if operation.done():
+                operation.result()  # Will throw on failure!
+                msg = f'Finished {self.job_id}'
+                logger.info(msg)
+                break
+            logger.info(
+                f'Waiting for job completion {self.job_id}',
+            )
+            time.sleep(3)
+            wait_s += 3
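
For orientation, a minimal sketch of how a concrete job plugs into this base class (the subclass name and output path below are hypothetical, not part of this commit): a subclass only declares its output, and inherits client setup, job submission, polling, and the completion check.

import luigi

from v03_pipeline.lib.tasks.dataproc.base_run_job_on_dataproc import (
    BaseRunJobOnDataprocTask,
)
from v03_pipeline.lib.tasks.files import GCSorLocalTarget


class ExampleJobOnDataprocTask(BaseRunJobOnDataprocTask):
    # Hypothetical subclass for illustration. Its Dataproc job id becomes
    # 'ExampleJobOnDataprocTask-<run_id>', and the inherited run() submits
    # run_task.py with this task's kebab-cased Luigi parameters as job args.
    def output(self) -> luigi.Target:
        return GCSorLocalTarget('gs://example-bucket/example-marker')  # placeholder path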

v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster.py

Lines changed: 23 additions & 13 deletions
@@ -1,5 +1,6 @@
 import time
 
+import google.api_core.exceptions
 import hail as hl
 import luigi
 from google.cloud import dataproc_v1 as dataproc
@@ -11,13 +12,15 @@
 from v03_pipeline.lib.tasks.base.base_loading_pipeline_params import (
     BaseLoadingPipelineParams,
 )
+from v03_pipeline.lib.tasks.dataproc.misc import get_cluster_name
 
-CLUSTER_NAME_PREFIX = 'pipeline-runner'
 DEBIAN_IMAGE = '2.1.33-debian11'
+ERROR_STATE = 'ERROR'
 HAIL_VERSION = hl.version().split('-')[0]
 INSTANCE_TYPE = 'n1-highmem-8'
 PKGS = '|'.join(pip_freeze.freeze())
-SUCCESS_STATE = 'RUNNING'
+RUNNING_STATE = 'RUNNING'
+TIMEOUT_S = 900
 
 logger = get_logger(__name__)
 
@@ -26,7 +29,7 @@ def get_cluster_config(reference_genome: ReferenceGenome, run_id: str):
     service_account_credentials = get_service_account_credentials()
     return {
         'project_id': Env.GCLOUD_PROJECT,
-        'cluster_name': f'{CLUSTER_NAME_PREFIX}-{reference_genome.value.lower()}-{run_id}',
+        'cluster_name': get_cluster_name(reference_genome, run_id),
         # Schema found at https://cloud.google.com/dataproc/docs/reference/rest/v1/ClusterConfig
         'config': {
             'gce_cluster_config': {
@@ -136,27 +139,32 @@ def __init__(self, *args, **kwargs):
         # https://cloud.google.com/dataproc/docs/tutorials/python-library-example
         self.client = dataproc.ClusterControllerClient(
             client_options={
-                'api_endpoint': f'{Env.GCLOUD_REGION}-dataproc.googleapis.com:443'.format(
-                    Env.GCLOUD_REGION,
-                ),
+                'api_endpoint': f'{Env.GCLOUD_REGION}-dataproc.googleapis.com:443',
             },
         )
 
     def complete(self) -> bool:
         if not self.dataset_type.requires_dataproc:
-            return True
+            msg = f'{self.dataset_type} should not require a dataproc cluster'
+            raise RuntimeError(msg)
         try:
-            client = self.client.get_cluster(
+            cluster = self.client.get_cluster(
                 request={
                     'project_id': Env.GCLOUD_PROJECT,
                     'region': Env.GCLOUD_REGION,
-                    'cluster_name': f'{CLUSTER_NAME_PREFIX}-{self.reference_genome.value.lower()}',
+                    'cluster_name': get_cluster_name(
+                        self.reference_genome,
+                        self.run_id,
+                    ),
                 },
             )
-        except Exception:  # noqa: BLE001
+        except google.api_core.exceptions.NotFound:
             return False
-        else:
-            return client.status.state == SUCCESS_STATE
+        if cluster.status.state == ERROR_STATE:
+            msg = f'Cluster {cluster.cluster_name} entered ERROR state'
+            logger.error(msg)
+        # This will return False when the cluster is "CREATING"
+        return cluster.status.state == RUNNING_STATE
 
     def run(self):
         operation = self.client.create_cluster(
@@ -166,11 +174,13 @@ def run(self):
                 'cluster': get_cluster_config(self.reference_genome, self.run_id),
             },
         )
-        while True:
+        wait_s = 0
+        while wait_s < TIMEOUT_S:
             if operation.done():
                 result = operation.result()  # Will throw on failure!
                 msg = f'Created cluster {result.cluster_name} with cluster uuid: {result.cluster_uuid}'
                 logger.info(msg)
                 break
             logger.info('Waiting for cluster spinup')
             time.sleep(3)
+            wait_s += 3
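
The run() change above replaces an unbounded while True loop with a bounded poll. A distilled sketch of that pattern, assuming a google.api_core-style long-running operation (a generic helper, not code from this commit):

import time

POLL_INTERVAL_S = 3


def wait_for_operation(operation, timeout_s: int) -> None:
    # Poll until the operation completes, giving up after timeout_s seconds;
    # result() re-raises the operation's error on failure, as in the loop above.
    wait_s = 0
    while wait_s < timeout_s:
        if operation.done():
            operation.result()
            return
        time.sleep(POLL_INTERVAL_S)
        wait_s += POLL_INTERVAL_S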

v03_pipeline/lib/tasks/dataproc/create_dataproc_cluster_test.py

Lines changed: 5 additions & 7 deletions
@@ -28,15 +28,12 @@ def test_dataset_type_unsupported(
         mock_cluster_controller: Mock,
         _: Mock,
     ) -> None:
-        worker = luigi.worker.Worker()
         task = CreateDataprocClusterTask(
             reference_genome=ReferenceGenome.GRCh38,
             dataset_type=DatasetType.MITO,
             run_id='1',
         )
-        worker.add(task)
-        worker.run()
-        self.assertTrue(task.complete())
+        self.assertRaises(RuntimeError, task.complete)
 
     def test_spinup_cluster_already_exists_failed(
         self,
@@ -45,7 +42,8 @@
     ) -> None:
         mock_client = mock_cluster_controller.return_value
         mock_client.get_cluster.return_value = SimpleNamespace(
-            status=SimpleNamespace(state='FAILED'),
+            status=SimpleNamespace(state='ERROR'),
+            cluster_name='abc',
         )
         mock_client.create_cluster.side_effect = (
             google.api_core.exceptions.AlreadyExists('cluster exists')
@@ -122,7 +120,7 @@ def test_spinup_cluster_doesnt_exist_success(
         operation = mock_client.create_cluster.return_value
         operation.done.side_effect = [False, True]
         operation.result.return_value = SimpleNamespace(
-            cluster_name='dataproc-cluster-1',
+            cluster_name='dataproc-cluster-5',
             cluster_uuid='12345',
         )
         worker = luigi.worker.Worker()
@@ -136,6 +134,6 @@
         mock_logger.info.assert_has_calls(
             [
                 call('Waiting for cluster spinup'),
-                call('Created cluster dataproc-cluster-1 with cluster uuid: 12345'),
+                call('Created cluster dataproc-cluster-5 with cluster uuid: 12345'),
             ],
         )
v03_pipeline/lib/tasks/dataproc/misc.py

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+import re
+
+import luigi
+
+from v03_pipeline.lib.model import ReferenceGenome
+
+CLUSTER_NAME_PREFIX = 'pipeline-runner'
+
+
+def get_cluster_name(reference_genome: ReferenceGenome, run_id: str):
+    return f'{CLUSTER_NAME_PREFIX}-{reference_genome.value.lower()}-{run_id}'
+
+
+def snake_to_kebab_arg(snake_string: str) -> str:
+    return '--' + re.sub(r'\_', '-', snake_string).lower()
+
+
+def to_kebab_str_args(task: luigi.Task):
+    return [
+        e for k, v in task.to_str_params().items() for e in (snake_to_kebab_arg(k), v)
+    ]
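
To illustrate what these helpers produce, the expected values below are inferred from the implementations above and are consistent with the test file later in this commit:

from v03_pipeline.lib.model import ReferenceGenome
from v03_pipeline.lib.tasks.dataproc.misc import get_cluster_name, snake_to_kebab_arg

# Assumes ReferenceGenome.GRCh38.value == 'GRCh38', as the test below suggests.
assert get_cluster_name(ReferenceGenome.GRCh38, 'run-1') == 'pipeline-runner-grch38-run-1'
assert snake_to_kebab_arg('skip_validation') == '--skip-validation'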
v03_pipeline/lib/tasks/dataproc/misc_test.py

Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
+import unittest
+from unittest.mock import Mock, patch
+
+from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
+from v03_pipeline.lib.tasks.dataproc.misc import to_kebab_str_args
+from v03_pipeline.lib.tasks.dataproc.write_success_file_on_dataproc import (
+    WriteSuccessFileOnDataprocTask,
+)
+
+
+@patch(
+    'v03_pipeline.lib.tasks.dataproc.base_run_job_on_dataproc.dataproc.JobControllerClient',
+)
+class MiscTest(unittest.TestCase):
+    def test_to_kebab_str_args(self, _: Mock):
+        t = WriteSuccessFileOnDataprocTask(
+            reference_genome=ReferenceGenome.GRCh38,
+            dataset_type=DatasetType.SNV_INDEL,
+            sample_type=SampleType.WGS,
+            callset_path='test_callset',
+            project_guids=['R0113_test_project'],
+            project_remap_paths=['test_remap'],
+            project_pedigree_paths=['test_pedigree'],
+            run_id='a_misc_run',
+        )
+        self.assertListEqual(
+            to_kebab_str_args(t),
+            [
+                '--reference-genome',
+                'GRCh38',
+                '--dataset-type',
+                'SNV_INDEL',
+                '--run-id',
+                'a_misc_run',
+                '--sample-type',
+                'WGS',
+                '--callset-path',
+                'test_callset',
+                '--project-guids',
+                '["R0113_test_project"]',
+                '--project-remap-paths',
+                '["test_remap"]',
+                '--project-pedigree-paths',
+                '["test_pedigree"]',
+                '--ignore-missing-samples-when-remapping',
+                'False',
+                '--skip-check-sex-and-relatedness',
+                'False',
+                '--skip-expect-filters',
+                'False',
+                '--skip-expect-tdr-metrics',
+                'False',
+                '--skip-validation',
+                'False',
+                '--is-new-gcnv-joint-call',
+                'False',
+            ],
+        )
v03_pipeline/lib/tasks/dataproc/write_success_file_on_dataproc.py

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+import luigi
+
+from v03_pipeline.lib.paths import pipeline_run_success_file_path
+from v03_pipeline.lib.tasks.base.base_loading_run_params import (
+    BaseLoadingRunParams,
+)
+from v03_pipeline.lib.tasks.dataproc.base_run_job_on_dataproc import (
+    BaseRunJobOnDataprocTask,
+)
+from v03_pipeline.lib.tasks.files import GCSorLocalTarget
+
+
+@luigi.util.inherits(BaseLoadingRunParams)
+class WriteSuccessFileOnDataprocTask(BaseRunJobOnDataprocTask):
+    def output(self) -> luigi.Target:
+        return GCSorLocalTarget(
+            pipeline_run_success_file_path(
+                self.reference_genome,
+                self.dataset_type,
+                self.run_id,
+            ),
+        )
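
A sketch of launching this task from a driver process (parameter values are the placeholders used in the test above; the pipeline's real entry point is outside this diff):

import luigi

from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
from v03_pipeline.lib.tasks.dataproc.write_success_file_on_dataproc import (
    WriteSuccessFileOnDataprocTask,
)

luigi.build(
    [
        WriteSuccessFileOnDataprocTask(
            reference_genome=ReferenceGenome.GRCh38,
            dataset_type=DatasetType.SNV_INDEL,
            sample_type=SampleType.WGS,
            callset_path='test_callset',  # placeholder
            project_guids=['R0113_test_project'],  # placeholder
            project_remap_paths=['test_remap'],  # placeholder
            project_pedigree_paths=['test_pedigree'],  # placeholder
            run_id='a_misc_run',  # placeholder
        ),
    ],
    local_scheduler=True,
)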
