feat: Add rsync task between hail search data/reference data stored in gcs and on local disk. (#1006)

bpblanken · web-flow · commit db64b2d12719 · 2025-01-10T14:02:57.000-05:00
* Support gcs dirs in rsync

* ws

* Add create dataproc cluster task

* add dataproc

* ruff

* requirements

* still struggling

* Gencode refactor to remove gcs

* bump reqs

* Run dataproc job

* lib

* running

* merge requirements

* Flip'em

* Better exception handling

* Cleaner approach if less generalizable

* write a test

* Fix tests

* lint

* Add test for success

* refactor to use a base class... better for adding support for multiple jobs

* cleanup

* ruff

* Fix missing mock

* Fix flapping test

* first commit

* Finish test and cleanup

* Allow any order
diff --git a/v03_pipeline/lib/model/environment.py b/v03_pipeline/lib/model/environment.py
@@ -50,6 +50,8 @@
 GCLOUD_ZONE = os.environ.get('GCLOUD_ZONE')
 GCLOUD_REGION = os.environ.get('GCLOUD_REGION')
 PIPELINE_RUNNER_APP_VERSION = os.environ.get('PIPELINE_RUNNER_APP_VERSION', 'latest')
+SEQR_APP_HAIL_SEARCH_DATA_DIR = os.environ.get('SEQR_APP_HAIL_SEARCH_DATA_DIR')
+SEQR_APP_REFERENCE_DATASETS_DIR = os.environ.get('SEQR_APP_REFERENCE_DATASETS_DIR')
 
 
 @dataclass
@@ -71,4 +73,6 @@ class Env:
     PIPELINE_RUNNER_APP_VERSION: str = PIPELINE_RUNNER_APP_VERSION
     PRIVATE_REFERENCE_DATASETS_DIR: str = PRIVATE_REFERENCE_DATASETS_DIR
     REFERENCE_DATASETS_DIR: str = REFERENCE_DATASETS_DIR
+    SEQR_APP_HAIL_SEARCH_DATA_DIR: str | None = SEQR_APP_HAIL_SEARCH_DATA_DIR
+    SEQR_APP_REFERENCE_DATASETS_DIR: str | None = SEQR_APP_REFERENCE_DATASETS_DIR
     VEP_REFERENCE_DATASETS_DIR: str = VEP_REFERENCE_DATASETS_DIR
diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py
@@ -17,7 +17,7 @@
 )
 
 
-def _pipeline_prefix(
+def pipeline_prefix(
     root: str,
     reference_genome: ReferenceGenome,
     dataset_type: DatasetType,
@@ -36,38 +36,15 @@ def _pipeline_prefix(
     )
 
 
-def _v03_reference_data_prefix(
-    access_control: AccessControl,
-    reference_genome: ReferenceGenome,
-    dataset_type: DatasetType,
-) -> str:
-    root = (
-        Env.PRIVATE_REFERENCE_DATASETS_DIR
-        if access_control == AccessControl.PRIVATE
-        else Env.REFERENCE_DATASETS_DIR
-    )
-    if FeatureFlag.INCLUDE_PIPELINE_VERSION_IN_PREFIX:
-        return os.path.join(
-            root,
-            PipelineVersion.V03.value,
-            reference_genome.value,
-            dataset_type.value,
-        )
-    return os.path.join(
-        root,
-        reference_genome.value,
-        dataset_type.value,
-    )
-
-
 def _v03_reference_dataset_prefix(
+    root: str,
     access_control: AccessControl,
     reference_genome: ReferenceGenome,
 ) -> str:
     root = (
         Env.PRIVATE_REFERENCE_DATASETS_DIR
         if access_control == AccessControl.PRIVATE
-        else Env.REFERENCE_DATASETS_DIR
+        else root
     )
     if FeatureFlag.INCLUDE_PIPELINE_VERSION_IN_PREFIX:
         return os.path.join(
@@ -88,7 +65,7 @@ def family_table_path(
     family_guid: str,
 ) -> str:
     return os.path.join(
-        _pipeline_prefix(
+        pipeline_prefix(
             Env.HAIL_SEARCH_DATA_DIR,
             reference_genome,
             dataset_type,
@@ -104,7 +81,7 @@ def tdr_metrics_dir(
     dataset_type: DatasetType,
 ) -> str:
     return os.path.join(
-        _pipeline_prefix(
+        pipeline_prefix(
             Env.LOADING_DATASETS_DIR,
             reference_genome,
             dataset_type,
@@ -130,7 +107,7 @@ def imported_callset_path(
     callset_path: str,
 ) -> str:
     return os.path.join(
-        _pipeline_prefix(
+        pipeline_prefix(
             Env.LOADING_DATASETS_DIR,
             reference_genome,
             dataset_type,
@@ -177,7 +154,7 @@ def project_table_path(
     project_guid: str,
 ) -> str:
     return os.path.join(
-        _pipeline_prefix(
+        pipeline_prefix(
             Env.HAIL_SEARCH_DATA_DIR,
             reference_genome,
             dataset_type,
@@ -194,7 +171,7 @@ def relatedness_check_table_path(
     callset_path: str,
 ) -> str:
     return os.path.join(
-        _pipeline_prefix(
+        pipeline_prefix(
             Env.LOADING_DATASETS_DIR,
             reference_genome,
             dataset_type,
@@ -210,7 +187,7 @@ def relatedness_check_tsv_path(
     callset_path: str,
 ) -> str:
     return os.path.join(
-        _pipeline_prefix(
+        pipeline_prefix(
             Env.LOADING_DATASETS_DIR,
             reference_genome,
             dataset_type,
@@ -227,7 +204,7 @@ def remapped_and_subsetted_callset_path(
     project_guid: str,
 ) -> str:
     return os.path.join(
-        _pipeline_prefix(
+        pipeline_prefix(
             Env.LOADING_DATASETS_DIR,
             reference_genome,
             dataset_type,
@@ -243,7 +220,7 @@ def lookup_table_path(
     dataset_type: DatasetType,
 ) -> str:
     return os.path.join(
-        _pipeline_prefix(
+        pipeline_prefix(
             Env.HAIL_SEARCH_DATA_DIR,
             reference_genome,
             dataset_type,
@@ -257,7 +234,7 @@ def runs_path(
     dataset_type: DatasetType,
 ) -> str:
     return os.path.join(
-        _pipeline_prefix(
+        pipeline_prefix(
             Env.HAIL_SEARCH_DATA_DIR,
             reference_genome,
             dataset_type,
@@ -272,7 +249,7 @@ def sex_check_table_path(
     callset_path: str,
 ) -> str:
     return os.path.join(
-        _pipeline_prefix(
+        pipeline_prefix(
             Env.LOADING_DATASETS_DIR,
             reference_genome,
             dataset_type,
@@ -306,6 +283,7 @@ def valid_reference_dataset_path(
 ) -> str | None:
     return os.path.join(
         _v03_reference_dataset_prefix(
+            Env.REFERENCE_DATASETS_DIR,
             reference_dataset.access_control,
             reference_genome,
         ),
@@ -318,9 +296,13 @@ def valid_reference_dataset_query_path(
     reference_genome: ReferenceGenome,
     dataset_type: DatasetType,
     reference_dataset_query: ReferenceDatasetQuery,
+    root=None,
 ) -> str | None:
+    if not root:
+        root = Env.REFERENCE_DATASETS_DIR
     return os.path.join(
         _v03_reference_dataset_prefix(
+            root,
             reference_dataset_query.access_control,
             reference_genome,
         ),
@@ -334,7 +316,7 @@ def variant_annotations_table_path(
     dataset_type: DatasetType,
 ) -> str:
     return os.path.join(
-        _pipeline_prefix(
+        pipeline_prefix(
             Env.HAIL_SEARCH_DATA_DIR,
             reference_genome,
             dataset_type,
@@ -348,7 +330,7 @@ def variant_annotations_vcf_path(
     dataset_type: DatasetType,
 ) -> str:
     return os.path.join(
-        _pipeline_prefix(
+        pipeline_prefix(
             Env.HAIL_SEARCH_DATA_DIR,
             reference_genome,
             dataset_type,
@@ -386,7 +368,7 @@ def project_remap_path(
     project_guid: str,
 ) -> str:
     return os.path.join(
-        _pipeline_prefix(
+        pipeline_prefix(
             Env.LOADING_DATASETS_DIR,
             reference_genome,
             dataset_type,
@@ -404,7 +386,7 @@ def project_pedigree_path(
     project_guid: str,
 ) -> str:
     return os.path.join(
-        _pipeline_prefix(
+        pipeline_prefix(
             Env.LOADING_DATASETS_DIR,
             reference_genome,
             dataset_type,
diff --git a/v03_pipeline/lib/reference_datasets/reference_dataset.py b/v03_pipeline/lib/reference_datasets/reference_dataset.py
@@ -166,6 +166,21 @@ class ReferenceDatasetQuery(BaseReferenceDataset, str, Enum):
     clinvar_path_variants = 'clinvar_path_variants'
     high_af_variants = 'high_af_variants'
 
+    @classmethod
+    def for_reference_genome_dataset_type(
+        cls,
+        reference_genome: ReferenceGenome,
+        dataset_type: DatasetType,
+    ) -> set['ReferenceDatasetQuery']:
+        """Return the inherited lookup's results that are instances of this enum."""
+        # Anything the inherited lookup yields that is not an instance of `cls`
+        # is dropped from the returned set.
+        candidates = super().for_reference_genome_dataset_type(
+            reference_genome,
+            dataset_type,
+        )
+        return {member for member in candidates if isinstance(member, cls)}
+
     @property
     def requires(self) -> ReferenceDataset:
         return {
diff --git a/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs.py b/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs.py
@@ -0,0 +1,95 @@
+import os
+import subprocess
+
+import luigi
+
+from v03_pipeline.lib.model import Env
+from v03_pipeline.lib.paths import pipeline_prefix, valid_reference_dataset_query_path
+from v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDatasetQuery
+from v03_pipeline.lib.tasks.base.base_loading_run_params import (
+    BaseLoadingRunParams,
+)
+
+
+def hail_search_value(value: str) -> str:
+    """Rewrite 'SV' to 'SV_WGS', then 'GCNV' to 'SV_WES', throughout `value`."""
+    # NOTE: plain substring replacement; every occurrence in `value` is rewritten.
+    renamed = value.replace('SV', 'SV_WGS')
+    return renamed.replace('GCNV', 'SV_WES')
+
+
+def rsync_command(src_path: str, dst_path: str) -> list[str]:
+    """Build the bash argv that rsyncs `src_path` into `dst_path` with gsutil."""
+    # Steps: mkdir the destination, `gsutil rsync -rd` excluding `.*runs.*`,
+    # then `sync` the destination.
+    script = f'mkdir -p {dst_path} && gsutil -qm rsync -rd -x .*runs.* {src_path} {dst_path} && sync {dst_path}'
+    return ['/bin/bash', '-cx', script]
+
+
+@luigi.util.inherits(BaseLoadingRunParams)
+class RsyncToSeqrAppDirsTask(luigi.Task):
+    """Rsync pipeline tables and reference dataset queries to the seqr app dirs.
+
+    Does nothing unless both `SEQR_APP_*` dirs are set. Completion is tracked
+    on the instance via `self.done`; the task has no output target.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.done = False
+
+    def output(self) -> None:
+        """Return no target; completion is reported by `complete()` instead."""
+        return None
+
+    def complete(self) -> bool:
+        """Return True once `run()` has finished on this instance."""
+        return self.done
+
+    def run(self) -> None:
+        """Run each rsync in turn; any failure fails the task.
+
+        Raises:
+            RuntimeError: a source dir is not a `gs://` path.
+            subprocess.CalledProcessError: an rsync command exited non-zero.
+        """
+        if not (
+            Env.SEQR_APP_HAIL_SEARCH_DATA_DIR and Env.SEQR_APP_REFERENCE_DATASETS_DIR
+        ):
+            # No destination configured: nothing to sync.
+            self.done = True
+            return
+
+        if not (
+            Env.HAIL_SEARCH_DATA_DIR.startswith('gs://')
+            and Env.REFERENCE_DATASETS_DIR.startswith('gs://')
+        ):
+            msg = 'Overridden HAIL_SEARCH_DATA_DIR and REFERENCE_DATASETS_DIR must be Google Cloud buckets.'
+            raise RuntimeError(msg)
+
+        genome, dataset_type = self.reference_genome, self.dataset_type
+
+        # Sync Pipeline Tables
+        src_path = pipeline_prefix(Env.HAIL_SEARCH_DATA_DIR, genome, dataset_type)
+        # Only the destination is renamed (SV -> SV_WGS, GCNV -> SV_WES).
+        dst_path = hail_search_value(
+            pipeline_prefix(Env.SEQR_APP_HAIL_SEARCH_DATA_DIR, genome, dataset_type),
+        )
+        # check_call, not call: a failed rsync must not leave the task marked done.
+        subprocess.check_call(rsync_command(src_path, dst_path))  # noqa: S603
+
+        # Sync RDQs
+        for query in ReferenceDatasetQuery.for_reference_genome_dataset_type(
+            genome,
+            dataset_type,
+        ):
+            src_path = valid_reference_dataset_query_path(genome, dataset_type, query)
+            app_path = valid_reference_dataset_query_path(
+                genome,
+                dataset_type,
+                query,
+                Env.SEQR_APP_REFERENCE_DATASETS_DIR,
+            )
+            dst_path = os.path.join(hail_search_value(app_path))
+            subprocess.check_call(rsync_command(src_path, dst_path))  # noqa: S603
+        self.done = True
diff --git a/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs_test.py b/v03_pipeline/lib/tasks/dataproc/rsync_to_seqr_app_dirs_test.py