Skip to content

Commit 03551af

Browse files
authored
Create dataproc cluster task (#933) (#954)
* Support gcs dirs in rsync
* ws
* Add create dataproc cluster task
* add dataproc
* ruff
* requirements
* still struggling
* Gencode refactor to remove gcs
* bump reqs
1 parent 199184a commit 03551af

File tree

8 files changed

+350
-38
lines changed

8 files changed

+350
-38
lines changed

requirements.in

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
1-
elasticsearch==7.9.1
2-
google-api-python-client>=1.8.0
31
hail==0.2.132
4-
luigi>=3.4.0
2+
luigi==3.5.2
53
gnomad==0.6.4
64
aiofiles==24.1.0
75
pydantic==2.8.2
6+
google-cloud-dataproc==5.14.0

requirements.txt

Lines changed: 33 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
# This file is autogenerated by pip-compile with Python 3.10
33
# by the following command:
44
#
5-
# pip-compile requirements.in
5+
# pip-compile --resolver=backtracking requirements.in
66
#
77
aiodns==2.0.0
88
# via hail
@@ -28,25 +28,25 @@ avro==1.11.3
2828
# via hail
2929
azure-common==1.1.28
3030
# via azure-mgmt-storage
31-
azure-core==1.31.0
31+
azure-core==1.32.0
3232
# via
3333
# azure-identity
3434
# azure-mgmt-core
3535
# azure-storage-blob
3636
# msrest
3737
azure-identity==1.19.0
3838
# via hail
39-
azure-mgmt-core==1.4.0
39+
azure-mgmt-core==1.5.0
4040
# via azure-mgmt-storage
4141
azure-mgmt-storage==20.1.0
4242
# via hail
4343
azure-storage-blob==12.23.1
4444
# via hail
4545
bokeh==3.3.4
4646
# via hail
47-
boto3==1.35.48
47+
boto3==1.35.53
4848
# via hail
49-
botocore==1.35.48
49+
botocore==1.35.53
5050
# via
5151
# boto3
5252
# hail
@@ -55,7 +55,6 @@ cachetools==5.5.0
5555
# via google-auth
5656
certifi==2024.8.30
5757
# via
58-
# elasticsearch
5958
# msrest
6059
# requests
6160
cffi==1.17.1
@@ -86,10 +85,6 @@ deprecated==1.2.14
8685
# via hail
8786
dill==0.3.9
8887
# via hail
89-
docutils==0.21.2
90-
# via python-daemon
91-
elasticsearch==7.9.1
92-
# via -r requirements.in
9388
exceptiongroup==1.2.2
9489
# via ipython
9590
executing==2.1.0
@@ -101,38 +96,44 @@ frozenlist==1.5.0
10196
# hail
10297
gnomad==0.6.4
10398
# via -r requirements.in
104-
google-api-core==2.21.0
105-
# via google-api-python-client
106-
google-api-python-client==2.149.0
107-
# via -r requirements.in
99+
google-api-core[grpc]==2.22.0
100+
# via google-cloud-dataproc
108101
google-auth==2.35.0
109102
# via
110103
# google-api-core
111-
# google-api-python-client
112-
# google-auth-httplib2
113104
# google-auth-oauthlib
105+
# google-cloud-dataproc
114106
# hail
115-
google-auth-httplib2==0.2.0
116-
# via google-api-python-client
117107
google-auth-oauthlib==0.8.0
118108
# via hail
119-
googleapis-common-protos==1.65.0
109+
google-cloud-dataproc==5.14.0
110+
# via -r requirements.in
111+
googleapis-common-protos[grpc]==1.65.0
112+
# via
113+
# google-api-core
114+
# grpc-google-iam-v1
115+
# grpcio-status
116+
grpc-google-iam-v1==0.13.1
117+
# via google-cloud-dataproc
118+
grpcio==1.67.1
119+
# via
120+
# google-api-core
121+
# googleapis-common-protos
122+
# grpc-google-iam-v1
123+
# grpcio-status
124+
grpcio-status==1.48.2
120125
# via google-api-core
121126
hail==0.2.132
122127
# via -r requirements.in
123128
hdbscan==0.8.39
124129
# via gnomad
125-
httplib2==0.22.0
126-
# via
127-
# google-api-python-client
128-
# google-auth-httplib2
129130
humanize==1.1.0
130131
# via hail
131132
idna==3.10
132133
# via
133134
# requests
134135
# yarl
135-
ipython==8.28.0
136+
ipython==8.29.0
136137
# via ipywidgets
137138
ipywidgets==8.1.5
138139
# via gnomad
@@ -218,11 +219,16 @@ prompt-toolkit==3.0.48
218219
propcache==0.2.0
219220
# via yarl
220221
proto-plus==1.25.0
221-
# via google-api-core
222+
# via
223+
# google-api-core
224+
# google-cloud-dataproc
222225
protobuf==3.20.2
223226
# via
224227
# google-api-core
228+
# google-cloud-dataproc
225229
# googleapis-common-protos
230+
# grpc-google-iam-v1
231+
# grpcio-status
226232
# hail
227233
# proto-plus
228234
ptyprocess==0.7.0
@@ -251,11 +257,9 @@ pygments==2.18.0
251257
# rich
252258
pyjwt[crypto]==2.9.0
253259
# via msal
254-
pyparsing==3.2.0
255-
# via httplib2
256260
pyspark==3.5.3
257261
# via hail
258-
python-daemon==3.0.1
262+
python-daemon==3.1.0
259263
# via luigi
260264
python-dateutil==2.9.0.post0
261265
# via
@@ -348,12 +352,9 @@ typing-extensions==4.12.2
348352
# typer
349353
tzdata==2024.2
350354
# via pandas
351-
uritemplate==4.1.1
352-
# via google-api-python-client
353355
urllib3==2.2.3
354356
# via
355357
# botocore
356-
# elasticsearch
357358
# requests
358359
uvloop==0.21.0
359360
# via hail
@@ -365,7 +366,7 @@ wrapt==1.16.0
365366
# via deprecated
366367
xyzservices==2024.9.0
367368
# via bokeh
368-
yarl==1.16.0
369+
yarl==1.17.1
369370
# via aiohttp
370371

371372
# The following packages are considered to be unsafe in a requirements file:

v03_pipeline/bin/dataproc_vep_init.bash

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@
1414
set -x
1515

1616
export PROJECT="$(gcloud config get-value project)"
17-
export ENVIRONMENT="$(/usr/share/google/get_metadata_value attributes/ENVIRONMENT)"
17+
export DEPLOYMENT_TYPE="$(/usr/share/google/get_metadata_value attributes/DEPLOYMENT_TYPE)"
1818
export REFERENCE_GENOME="$(/usr/share/google/get_metadata_value attributes/REFERENCE_GENOME)"
1919
export PIPELINE_RUNNER_APP_VERSION="$(/usr/share/google/get_metadata_value attributes/PIPELINE_RUNNER_APP_VERSION)"
2020

@@ -53,10 +53,10 @@ EOF
5353
gcc -Wall -Werror -O2 /vep.c -o /vep
5454
chmod u+s /vep
5555

56-
gcloud storage cp gs://seqr-pipeline-runner-builds/$ENVIRONMENT/$PIPELINE_RUNNER_APP_VERSION/bin/download_vep_reference_data.bash /download_vep_reference_data.bash
56+
gcloud storage cp gs://seqr-pipeline-runner-builds/$DEPLOYMENT_TYPE/$PIPELINE_RUNNER_APP_VERSION/bin/download_vep_reference_data.bash /download_vep_reference_data.bash
5757
chmod +x /download_vep_reference_data.bash
5858
./download_vep_reference_data.bash $REFERENCE_GENOME
5959

60-
gcloud storage cp gs://seqr-pipeline-runner-builds/$ENVIRONMENT/$PIPELINE_RUNNER_APP_VERSION/bin/vep /vep.bash
60+
gcloud storage cp gs://seqr-pipeline-runner-builds/$DEPLOYMENT_TYPE/$PIPELINE_RUNNER_APP_VERSION/bin/vep /vep.bash
6161
chmod +x /vep.bash
6262

v03_pipeline/lib/model/dataset_type.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -361,6 +361,10 @@ def lookup_table_annotation_fns(self) -> list[Callable[..., hl.Expression]]:
361361
def should_send_to_allele_registry(self):
362362
return self == DatasetType.SNV_INDEL
363363

364+
@property
365+
def requires_dataproc(self):
366+
return self == DatasetType.SNV_INDEL
367+
364368
@property
365369
def should_export_to_vcf(self):
366370
return self == DatasetType.SV

v03_pipeline/lib/model/environment.py

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import os
22
from dataclasses import dataclass
3+
from typing import Literal
34

45
# NB: using os.environ.get inside the dataclass defaults gives a lint error.
56
GRCH37_TO_GRCH38_LIFTOVER_REF_PATH = os.environ.get(
@@ -41,6 +42,14 @@
4142
'CLINGEN_ALLELE_REGISTRY_PASSWORD',
4243
'',
4344
)
45+
DEPLOYMENT_TYPE = os.environ.get('DEPLOYMENT_TYPE', 'prod')
46+
GCLOUD_DATAPROC_SECONDARY_WORKERS = int(
47+
os.environ.get('GCLOUD_DATAPROC_SECONDARY_WORKERS', '5'),
48+
)
49+
GCLOUD_PROJECT = os.environ.get('GCLOUD_PROJECT')
50+
GCLOUD_ZONE = os.environ.get('GCLOUD_ZONE')
51+
GCLOUD_REGION = os.environ.get('GCLOUD_REGION')
52+
PIPELINE_RUNNER_APP_VERSION = os.environ.get('PIPELINE_RUNNER_APP_VERSION', 'latest')
4453

4554
# Feature Flags
4655
ACCESS_PRIVATE_REFERENCE_DATASETS = (
@@ -62,7 +71,12 @@ class Env:
6271
CHECK_SEX_AND_RELATEDNESS: bool = CHECK_SEX_AND_RELATEDNESS
6372
CLINGEN_ALLELE_REGISTRY_LOGIN: str | None = CLINGEN_ALLELE_REGISTRY_LOGIN
6473
CLINGEN_ALLELE_REGISTRY_PASSWORD: str | None = CLINGEN_ALLELE_REGISTRY_PASSWORD
74+
DEPLOYMENT_TYPE: Literal['dev', 'prod'] = DEPLOYMENT_TYPE
6575
EXPECT_WES_FILTERS: bool = EXPECT_WES_FILTERS
76+
GCLOUD_DATAPROC_SECONDARY_WORKERS: str = GCLOUD_DATAPROC_SECONDARY_WORKERS
77+
GCLOUD_PROJECT: str | None = GCLOUD_PROJECT
78+
GCLOUD_ZONE: str | None = GCLOUD_ZONE
79+
GCLOUD_REGION: str | None = GCLOUD_REGION
6680
GRCH37_TO_GRCH38_LIFTOVER_REF_PATH: str = GRCH37_TO_GRCH38_LIFTOVER_REF_PATH
6781
GRCH38_TO_GRCH37_LIFTOVER_REF_PATH: str = GRCH38_TO_GRCH37_LIFTOVER_REF_PATH
6882
HAIL_BACKEND_SERVICE_HOSTNAME: str | None = HAIL_BACKEND_SERVICE_HOSTNAME
@@ -71,6 +85,7 @@ class Env:
7185
HAIL_SEARCH_DATA_DIR: str = HAIL_SEARCH_DATA_DIR
7286
INCLUDE_PIPELINE_VERSION_IN_PREFIX: bool = INCLUDE_PIPELINE_VERSION_IN_PREFIX
7387
LOADING_DATASETS_DIR: str = LOADING_DATASETS_DIR
88+
PIPELINE_RUNNER_APP_VERSION: str = PIPELINE_RUNNER_APP_VERSION
7489
PRIVATE_REFERENCE_DATASETS_DIR: str = PRIVATE_REFERENCE_DATASETS_DIR
7590
REFERENCE_DATASETS_DIR: str = REFERENCE_DATASETS_DIR
7691
SHOULD_TRIGGER_HAIL_BACKEND_RELOAD: bool = SHOULD_TRIGGER_HAIL_BACKEND_RELOAD

v03_pipeline/lib/tasks/dataproc/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)