Skip to content

Create dataproc cluster task (#933) #954

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions requirements.in
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
elasticsearch==7.9.1
google-api-python-client>=1.8.0
hail==0.2.132
luigi>=3.4.0
luigi==3.5.2
gnomad==0.6.4
aiofiles==24.1.0
pydantic==2.8.2
google-cloud-dataproc==5.14.0
65 changes: 33 additions & 32 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile requirements.in
# pip-compile --resolver=backtracking requirements.in
#
aiodns==2.0.0
# via hail
Expand All @@ -28,25 +28,25 @@ avro==1.11.3
# via hail
azure-common==1.1.28
# via azure-mgmt-storage
azure-core==1.31.0
azure-core==1.32.0
# via
# azure-identity
# azure-mgmt-core
# azure-storage-blob
# msrest
azure-identity==1.19.0
# via hail
azure-mgmt-core==1.4.0
azure-mgmt-core==1.5.0
# via azure-mgmt-storage
azure-mgmt-storage==20.1.0
# via hail
azure-storage-blob==12.23.1
# via hail
bokeh==3.3.4
# via hail
boto3==1.35.48
boto3==1.35.53
# via hail
botocore==1.35.48
botocore==1.35.53
# via
# boto3
# hail
Expand All @@ -55,7 +55,6 @@ cachetools==5.5.0
# via google-auth
certifi==2024.8.30
# via
# elasticsearch
# msrest
# requests
cffi==1.17.1
Expand Down Expand Up @@ -86,10 +85,6 @@ deprecated==1.2.14
# via hail
dill==0.3.9
# via hail
docutils==0.21.2
# via python-daemon
elasticsearch==7.9.1
# via -r requirements.in
exceptiongroup==1.2.2
# via ipython
executing==2.1.0
Expand All @@ -101,38 +96,44 @@ frozenlist==1.5.0
# hail
gnomad==0.6.4
# via -r requirements.in
google-api-core==2.21.0
# via google-api-python-client
google-api-python-client==2.149.0
# via -r requirements.in
google-api-core[grpc]==2.22.0
# via google-cloud-dataproc
google-auth==2.35.0
# via
# google-api-core
# google-api-python-client
# google-auth-httplib2
# google-auth-oauthlib
# google-cloud-dataproc
# hail
google-auth-httplib2==0.2.0
# via google-api-python-client
google-auth-oauthlib==0.8.0
# via hail
googleapis-common-protos==1.65.0
google-cloud-dataproc==5.14.0
# via -r requirements.in
googleapis-common-protos[grpc]==1.65.0
# via
# google-api-core
# grpc-google-iam-v1
# grpcio-status
grpc-google-iam-v1==0.13.1
# via google-cloud-dataproc
grpcio==1.67.1
# via
# google-api-core
# googleapis-common-protos
# grpc-google-iam-v1
# grpcio-status
grpcio-status==1.48.2
# via google-api-core
hail==0.2.132
# via -r requirements.in
hdbscan==0.8.39
# via gnomad
httplib2==0.22.0
# via
# google-api-python-client
# google-auth-httplib2
humanize==1.1.0
# via hail
idna==3.10
# via
# requests
# yarl
ipython==8.28.0
ipython==8.29.0
# via ipywidgets
ipywidgets==8.1.5
# via gnomad
Expand Down Expand Up @@ -218,11 +219,16 @@ prompt-toolkit==3.0.48
propcache==0.2.0
# via yarl
proto-plus==1.25.0
# via google-api-core
# via
# google-api-core
# google-cloud-dataproc
protobuf==3.20.2
# via
# google-api-core
# google-cloud-dataproc
# googleapis-common-protos
# grpc-google-iam-v1
# grpcio-status
# hail
# proto-plus
ptyprocess==0.7.0
Expand Down Expand Up @@ -251,11 +257,9 @@ pygments==2.18.0
# rich
pyjwt[crypto]==2.9.0
# via msal
pyparsing==3.2.0
# via httplib2
pyspark==3.5.3
# via hail
python-daemon==3.0.1
python-daemon==3.1.0
# via luigi
python-dateutil==2.9.0.post0
# via
Expand Down Expand Up @@ -348,12 +352,9 @@ typing-extensions==4.12.2
# typer
tzdata==2024.2
# via pandas
uritemplate==4.1.1
# via google-api-python-client
urllib3==2.2.3
# via
# botocore
# elasticsearch
# requests
uvloop==0.21.0
# via hail
Expand All @@ -365,7 +366,7 @@ wrapt==1.16.0
# via deprecated
xyzservices==2024.9.0
# via bokeh
yarl==1.16.0
yarl==1.17.1
# via aiohttp

# The following packages are considered to be unsafe in a requirements file:
Expand Down
6 changes: 3 additions & 3 deletions v03_pipeline/bin/dataproc_vep_init.bash
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
set -x

export PROJECT="$(gcloud config get-value project)"
export ENVIRONMENT="$(/usr/share/google/get_metadata_value attributes/ENVIRONMENT)"
export DEPLOYMENT_TYPE="$(/usr/share/google/get_metadata_value attributes/DEPLOYMENT_TYPE)"
export REFERENCE_GENOME="$(/usr/share/google/get_metadata_value attributes/REFERENCE_GENOME)"
export PIPELINE_RUNNER_APP_VERSION="$(/usr/share/google/get_metadata_value attributes/PIPELINE_RUNNER_APP_VERSION)"

Expand Down Expand Up @@ -53,10 +53,10 @@ EOF
gcc -Wall -Werror -O2 /vep.c -o /vep
chmod u+s /vep

gcloud storage cp gs://seqr-pipeline-runner-builds/$ENVIRONMENT/$PIPELINE_RUNNER_APP_VERSION/bin/download_vep_reference_data.bash /download_vep_reference_data.bash
gcloud storage cp gs://seqr-pipeline-runner-builds/$DEPLOYMENT_TYPE/$PIPELINE_RUNNER_APP_VERSION/bin/download_vep_reference_data.bash /download_vep_reference_data.bash
chmod +x /download_vep_reference_data.bash
./download_vep_reference_data.bash $REFERENCE_GENOME

gcloud storage cp gs://seqr-pipeline-runner-builds/$ENVIRONMENT/$PIPELINE_RUNNER_APP_VERSION/bin/vep /vep.bash
gcloud storage cp gs://seqr-pipeline-runner-builds/$DEPLOYMENT_TYPE/$PIPELINE_RUNNER_APP_VERSION/bin/vep /vep.bash
chmod +x /vep.bash

4 changes: 4 additions & 0 deletions v03_pipeline/lib/model/dataset_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,10 @@ def lookup_table_annotation_fns(self) -> list[Callable[..., hl.Expression]]:
def should_send_to_allele_registry(self):
return self == DatasetType.SNV_INDEL

@property
def requires_dataproc(self):
    """Return True only when this dataset type is ``DatasetType.SNV_INDEL``."""
    is_snv_indel = self == DatasetType.SNV_INDEL
    return is_snv_indel

@property
def should_export_to_vcf(self):
    """Return True only when this dataset type is ``DatasetType.SV``."""
    is_sv = self == DatasetType.SV
    return is_sv
Expand Down
15 changes: 15 additions & 0 deletions v03_pipeline/lib/model/environment.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
from dataclasses import dataclass
from typing import Literal

# NB: using os.environ.get inside the dataclass defaults gives a lint error.
GRCH37_TO_GRCH38_LIFTOVER_REF_PATH = os.environ.get(
Expand Down Expand Up @@ -41,6 +42,14 @@
'CLINGEN_ALLELE_REGISTRY_PASSWORD',
'',
)
# Deployment flavour, read from the DEPLOYMENT_TYPE environment variable;
# falls back to 'prod' when the variable is unset.
DEPLOYMENT_TYPE = os.environ.get('DEPLOYMENT_TYPE', 'prod')
# Number of Dataproc secondary workers. The raw environment string is parsed
# with int() at import time (default '5'), so a non-integer value raises
# ValueError when this module is imported.
GCLOUD_DATAPROC_SECONDARY_WORKERS = int(
    os.environ.get('GCLOUD_DATAPROC_SECONDARY_WORKERS', '5'),
)
# Google Cloud project / zone / region; no default is supplied, so each of
# these is None when its environment variable is unset.
GCLOUD_PROJECT = os.environ.get('GCLOUD_PROJECT')
GCLOUD_ZONE = os.environ.get('GCLOUD_ZONE')
GCLOUD_REGION = os.environ.get('GCLOUD_REGION')
# Pipeline runner application version; falls back to 'latest' when unset.
PIPELINE_RUNNER_APP_VERSION = os.environ.get('PIPELINE_RUNNER_APP_VERSION', 'latest')

# Feature Flags
ACCESS_PRIVATE_REFERENCE_DATASETS = (
Expand All @@ -62,7 +71,12 @@ class Env:
CHECK_SEX_AND_RELATEDNESS: bool = CHECK_SEX_AND_RELATEDNESS
CLINGEN_ALLELE_REGISTRY_LOGIN: str | None = CLINGEN_ALLELE_REGISTRY_LOGIN
CLINGEN_ALLELE_REGISTRY_PASSWORD: str | None = CLINGEN_ALLELE_REGISTRY_PASSWORD
DEPLOYMENT_TYPE: Literal['dev', 'prod'] = DEPLOYMENT_TYPE
EXPECT_WES_FILTERS: bool = EXPECT_WES_FILTERS
# The module-level constant is built with int(...), so this field holds an
# integer; the annotation is `int` to match the actual default value.
GCLOUD_DATAPROC_SECONDARY_WORKERS: int = GCLOUD_DATAPROC_SECONDARY_WORKERS
GCLOUD_PROJECT: str | None = GCLOUD_PROJECT
GCLOUD_ZONE: str | None = GCLOUD_ZONE
GCLOUD_REGION: str | None = GCLOUD_REGION
GRCH37_TO_GRCH38_LIFTOVER_REF_PATH: str = GRCH37_TO_GRCH38_LIFTOVER_REF_PATH
GRCH38_TO_GRCH37_LIFTOVER_REF_PATH: str = GRCH38_TO_GRCH37_LIFTOVER_REF_PATH
HAIL_BACKEND_SERVICE_HOSTNAME: str | None = HAIL_BACKEND_SERVICE_HOSTNAME
Expand All @@ -71,6 +85,7 @@ class Env:
HAIL_SEARCH_DATA_DIR: str = HAIL_SEARCH_DATA_DIR
INCLUDE_PIPELINE_VERSION_IN_PREFIX: bool = INCLUDE_PIPELINE_VERSION_IN_PREFIX
LOADING_DATASETS_DIR: str = LOADING_DATASETS_DIR
PIPELINE_RUNNER_APP_VERSION: str = PIPELINE_RUNNER_APP_VERSION
PRIVATE_REFERENCE_DATASETS_DIR: str = PRIVATE_REFERENCE_DATASETS_DIR
REFERENCE_DATASETS_DIR: str = REFERENCE_DATASETS_DIR
SHOULD_TRIGGER_HAIL_BACKEND_RELOAD: bool = SHOULD_TRIGGER_HAIL_BACKEND_RELOAD
Expand Down
Empty file.
Loading
Loading