Skip to content

Commit 2dd9316

Browse files
authored
Merge pull request #885 from broadinstitute/dev
Dev
2 parents 8f832ee + c4050a7 commit 2dd9316

27 files changed

+427
-60
lines changed

.github/workflows/dev-release.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,5 +45,4 @@ jobs:
4545
run: |-
4646
gcloud storage rm -r gs://seqr-luigi/releases/dev/latest/ || echo 'No latest release'
4747
gcloud storage cp v03_pipeline/bin/* gs://seqr-luigi/releases/dev/latest/bin/
48-
gcloud storage cp v03_pipeline/var/vep_config/* gs://seqr-luigi/releases/dev/latest/var/vep_config
4948
gcloud storage cp dist/*.whl gs://seqr-luigi/releases/dev/latest/pyscripts.zip

.github/workflows/prod-release.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,10 @@ jobs:
5353
run: |-
5454
gcloud storage rm -r gs://seqr-luigi/releases/prod/latest/ || echo 'No latest release'
5555
gcloud storage cp v03_pipeline/bin/* gs://seqr-luigi/releases/prod/latest/bin/
56-
gcloud storage cp v03_pipeline/var/vep_config/* gs://seqr-luigi/releases/prod/latest/var/vep_config
5756
gcloud storage cp dist/*.whl gs://seqr-luigi/releases/prod/latest/pyscripts.zip
5857
gcloud storage cp v03_pipeline/bin/* gs://seqr-luigi/releases/prod/$TAG_NAME/bin/
59-
gcloud storage cp v03_pipeline/var/vep_config/* gs://seqr-luigi/releases/prod/$TAG_NAME/var/vep_config
6058
gcloud storage cp dist/*.whl gs://seqr-luigi/releases/prod/$TAG_NAME/pyscripts.zip
59+
gcloud storage cp v03_pipeline/var/vep/* gs://seqr-reference-data/vep/
6160
6261
- name: Create tag
6362
uses: actions/github-script@v7

requirements.in

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,5 @@ luigi>=3.4.0
55
gnomad==0.6.4
66
google-cloud-storage>=2.14.0
77
google-cloud-secret-manager>=2.20.0
8+
aiofiles==24.1.0
9+
pydantic==2.8.2

v03_pipeline/api/app.py

Lines changed: 66 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,80 @@
1-
from aiohttp import web
1+
import json
2+
import os
3+
import traceback
24

3-
from v03_pipeline.lib.tasks import * # noqa: F403
5+
import aiofiles
6+
import aiofiles.os
7+
from aiohttp import web, web_exceptions
8+
9+
from v03_pipeline.api.model import LoadingPipelineRequest
10+
from v03_pipeline.lib.logger import get_logger
11+
from v03_pipeline.lib.paths import loading_pipeline_queue_path
12+
13+
logger = get_logger(__name__)
14+
15+
16+
@web.middleware
async def error_middleware(request, handler):
    """Log errors raised by handlers and normalize them to HTTP responses.

    aiohttp HTTP errors are logged and re-raised unchanged; any other
    exception is logged with its traceback and surfaced to the client as
    a 500 whose reason carries the exception text and traceback.
    """
    try:
        return await handler(request)
    except web.HTTPError:
        logger.exception('HTTPError')
        raise
    except Exception as err:
        logger.exception('Unhandled Exception')
        raise web.HTTPInternalServerError(
            reason=f'{err}: {traceback.format_exc()}',
        ) from err
27+
28+
29+
async def loading_pipeline_enqueue(request: web.Request) -> web.Response:
    """Validate a loading pipeline request and queue it for execution.

    Responses:
        202 -- request validated and written to the queue file.
        409 -- another request is already queued (its payload is echoed back).
        422 -- the request has no body.
        400 -- the body fails ``LoadingPipelineRequest`` validation.
    """
    if not request.body_exists:
        raise web.HTTPUnprocessableEntity

    try:
        lpr = LoadingPipelineRequest.model_validate(await request.json())
    except ValueError as e:
        raise web.HTTPBadRequest from e

    # If a queue file already exists, a previous request is still in
    # process; report it back to the caller instead of overwriting it.
    try:
        async with aiofiles.open(loading_pipeline_queue_path(), 'r') as f:
            in_flight = json.loads(await f.read())
            #
            # The 409 (Conflict) status code indicates that the request
            # could not be completed due to a conflict with the current
            # state of the target resource.
            #
            return web.json_response(
                {'Failed to queue due to in process request': in_flight},
                status=web_exceptions.HTTPConflict.status_code,
            )
    except FileNotFoundError:
        # No queued request -- fall through and claim the queue.
        pass

    # NOTE(review): the existence check above and the write below are not
    # atomic, so two concurrent requests could both enqueue — presumably
    # acceptable for this single-operator endpoint; confirm.
    async with aiofiles.open(loading_pipeline_queue_path(), 'w') as f:
        await f.write(lpr.model_dump_json())
    return web.json_response(
        {'Successfully queued': lpr.model_dump()},
        status=web_exceptions.HTTPAccepted.status_code,
    )
462

563

664
async def status(_: web.Request) -> web.Response:
    """Health-check endpoint; always reports success."""
    payload = {'success': True}
    return web.json_response(payload)
866

967

1068
async def init_web_app():
    """Build the aiohttp application, ensuring the queue directory exists."""
    queue_dir = os.path.dirname(loading_pipeline_queue_path())
    await aiofiles.os.makedirs(queue_dir, exist_ok=True)
    app = web.Application(middlewares=[error_middleware])
    app.add_routes(
        [
            web.get('/status', status),
            web.post('/loading_pipeline_enqueue', loading_pipeline_enqueue),
        ],
    )
    return app

v03_pipeline/api/app_test.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
from aiohttp import web_exceptions
2+
from aiohttp.test_utils import AioHTTPTestCase
3+
4+
from v03_pipeline.api.app import init_web_app
5+
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
6+
from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase
7+
8+
CALLSET_PATH = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf'
9+
10+
11+
class AppTest(AioHTTPTestCase, MockedDatarootTestCase):
    """End-to-end tests for the pipeline web API routes."""

    async def get_application(self):
        # Hook used by AioHTTPTestCase to create the app under test.
        return await init_web_app()

    @staticmethod
    def _request_body(callset_path: str) -> dict:
        # Canonical valid enqueue payload; only the callset path varies.
        return {
            'callset_path': callset_path,
            'projects_to_run': ['project_a'],
            'sample_type': SampleType.WGS.value,
            'reference_genome': ReferenceGenome.GRCh38.value,
            'dataset_type': DatasetType.SNV_INDEL.value,
        }

    async def test_status(self):
        async with self.client.request('GET', '/status') as resp:
            self.assertEqual(resp.status, 200)
            resp_json = await resp.json()
            self.assertDictEqual(resp_json, {'success': True})

    async def test_missing_route(self):
        # Unknown routes are logged as errors and surface as 404.
        with self.assertLogs(level='ERROR') as _:
            async with self.client.request('GET', '/loading_pip') as resp:
                self.assertEqual(resp.status, web_exceptions.HTTPNotFound.status_code)

    async def test_loading_pipeline_invalid_requests(self):
        # Wrong method -> 405.
        with self.assertLogs(level='ERROR') as log:
            async with self.client.request('GET', '/loading_pipeline_enqueue') as resp:
                self.assertEqual(
                    resp.status,
                    web_exceptions.HTTPMethodNotAllowed.status_code,
                )
            self.assertTrue(
                'HTTPMethodNotAllowed' in log.output[0],
            )

        # Missing body -> 422.
        with self.assertLogs(level='ERROR') as log:
            async with self.client.request('POST', '/loading_pipeline_enqueue') as resp:
                self.assertEqual(
                    resp.status,
                    web_exceptions.HTTPUnprocessableEntity.status_code,
                )
            self.assertTrue(
                'HTTPUnprocessableEntity' in log.output[0],
            )

        # Nonexistent callset path -> 400 from model validation.
        with self.assertLogs(level='ERROR') as log:
            async with self.client.request(
                'POST',
                '/loading_pipeline_enqueue',
                json=self._request_body('missing.vcf'),
            ) as resp:
                self.assertEqual(
                    resp.status,
                    web_exceptions.HTTPBadRequest.status_code,
                )
            self.assertTrue(
                'callset_path must point to a file that exists' in log.output[0],
            )

    async def test_loading_pipeline_enqueue(self):
        body = self._request_body(CALLSET_PATH)
        async with self.client.request(
            'POST',
            '/loading_pipeline_enqueue',
            json=body,
        ) as resp:
            self.assertEqual(
                resp.status,
                web_exceptions.HTTPAccepted.status_code,
            )
            resp_json = await resp.json()
            self.assertDictEqual(
                resp_json,
                {
                    'Successfully queued': {
                        'callset_path': 'v03_pipeline/var/test/callsets/1kg_30variants.vcf',
                        'dataset_type': 'SNV_INDEL',
                        'ignore_missing_samples_when_remapping': False,
                        'projects_to_run': ['project_a'],
                        'reference_genome': 'GRCh38',
                        'sample_type': 'WGS',
                        'skip_validation': False,
                    },
                },
            )

        # Second request: while one is queued, another is rejected with 409.
        body['projects_to_run'] = ['project_b', 'project_c']
        async with self.client.request(
            'POST',
            '/loading_pipeline_enqueue',
            json=body,
        ) as resp:
            self.assertEqual(
                resp.status,
                web_exceptions.HTTPConflict.status_code,
            )

v03_pipeline/api/model.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import hailtop.fs as hfs
2+
from pydantic import BaseModel, Field, field_validator
3+
4+
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
5+
6+
VALID_FILE_TYPES = ['vcf', 'vcf.gz', 'vcf.bgz', 'mt']
7+
8+
9+
class LoadingPipelineRequest(BaseModel):
    """Validated payload for enqueueing a run of the loading pipeline."""

    # Path to the callset to load; must be a VCF or Hail Matrix Table.
    callset_path: str
    projects_to_run: list[str] = Field(min_length=1, frozen=True)
    sample_type: SampleType
    reference_genome: ReferenceGenome
    dataset_type: DatasetType
    ignore_missing_samples_when_remapping: bool = False
    skip_validation: bool = False

    @field_validator('callset_path')
    @classmethod
    def check_valid_callset_path(cls, callset_path: str) -> str:
        """Require a recognized callset extension and an existing path.

        Raises:
            ValueError: if the extension is unrecognized or the path does
                not exist (pydantic reports these as validation errors).
        """
        # Require the dot before the suffix ('.vcf', '.vcf.gz', '.vcf.bgz',
        # '.mt').  A bare endswith(file_type) would accept any path merely
        # ending in those letters, e.g. 'myvcf' or 'format'.
        valid_suffixes = tuple(f'.{file_type}' for file_type in VALID_FILE_TYPES)
        if not callset_path.endswith(valid_suffixes):
            msg = 'callset_path must be a VCF or a Hail Matrix Table'
            raise ValueError(msg)
        if not hfs.exists(callset_path):
            msg = 'callset_path must point to a file that exists'
            raise ValueError(msg)
        return callset_path

v03_pipeline/api/model_test.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import unittest
2+
3+
from v03_pipeline.api.model import LoadingPipelineRequest
4+
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
5+
6+
TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf'
7+
8+
9+
class ModelTest(unittest.TestCase):
    """Unit tests for the LoadingPipelineRequest pydantic model."""

    def test_valid_loading_pipeline_requests(self) -> None:
        payload = {
            'callset_path': TEST_VCF,
            'projects_to_run': ['project_a'],
            'sample_type': SampleType.WGS.value,
            'reference_genome': ReferenceGenome.GRCh38.value,
            'dataset_type': DatasetType.SNV_INDEL.value,
        }
        lpr = LoadingPipelineRequest.model_validate(payload)
        self.assertEqual(lpr.reference_genome, ReferenceGenome.GRCh38)

    def test_invalid_loading_pipeline_requests(self) -> None:
        # Three invalid fields at once: bad extension, empty project list,
        # and an unknown sample type.
        payload = {
            'callset_path': 'a.txt',
            'projects_to_run': [],
            'sample_type': 'BLENDED',
            'reference_genome': ReferenceGenome.GRCh38.value,
            'dataset_type': DatasetType.SNV_INDEL.value,
        }
        with self.assertRaises(ValueError) as cm:
            LoadingPipelineRequest.model_validate(payload)
        self.assertTrue(
            str(cm.exception).startswith(
                '3 validation errors for LoadingPipelineRequest',
            ),
        )

v03_pipeline/bin/dataproc_vep_init.bash

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ set -x
1515

1616
export PROJECT="$(gcloud config get-value project)"
1717
export ENVIRONMENT="$(/usr/share/google/get_metadata_value attributes/ENVIRONMENT)"
18-
export VEP_CONFIG_PATH="$(/usr/share/google/get_metadata_value attributes/VEP_CONFIG_PATH)"
1918
export REFERENCE_GENOME="$(/usr/share/google/get_metadata_value attributes/REFERENCE_GENOME)"
2019

2120
# Install docker
@@ -37,8 +36,6 @@ apt-get install -y --allow-unauthenticated docker-ce
3736
sleep 60
3837
sudo service docker restart
3938

40-
gcloud storage cp gs://seqr-luigi/releases/$ENVIRONMENT/latest/var/vep_config/vep-$REFERENCE_GENOME.json $VEP_CONFIG_PATH
41-
4239
cat >/vep.c <<EOF
4340
#include <unistd.h>
4441
#include <stdio.h>

v03_pipeline/bin/download_vep_data.bash

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,16 @@ case $REFERENCE_GENOME in
2626

2727
# Copied from the UTRAnnotator repo (https://github.com/ImperialCardioGenetics/UTRannotator/tree/master)
2828
'gs://seqr-reference-data/vep/GRCh38/uORF_5UTR_GRCh38_PUBLIC.txt'
29+
30+
'gs://seqr-reference-data/vep/GRCh38/vep-GRCh38.json'
2931
)
3032
;;
3133
GRCh37)
3234
VEP_REFERENCE_DATA_FILES=(
3335
'gs://seqr-reference-data/vep_data/loftee-beta/GRCh37.tar.gz'
3436
'gs://seqr-reference-data/vep/GRCh37/homo_sapiens_vep_110_GRCh37.tar.gz'
3537
'gs://seqr-reference-data/vep/GRCh37/Homo_sapiens.GRCh37.dna.primary_assembly.fa.*'
38+
'gs://seqr-reference-data/vep/GRCh37/vep-GRCh37.json'
3639
)
3740
;;
3841
*)

0 commit comments

Comments
 (0)