Skip to content

Commit fe49297

Browse files
committed
merge
2 parents dbcd50c + 16f71ac commit fe49297

File tree

2 files changed

+97
-2
lines changed

2 files changed

+97
-2
lines changed

requirements.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,7 @@ parso==0.8.3
218218
# via jedi
219219
pexpect==4.9.0
220220
# via ipython
221-
pillow==10.2.0
221+
pillow==10.3.0
222222
# via bokeh
223223
plotly==5.18.0
224224
# via hail
@@ -252,7 +252,9 @@ pygments==2.17.2
252252
# ipython
253253
# rich
254254
pyjwt[crypto]==2.8.0
255-
# via msal
255+
# via
256+
# msal
257+
# pyjwt
256258
pyparsing==3.1.1
257259
# via httplib2
258260
pyspark==3.3.3
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
#!/usr/bin/env python3
2+
import argparse
3+
4+
import hail as hl
5+
6+
from v03_pipeline.lib.misc.io import write
7+
from v03_pipeline.lib.model import (
8+
CachedReferenceDatasetQuery,
9+
DatasetType,
10+
ReferenceDatasetCollection,
11+
ReferenceGenome,
12+
)
13+
from v03_pipeline.lib.paths import (
14+
valid_cached_reference_dataset_query_path,
15+
valid_reference_dataset_collection_path,
16+
)
17+
from v03_pipeline.lib.reference_data.config import CONFIG
18+
from v03_pipeline.lib.reference_data.dataset_table_operations import (
19+
import_ht_from_config_path,
20+
)
21+
22+
23+
def get_ht(
    dataset_type: DatasetType,
    reference_genome: ReferenceGenome,
    query: CachedReferenceDatasetQuery,
) -> hl.Table:
    """Load the table that ``query`` will be evaluated against.

    When ``query.query_raw_dataset`` is falsy, the table is read from the
    path of the ``COMBINED`` reference dataset collection for the given
    reference genome and dataset type.  Otherwise the query is defined over
    an uncombined reference dataset, so the table is imported from that
    dataset's entry in ``CONFIG``.
    """
    if not query.query_raw_dataset:
        combined_path = valid_reference_dataset_collection_path(
            reference_genome,
            dataset_type,
            ReferenceDatasetCollection.COMBINED,
        )
        return hl.read_table(combined_path)

    # Uncombined (raw) dataset: CONFIG is keyed by dataset, then by the
    # reference genome's ``v02_value``.
    raw_config = CONFIG[query.dataset(dataset_type)][reference_genome.v02_value]
    return import_ht_from_config_path(
        raw_config,
        query.dataset(dataset_type),
        reference_genome,
    )
43+
44+
45+
def run(
    dataset_type: DatasetType,
    reference_genome: ReferenceGenome,
    query: CachedReferenceDatasetQuery,
):
    """Evaluate ``query`` against its source table and write the result.

    The source table comes from ``get_ht``; the queried table is written to
    the path returned by ``valid_cached_reference_dataset_query_path`` for
    the same reference genome, dataset type and query.
    """
    source_ht = get_ht(dataset_type, reference_genome, query)
    queried_ht = query.query(
        source_ht,
        dataset_type=dataset_type,
        reference_genome=reference_genome,
    )
    destination_path = valid_cached_reference_dataset_query_path(
        reference_genome,
        dataset_type,
        query,
    )
    print(f'Uploading ht to {destination_path}')
    write(queried_ht, destination_path)
59+
60+
61+
if __name__ == '__main__':
    # Command-line entry point: parse the reference genome, dataset type and
    # query, check that the query applies to that combination, then run it.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--reference-genome',
        type=ReferenceGenome,
        choices=list(ReferenceGenome),
        default=ReferenceGenome.GRCh38,
    )
    # NOTE(review): the help text promises "run all datasets" when this flag
    # is omitted, but the code below passes ``None`` straight through to
    # ``run``; no loop over dataset types is visible here -- confirm intent.
    parser.add_argument(
        '--dataset-type',
        type=DatasetType,
        choices=list(DatasetType),
        default=None,
        help='When used, update the passed dataset, otherwise run all datasets.',
    )
    parser.add_argument(
        '--query',
        type=CachedReferenceDatasetQuery,
        choices=list(CachedReferenceDatasetQuery),
        required=True,
    )
    # parse_known_args ignores unrecognised flags instead of erroring.
    # NOTE(review): presumably deliberate (e.g. a launcher injecting extra
    # arguments) -- confirm, since it also hides typos in flag names.
    args, _ = parser.parse_known_args()

    # ``--query`` is required, so only the membership check is needed.
    valid_queries = CachedReferenceDatasetQuery.for_reference_genome_dataset_type(
        args.reference_genome,
        args.dataset_type,
    )
    if args.query not in valid_queries:
        # Report the dataset type that was actually requested, not the
        # DatasetType class itself.
        msg = f'{args.query} is not a valid query for {args.dataset_type}'
        raise ValueError(msg)
    run(args.dataset_type, args.reference_genome, args.query)

0 commit comments

Comments
 (0)