Commit 81015b2

Commit message: merge
2 parents: 8b58c01 + adb0013

File tree: 194 files changed, +61 -44 lines (only a subset of the changed files is shown below)

v03_pipeline/lib/misc/terra_data_repository.py

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ def _tdr_request(resource: str) -> dict:
 
 
 def _get_dataset_ids() -> list[str]:
-    res_body = _tdr_request('datasets')
+    res_body = _tdr_request('datasets?limit=50000')  # Arbitrary large number :/
     items = res_body['items']
     for item in items:
         if not any(x['cloudResource'] == BIGQUERY_RESOURCE for x in item['storage']):
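The new query string sidesteps TDR's default page size by asking for up to 50,000 datasets in one response. If the arbitrary cap ever becomes a concern, a paginated variant is sketched below; it assumes the endpoint also honors an offset parameter and that each item exposes an id field (only limit, items, and total are confirmed by this commit), and it reuses the module's _tdr_request helper.

def _get_dataset_ids_paginated(page_size: int = 1000) -> list[str]:
    # Hypothetical pagination loop; `offset` support is an assumption.
    dataset_ids = []
    offset = 0
    while True:
        res_body = _tdr_request(f'datasets?limit={page_size}&offset={offset}')
        items = res_body['items']
        dataset_ids.extend(item['id'] for item in items)  # 'id' field assumed
        offset += len(items)
        if not items or offset >= res_body['total']:
            break
    return dataset_ids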

v03_pipeline/lib/misc/terra_data_repository_test.py

Lines changed: 1 addition & 1 deletion
@@ -137,7 +137,7 @@ class TerraDataRepositoryTest(unittest.TestCase):
     @responses.activate
     def test_get_dataset_ids(self, _: Mock) -> None:
         responses.get(
-            os.path.join(TDR_ROOT_URL, 'datasets'),
+            os.path.join(TDR_ROOT_URL, 'datasets?limit=50000'),
             body=json.dumps(
                 {
                     'total': 3,
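The registered URL now carries the query string, which the responses library matches against the request's query. An equivalent registration, sketched with an illustrative fixture body, uses responses' query matcher instead of embedding the query in the URL:

from responses import matchers

responses.get(
    os.path.join(TDR_ROOT_URL, 'datasets'),
    match=[matchers.query_param_matcher({'limit': '50000'})],
    body=json.dumps({'total': 3, 'items': []}),
)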

v03_pipeline/lib/reference_datasets/exac_test.py

Lines changed: 6 additions & 0 deletions
@@ -52,3 +52,9 @@ def test_exac(self):
                 ),
             ],
         )
+        self.assertEqual(
+            ht.globals.collect(),
+            [
+                hl.Struct(version='1.1', enums=hl.Struct()),
+            ],
+        )
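The added assertion pins the table globals after the enum_globals change further down: a dataset without enums now carries an empty struct rather than a missing value. A minimal sketch of the shape being asserted, on a throwaway table:

import hail as hl

ht = hl.utils.range_table(1)
ht = ht.select_globals(version='1.1', enums=hl.struct())
assert ht.globals.collect() == [hl.Struct(version='1.1', enums=hl.Struct())]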

v03_pipeline/lib/reference_datasets/gnomad_mito.py

Lines changed: 1 addition & 1 deletion
@@ -9,6 +9,6 @@ def get_ht(path: str, *_) -> hl.Table:
         AC_het=hl.int32(ht.AC_het),
         AF_hom=ht.AF_hom,
         AF_het=ht.AF_het,
-        max_hl=ht.max_hl,
+        max_hl=hl.float32(ht.max_hl),
     )
     return ht.select_globals()
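Casting max_hl pins the field to float32, matching the other float32 score fields in this module (the gnomAD source table presumably stores it as float64). A toy sketch of the cast and the resulting dtype:

import hail as hl

ht = hl.utils.range_table(1)
ht = ht.annotate(max_hl=hl.float64(0.95))  # stand-in for the source field
ht = ht.annotate(max_hl=hl.float32(ht.max_hl))
assert ht.max_hl.dtype == hl.tfloat32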

v03_pipeline/lib/reference_datasets/reference_dataset.py

Lines changed: 4 additions & 4 deletions
@@ -90,7 +90,7 @@ def enums(self) -> dict | None:
     def enum_globals(self) -> hl.Struct:
         if self.enums:
             return hl.Struct(**self.enums)
-        return hl.missing(hl.tstruct(hl.tstr, hl.tarray(hl.tstr)))
+        return hl.Struct()
 
     @property
     def filter(  # noqa: A003
@@ -263,15 +263,15 @@ def get_ht(
         },
         ReferenceGenome.GRCh37: {
             DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]),
-            VERSION: '1.0',
+            VERSION: '1.1',
             PATH: [
                 'gs://seqr-reference-data/GRCh37/spliceai/spliceai_scores.masked.snv.hg19.vcf.gz',
                 'gs://seqr-reference-data/GRCh37/spliceai/spliceai_scores.masked.indel.hg19.vcf.gz',
             ],
         },
         ReferenceGenome.GRCh38: {
             DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]),
-            VERSION: '1.0',
+            VERSION: '1.1',
             # NB: SpliceAI data is only available to download for authenticated Illumina users, so we will host the data
             PATH: [
                 'gs://seqr-reference-data/GRCh38/spliceai/spliceai_scores.masked.snv.hg38.vcf.gz',
@@ -369,7 +369,7 @@ def get_ht(
     ReferenceDataset.gnomad_mito: {
         ReferenceGenome.GRCh38: {
             DATASET_TYPES: frozenset([DatasetType.MITO]),
-            VERSION: '1.0',
+            VERSION: '1.1',
             PATH: 'gs://gcp-public-data--gnomad/release/3.1/ht/genomes/gnomad.genomes.v3.1.sites.chrM.ht',
         },
     },
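Alongside the version bumps (which presumably force the affected reference datasets to be rebuilt), enum_globals now falls back to an empty hl.Struct() instead of a typed missing value. A condensed sketch of that pattern, with an illustrative enum mapping:

import hail as hl

def enum_globals(enums: dict | None) -> hl.Struct:
    # Mirrors the property above: a concrete (possibly empty) struct, never a missing value.
    if enums:
        return hl.Struct(**enums)
    return hl.Struct()

ht = hl.utils.range_table(1)
# Illustrative mapping; real datasets define their own enums (or none at all).
ht = ht.annotate_globals(enums=enum_globals({'splice_consequence': ['AG', 'AL', 'DG', 'DL']}))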

v03_pipeline/lib/reference_datasets/splice_ai.py

Lines changed: 22 additions & 4 deletions
@@ -9,6 +9,26 @@
 from v03_pipeline.lib.reference_datasets.misc import vcf_to_ht
 
 
+def remove_duplicate_scores(ht: hl.Table):
+    #
+    # SpliceAI has many duplicate rows of the ilk:
+    #
+    # 1:861264 | ["C","A"] | NA | -1.00e+01 | NA | ["A|AL645608.1|0.00|0.00|0.00|0.00|2|27|12|1"] |
+    # 1:861264 | ["C","A"] | NA | -1.00e+01 | NA | ["A|SAMD11|0.02|0.01|0.00|0.00|14|38|14|38"]
+    #
+    count_ht = ht.group_by(*ht.key).aggregate(n=hl.agg.count())
+    duplicate_variants_ht = count_ht.filter(count_ht.n > 1)
+    duplicates_ht = ht.semi_join(duplicate_variants_ht)
+    non_duplicates_ht = ht.anti_join(duplicates_ht)
+    return non_duplicates_ht.union(
+        # Remove rows that 1) are part of duplicate variant groupings
+        # and 2) contain dots. Then, remove arbitrarily with .distinct()
+        duplicates_ht.filter(
+            ~duplicates_ht.info.SpliceAI[0].split(delim='\\|')[1].contains('.'),
+        ).distinct(),
+    )
+
+
 def get_ht(
     paths: list[str],
     reference_genome: ReferenceGenome,
@@ -26,6 +46,7 @@ def get_ht(
         # of partititons.
     )
     ht, _ = checkpoint(ht)
+    ht = remove_duplicate_scores(ht)
 
     # SpliceAI INFO field description from the VCF header: SpliceAIv1.3 variant annotation. These include
     # delta scores (DS) and delta positions (DP) for acceptor gain (AG), acceptor loss (AL), donor gain (DG), and
@@ -39,7 +60,7 @@ def get_ht(
         .map(hl.float32),
    )
     ht = ht.annotate(delta_score=hl.max(ht.delta_scores))
-    ht = ht.annotate(
+    return ht.annotate(
         splice_consequence_id=hl.if_else(
             ht.delta_score > 0,
             # Splice Consequence enum ID is the index of the max score
@@ -48,6 +69,3 @@ def get_ht(
             num_delta_scores,
         ),
     ).drop('delta_scores')
-    return ht.group_by(*ht.key).aggregate(
-        splice_consequence_id=hl.agg.min(ht.splice_consequence_id),
-    )
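remove_duplicate_scores keeps at most one SpliceAI row per variant, preferring annotations whose gene symbol has no dot, which presumably makes the removed group_by/min aggregation at the end of get_ht unnecessary. The same semi_join / anti_join / distinct pattern on a toy keyed table, with made-up values rather than SpliceAI records:

import hail as hl

ht = hl.Table.parallelize(
    [
        {'k': 1, 'gene': 'AL645608.1'},  # duplicated key; gene name contains a dot
        {'k': 1, 'gene': 'SAMD11'},
        {'k': 2, 'gene': 'BRCA1'},       # unique key; passes through untouched
    ],
    hl.tstruct(k=hl.tint32, gene=hl.tstr),
    key='k',
)
# Count rows per key and isolate the keys that occur more than once.
count_ht = ht.group_by(*ht.key).aggregate(n=hl.agg.count())
duplicate_keys_ht = count_ht.filter(count_ht.n > 1)
duplicates_ht = ht.semi_join(duplicate_keys_ht)
non_duplicates_ht = ht.anti_join(duplicates_ht)
# Prefer dot-free names within duplicated keys, then keep one row per key arbitrarily.
deduped_ht = non_duplicates_ht.union(
    duplicates_ht.filter(~duplicates_ht.gene.contains('.')).distinct(),
)
# deduped_ht ends up with exactly one row per key: (1, 'SAMD11') and (2, 'BRCA1').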

v03_pipeline/lib/reference_datasets/topmed.py

Lines changed: 2 additions & 1 deletion
@@ -15,7 +15,8 @@
 
 def get_ht(path: str, reference_genome: ReferenceGenome) -> hl.Table:
     ht = vcf_to_ht(path, reference_genome)
-    ht = ht.filter(ht.locus.position == hl.int(ht.info.OriginalStart))
+    if reference_genome == ReferenceGenome.GRCh37:
+        ht = ht.filter(ht.locus.position == hl.int(ht.info.OriginalStart))
     return ht.select(
         **{k: parse_nested_field(ht, v) for k, v in SELECT.items()},
     )
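The OriginalStart filter is now limited to GRCh37, presumably because the GRCh38 TOPMed VCF lacks that INFO field while the GRCh37 file is a liftover whose original positions need to agree. An alternative guard, sketched under that assumption, keys off the parsed schema instead of the genome build (get_ht_alt is a hypothetical variant of the get_ht above):

import hail as hl

def get_ht_alt(path: str, reference_genome) -> hl.Table:
    ht = vcf_to_ht(path, reference_genome)
    # Filter on the liftover position only when the INFO field is actually present.
    if 'OriginalStart' in ht.info.dtype:
        ht = ht.filter(ht.locus.position == hl.int(ht.info.OriginalStart))
    return ht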

v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py

Lines changed: 3 additions & 3 deletions
@@ -146,7 +146,7 @@ def test_update_vat_snv_indel_38(
                     eigen='1.1',
                     clinvar='2024-11-11',
                     exac='1.1',
-                    splice_ai='1.0',
+                    splice_ai='1.1',
                     topmed='1.1',
                     hgmd='1.0',
                     gnomad_exomes='1.0',
@@ -297,7 +297,7 @@ def test_update_vat_mito_38(
                     hmtvar='1.1',
                     mitimpact='1.0',
                     mitomap='1.0',
-                    gnomad_mito='1.0',
+                    gnomad_mito='1.1',
                     local_constraint_mito='1.0',
                     clinvar='2024-11-11',
                     dbnsfp='1.0',
@@ -425,7 +425,7 @@ def test_update_vat_snv_indel_37(
                     eigen='1.1',
                     clinvar='2024-11-11',
                     exac='1.1',
-                    splice_ai='1.0',
+                    splice_ai='1.1',
                     topmed='1.1',
                     hgmd='1.0',
                     gnomad_exomes='1.0',

v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py

Lines changed: 3 additions & 3 deletions
@@ -507,7 +507,7 @@ def test_multiple_update_vat(
                     exac='1.1',
                     gnomad_exomes='1.0',
                     gnomad_genomes='1.0',
-                    splice_ai='1.0',
+                    splice_ai='1.1',
                     topmed='1.1',
                     gnomad_non_coding_constraint='1.0',
                     screen='1.0',
@@ -772,7 +772,7 @@ def test_update_vat_without_accessing_private_datasets(
                     exac='1.1',
                     gnomad_exomes='1.0',
                     gnomad_genomes='1.0',
-                    splice_ai='1.0',
+                    splice_ai='1.1',
                     topmed='1.1',
                     gnomad_non_coding_constraint='1.0',
                     screen='1.0',
@@ -827,7 +827,7 @@ def test_mito_update_vat(
                 versions=hl.Struct(
                     clinvar='2024-11-11',
                     dbnsfp='1.0',
-                    gnomad_mito='1.0',
+                    gnomad_mito='1.1',
                     helix_mito='1.0',
                     hmtvar='1.1',
                     mitomap='1.0',

v03_pipeline/lib/tasks/write_tdr_metrics_file.py

Lines changed: 9 additions & 2 deletions
@@ -1,8 +1,10 @@
 import csv
 
+import google.api_core.exceptions
 import luigi
 import luigi.util
 
+from v03_pipeline.lib.logger import get_logger
 from v03_pipeline.lib.misc.terra_data_repository import (
     BIGQUERY_METRICS,
     bq_metrics_query,
@@ -13,6 +15,8 @@
 )
 from v03_pipeline.lib.tasks.files import GCSorLocalTarget
 
+logger = get_logger(__name__)
+
 
 @luigi.util.inherits(BaseLoadingPipelineParams)
 class WriteTDRMetricsFileTask(luigi.Task):
@@ -31,5 +35,8 @@ def run(self):
         with self.output().open('w') as f:
             writer = csv.DictWriter(f, fieldnames=BIGQUERY_METRICS, delimiter='\t')
             writer.writeheader()
-            for row in bq_metrics_query(self.bq_table_name):
-                writer.writerow(row)
+            try:
+                for row in bq_metrics_query(self.bq_table_name):
+                    writer.writerow(row)
+            except google.api_core.exceptions.BadRequest:
+                logger.exception('Query Failed')
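The new try/except lets the task finish with a header-only TSV when BigQuery rejects the query, instead of failing the whole run. The same guard in isolation, sketched with stdlib logging in place of the pipeline's get_logger wrapper and with run_query as a hypothetical stand-in for bq_metrics_query:

import logging

import google.api_core.exceptions

logger = logging.getLogger(__name__)


def write_rows(writer, run_query, table_name: str) -> None:
    try:
        for row in run_query(table_name):
            writer.writerow(row)
    except google.api_core.exceptions.BadRequest:
        # Keep the traceback in the logs, but leave the output file header-only.
        logger.exception('Query Failed')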