Skip to content

Commit 82140db

Browse files
authored
allele registry (#759)
* add allele registry step in update vat with samples task * shh * existing tests pass * fix test deps * test * annotation_dependencies * ruff * take out the zero check * fix requirements new task name * move vep into new variants task * only annotate lookup from callset_ht * clean up mocks * r * working * working? * not that * minor changes and test cases * most recent script * working version * fix the test * implement ht chunking * fix patches * fix patches * register now yields id map of returned caids * r * fix some tests * return a hail table instead * use __str__ * log to track variants we can't map back * move to gcs with flag * union ar_ht instead of a bunch of left joins to prevent CAID, CAID_1, CAID_2... * cleaner * it is all coming together now ' * gnomad ids for 37' * use genomicalleles and gnomad ids * secrets * secret * move stuff out of environment file * add more logging * fix test * fix the other test * ruff * test * comments * o
1 parent d77a4e6 commit 82140db

9 files changed

+659
-4
lines changed

requirements.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@ hail==0.2.130
44
luigi>=3.4.0
55
gnomad==0.6.4
66
google-cloud-storage>=2.14.0
7+
google-cloud-secret-manager>=2.20.0

requirements.txt

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,10 +97,11 @@ frozenlist==1.4.0
9797
# hail
9898
gnomad==0.6.4
9999
# via -r requirements.in
100-
google-api-core==2.14.0
100+
google-api-core[grpc]==2.14.0
101101
# via
102102
# google-api-python-client
103103
# google-cloud-core
104+
# google-cloud-secret-manager
104105
# google-cloud-storage
105106
google-api-python-client==2.108.0
106107
# via -r requirements.in
@@ -111,6 +112,7 @@ google-auth==2.23.4
111112
# google-auth-httplib2
112113
# google-auth-oauthlib
113114
# google-cloud-core
115+
# google-cloud-secret-manager
114116
# google-cloud-storage
115117
# hail
116118
google-auth-httplib2==0.1.1
@@ -119,6 +121,8 @@ google-auth-oauthlib==0.8.0
119121
# via hail
120122
google-cloud-core==2.4.1
121123
# via google-cloud-storage
124+
google-cloud-secret-manager==2.20.0
125+
# via -r requirements.in
122126
google-cloud-storage==2.14.0
123127
# via -r requirements.in
124128
google-crc32c==1.5.0
@@ -127,7 +131,20 @@ google-crc32c==1.5.0
127131
# google-resumable-media
128132
google-resumable-media==2.7.0
129133
# via google-cloud-storage
130-
googleapis-common-protos==1.61.0
134+
googleapis-common-protos[grpc]==1.61.0
135+
# via
136+
# google-api-core
137+
# grpc-google-iam-v1
138+
# grpcio-status
139+
grpc-google-iam-v1==0.13.0
140+
# via google-cloud-secret-manager
141+
grpcio==1.63.0
142+
# via
143+
# google-api-core
144+
# googleapis-common-protos
145+
# grpc-google-iam-v1
146+
# grpcio-status
147+
grpcio-status==1.48.2
131148
# via google-api-core
132149
hail==0.2.130
133150
# via -r requirements.in
@@ -226,10 +243,16 @@ portalocker==2.8.2
226243
# via msal-extensions
227244
prompt-toolkit==3.0.41
228245
# via ipython
246+
proto-plus==1.23.0
247+
# via google-cloud-secret-manager
229248
protobuf==3.20.2
230249
# via
231250
# google-api-core
251+
# google-cloud-secret-manager
232252
# googleapis-common-protos
253+
# grpc-google-iam-v1
254+
# grpcio-status
255+
# proto-plus
233256
ptyprocess==0.7.0
234257
# via pexpect
235258
pure-eval==0.2.2
@@ -251,7 +274,9 @@ pygments==2.17.2
251274
# ipython
252275
# rich
253276
pyjwt[crypto]==2.8.0
254-
# via msal
277+
# via
278+
# msal
279+
# pyjwt
255280
pyparsing==3.1.1
256281
# via httplib2
257282
pyspark==3.3.3
Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
import dataclasses
2+
import hashlib
3+
import json
4+
import math
5+
import time
6+
import uuid
7+
8+
import hail as hl
9+
import hailtop.fs as hfs
10+
import requests
11+
from google.cloud import secretmanager
12+
from requests import HTTPError
13+
14+
from v03_pipeline.lib.logger import get_logger
15+
from v03_pipeline.lib.model import Env, ReferenceGenome
16+
17+
MAX_VARIANTS_PER_REQUEST = 1000000
18+
ALLELE_REGISTRY_URL = 'https://reg.genome.network/alleles?file=vcf&fields=none+@id+genomicAlleles+externalRecords.{}.id'
19+
HTTP_REQUEST_TIMEOUT_S = 420
20+
21+
logger = get_logger(__name__)
22+
23+
24+
@dataclasses.dataclass
25+
class AlleleRegistryError:
26+
base_url: str
27+
error_type: str
28+
description: str
29+
message: str
30+
input_line: str | None
31+
32+
@classmethod
33+
def from_api_response(cls, response: dict, base_url: str):
34+
return cls(
35+
base_url=base_url,
36+
error_type=response['errorType'],
37+
description=response['description'],
38+
message=response['message'],
39+
input_line=response.get('inputLine'),
40+
)
41+
42+
def __str__(self) -> str:
43+
msg = (
44+
f'\nAPI URL: {self.base_url}\nTYPE: {self.error_type}'
45+
f'\nDESCRIPTION: {self.description}\nMESSAGE: {self.message}'
46+
)
47+
return (
48+
msg if self.input_line is None else f'{msg}\nINPUT_LINE: {self.input_line}'
49+
)
50+
51+
52+
def register_alleles_in_chunks(
53+
ht: hl.Table,
54+
reference_genome: ReferenceGenome,
55+
base_url: str = ALLELE_REGISTRY_URL,
56+
chunk_size: int = MAX_VARIANTS_PER_REQUEST,
57+
):
58+
num_rows = ht.count()
59+
num_chunks = math.ceil(num_rows / chunk_size)
60+
logger.info(
61+
f'Registering {num_rows} allele(s) in chunks of {chunk_size} in {num_chunks} request(s).',
62+
)
63+
for start_idx in range(0, num_rows, chunk_size):
64+
end_idx = start_idx + chunk_size
65+
if end_idx == chunk_size:
66+
chunk_ht = ht.head(chunk_size)
67+
elif end_idx <= num_rows:
68+
chunk_ht = ht.head(end_idx).tail(chunk_size)
69+
else:
70+
chunk_ht = ht.tail(end_idx - num_rows)
71+
yield register_alleles(chunk_ht, reference_genome, base_url)
72+
73+
74+
def register_alleles(
75+
ht: hl.Table,
76+
reference_genome: ReferenceGenome,
77+
base_url: str,
78+
) -> hl.Table:
79+
uuid4 = uuid.uuid4()
80+
raw_vcf_file_name = f'{Env.HAIL_TMPDIR}/r_{uuid4}.vcf'
81+
formatted_vcf_file_name = f'{Env.HAIL_TMPDIR}/f_{uuid4}.vcf'
82+
83+
# Export the variants to a VCF
84+
hl.export_vcf(ht, raw_vcf_file_name)
85+
86+
# Reformat the VCF created by hail's 'export_vcf' function to be compatible with the Allele Registry
87+
with hfs.open(raw_vcf_file_name, 'r') as vcf_in, hfs.open(
88+
formatted_vcf_file_name,
89+
'w',
90+
) as vcf_out:
91+
vcf_out.writelines(reference_genome.allele_registry_vcf_header)
92+
for line in vcf_in:
93+
if not line.startswith('#'):
94+
# NB: The Allele Registry does not accept contigs prefixed with 'chr', even for GRCh38
95+
vcf_out.write(line.replace('chr', ''))
96+
97+
logger.info('Calling the ClinGen Allele Registry')
98+
with hfs.open(formatted_vcf_file_name, 'r') as vcf_in:
99+
data = vcf_in.read()
100+
res = requests.put(
101+
url=build_url(base_url, reference_genome),
102+
data=data,
103+
timeout=HTTP_REQUEST_TIMEOUT_S,
104+
)
105+
return handle_api_response(res, base_url, reference_genome)
106+
107+
108+
def build_url(base_url: str, reference_genome: ReferenceGenome) -> str:
109+
login, password = get_ar_credentials_from_secret_manager()
110+
111+
# Request a gnomad ID for the correct reference genome
112+
base_url = base_url.format(reference_genome.allele_registry_gnomad_id)
113+
114+
# adapted from https://reg.clinicalgenome.org/doc/scripts/request_with_payload.py
115+
identity = hashlib.sha1((login + password).encode('utf-8')).hexdigest() # noqa: S324
116+
gb_time = str(int(time.time()))
117+
token = hashlib.sha1((base_url + identity + gb_time).encode('utf-8')).hexdigest() # noqa: S324
118+
return base_url + '&gbLogin=' + login + '&gbTime=' + gb_time + '&gbToken=' + token
119+
120+
121+
def get_ar_credentials_from_secret_manager() -> tuple[str, str]:
122+
if Env.ALLELE_REGISTRY_SECRET_NAME is None:
123+
msg = (
124+
'SHOULD_REGISTER_ALLELES is True but cannot get allele registry credentials '
125+
'because ALLELE_REGISTRY_SECRET_NAME is not set'
126+
)
127+
raise ValueError(msg)
128+
129+
client = secretmanager.SecretManagerServiceClient()
130+
name = client.secret_version_path(
131+
Env.PROJECT_ID,
132+
Env.ALLELE_REGISTRY_SECRET_NAME,
133+
'latest',
134+
)
135+
response = client.access_secret_version(request={'name': name})
136+
payload_dict = json.loads(response.payload.data.decode('UTF-8'))
137+
return payload_dict['login'], payload_dict['password']
138+
139+
140+
def handle_api_response(
141+
res: requests.Response,
142+
base_url: str,
143+
reference_genome: ReferenceGenome,
144+
) -> hl.Table:
145+
response = res.json()
146+
if not res.ok or 'errorType' in response:
147+
error = AlleleRegistryError.from_api_response(response, base_url)
148+
logger.error(error)
149+
raise HTTPError(error.message)
150+
151+
parsed_structs = []
152+
errors = []
153+
unmappable_variants = []
154+
for allele_response in response:
155+
if 'errorType' in allele_response:
156+
errors.append(
157+
AlleleRegistryError.from_api_response(allele_response, base_url),
158+
)
159+
continue
160+
161+
# Extract CAID and allele info
162+
caid = allele_response['@id'].split('/')[-1]
163+
allele_info = next(
164+
record
165+
for record in allele_response['genomicAlleles']
166+
if record['referenceGenome'] == reference_genome.value
167+
)
168+
chrom = allele_info['chromosome']
169+
pos = allele_info['coordinates'][0]['end']
170+
ref = allele_info['coordinates'][0]['referenceAllele']
171+
alt = allele_info['coordinates'][0]['allele']
172+
173+
if ref == '' or alt == '':
174+
# AR will turn alleles like ["A","ATT"] to ["", "TT"] so try using gnomad IDs instead
175+
if 'externalRecords' in allele_response:
176+
gnomad_id = allele_response['externalRecords'][
177+
reference_genome.allele_registry_gnomad_id
178+
][0]['id']
179+
chrom, pos, ref, alt = gnomad_id.split('-')
180+
else:
181+
unmappable_variants.append(allele_response)
182+
continue
183+
184+
struct = hl.Struct(
185+
locus=hl.Locus(
186+
f'chr{chrom}' if reference_genome == ReferenceGenome.GRCh38 else chrom,
187+
int(pos),
188+
reference_genome=reference_genome.value,
189+
),
190+
alleles=[ref, alt],
191+
CAID=caid,
192+
)
193+
parsed_structs.append(struct)
194+
195+
logger.info(
196+
f'{len(response) - len(errors)} out of {len(response)} variants returned CAID(s)',
197+
)
198+
logger.info(
199+
f'{len(unmappable_variants)} registered variant(s) cannot be mapped back to ours. '
200+
f'\nFirst unmappable variant:\n{unmappable_variants[0]}',
201+
)
202+
if errors:
203+
logger.warning(
204+
f'{len(errors)} failed. First error: {errors[0]}',
205+
)
206+
return hl.Table.parallelize(
207+
parsed_structs,
208+
hl.tstruct(
209+
locus=hl.tlocus(reference_genome.value),
210+
alleles=hl.tarray(hl.tstr),
211+
CAID=hl.tstr,
212+
),
213+
key=('locus', 'alleles'),
214+
)

0 commit comments

Comments
 (0)