1
1
"""Class for working with MMCIF files."""
2
+
2
3
# BioPandas
3
4
# Authors: Arian Jamasb <arian@jamasb.io>,
4
5
# Authors: Sebastian Raschka <mail@sebastianraschka.com>
@@ -130,56 +131,76 @@ def get_models(self, model_indices: List[int]) -> PandasMmcif:
130
131
]
131
132
return biopandas_structure
132
133
133
def fetch_mmcif(
    self,
    pdb_code: Optional[str] = None,
    uniprot_id: Optional[str] = None,
    source: str = "pdb",
):
    """Fetch mmCIF file contents from the PDB or the AlphaFold database.

    Structures are retrieved either from the Protein Databank at rcsb.org
    or from the AlphaFold database at https://alphafold.ebi.ac.uk/.

    Parameters
    ----------
    pdb_code : str, optional
        A 4-letter PDB code, e.g., `"3eiy"` to retrieve structures from
        the PDB. Defaults to `None`.

    uniprot_id : str, optional
        A UniProt Identifier, e.g., `"Q5VSL9"` to retrieve structures
        from the AF2 database. Defaults to `None`.

    source : str
        The source to retrieve the structure from
        (`"pdb"`, `"alphafold2-v3"` or `"alphafold2-v4"`).
        Defaults to `"pdb"`.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If neither or both of `pdb_code` and `uniprot_id` are given, if
        the identifier kind does not match `source`, or if `source` is
        not one of the supported values.

    """
    # Maps each AlphaFold source name to the version number handed to
    # `_fetch_af2`.
    af2_versions = {"alphafold2-v3": 3, "alphafold2-v4": 4}

    # Sanitize input: exactly one of the two identifiers must be supplied.
    if (pdb_code is None) == (uniprot_id is None):
        raise ValueError(
            "Please provide either a PDB code or a UniProt ID."
        )

    # The identifier kind has to match the requested source.
    if uniprot_id is not None and source == "pdb":
        raise ValueError(
            "Please use a 'pdb_code' instead of 'uniprot_id' for source='pdb'."
        )
    if pdb_code is not None and source in af2_versions:
        raise ValueError(
            f"Please use a 'uniprot_id' instead of 'pdb_code' for source={source}."
        )

    if source == "pdb":
        self.mmcif_path, self.mmcif_text = self._fetch_mmcif(pdb_code)
    elif source in af2_versions:
        self.mmcif_path, self.mmcif_text = self._fetch_af2(
            uniprot_id, af2_versions[source]
        )
    else:
        raise ValueError(
            f"Invalid source: {source}."
            " Please use one of 'pdb', 'alphafold2-v3' or 'alphafold2-v4'."
        )

    # Parse the downloaded text into the per-record DataFrames.
    self._df = self._construct_df(text=self.mmcif_text)
    return self
@@ -190,7 +211,8 @@ def _construct_df(self, text: str):
190
211
self .data = data
191
212
df : Dict [str , pd .DataFrame ] = {}
192
213
full_df = pd .DataFrame .from_dict (
193
- data ["atom_site" ], orient = "index" ).transpose ()
214
+ data ["atom_site" ], orient = "index"
215
+ ).transpose ()
194
216
full_df = full_df .astype (mmcif_col_types , errors = "ignore" )
195
217
df ["ATOM" ] = pd .DataFrame (full_df [full_df .group_PDB == "ATOM" ])
196
218
df ["HETATM" ] = pd .DataFrame (full_df [full_df .group_PDB == "HETATM" ])
@@ -209,8 +231,9 @@ def _fetch_mmcif(pdb_code):
209
231
response = urlopen (url )
210
232
txt = response .read ()
211
233
txt = (
212
- txt .decode (
213
- "utf-8" ) if sys .version_info [0 ] >= 3 else txt .encode ("ascii" )
234
+ txt .decode ("utf-8" )
235
+ if sys .version_info [0 ] >= 3
236
+ else txt .encode ("ascii" )
214
237
)
215
238
except HTTPError as e :
216
239
print (f"HTTP Error { e .code } " )
@@ -227,11 +250,15 @@ def _fetch_af2(uniprot_id: str, af2_version: int = 3):
227
250
try :
228
251
response = urlopen (url )
229
252
txt = response .read ()
230
- txt = txt .decode ('utf-8' ) if sys .version_info [0 ] >= 3 else txt .encode ('ascii' )
253
+ txt = (
254
+ txt .decode ("utf-8" )
255
+ if sys .version_info [0 ] >= 3
256
+ else txt .encode ("ascii" )
257
+ )
231
258
except HTTPError as e :
232
- print (f' HTTP Error { e .code } ' )
259
+ print (f" HTTP Error { e .code } " )
233
260
except URLError as e :
234
- print (f' URL Error { e .args } ' )
261
+ print (f" URL Error { e .args } " )
235
262
return url , txt
236
263
237
264
@staticmethod
@@ -245,7 +272,8 @@ def _read_mmcif(path):
245
272
openf = gzip .open
246
273
else :
247
274
allowed_formats = ", " .join (
248
- (".cif" , ".cif.gz" , ".mmcif" , ".mmcif.gz" ))
275
+ (".cif" , ".cif.gz" , ".mmcif" , ".mmcif.gz" )
276
+ )
249
277
raise ValueError (
250
278
f"Wrong file format; allowed file formats are { allowed_formats } "
251
279
)
@@ -255,8 +283,9 @@ def _read_mmcif(path):
255
283
256
284
if path .endswith (".gz" ):
257
285
txt = (
258
- txt .decode (
259
- "utf-8" ) if sys .version_info [0 ] >= 3 else txt .encode ("ascii" )
286
+ txt .decode ("utf-8" )
287
+ if sys .version_info [0 ] >= 3
288
+ else txt .encode ("ascii" )
260
289
)
261
290
return path , txt
262
291
@@ -332,14 +361,19 @@ def _get_mainchain(
332
361
def _get_hydrogen (df , invert ):
333
362
"""Return only hydrogen atom entries from a DataFrame"""
334
363
return (
335
- df [(df ["type_symbol" ] != "H" )] if invert else df [(
336
- df ["type_symbol" ] == "H" )]
364
+ df [(df ["type_symbol" ] != "H" )]
365
+ if invert
366
+ else df [(df ["type_symbol" ] == "H" )]
337
367
)
338
368
339
369
@staticmethod
340
370
def _get_heavy (df , invert ):
341
371
"""Return only heavy atom entries from a DataFrame"""
342
- return df [df ["type_symbol" ] == "H" ] if invert else df [df ["type_symbol" ] != "H" ]
372
+ return (
373
+ df [df ["type_symbol" ] == "H" ]
374
+ if invert
375
+ else df [df ["type_symbol" ] != "H" ]
376
+ )
343
377
344
378
@staticmethod
345
379
def _get_calpha (df , invert , atom_col : str = "auth_atom_id" ):
@@ -349,7 +383,11 @@ def _get_calpha(df, invert, atom_col: str = "auth_atom_id"):
349
383
@staticmethod
350
384
def _get_carbon (df , invert ):
351
385
"""Return carbon atom entries from a DataFrame"""
352
- return df [df ["type_symbol" ] != "C" ] if invert else df [df ["type_symbol" ] == "C" ]
386
+ return (
387
+ df [df ["type_symbol" ] != "C" ]
388
+ if invert
389
+ else df [df ["type_symbol" ] == "C" ]
390
+ )
353
391
354
392
def amino3to1 (
355
393
self ,
@@ -400,8 +438,9 @@ def amino3to1(
400
438
indices .append (ind )
401
439
cmp = num
402
440
403
- transl = tmp .iloc [indices ][residue_col ].map (
404
- amino3to1dict ).fillna (fillna )
441
+ transl = (
442
+ tmp .iloc [indices ][residue_col ].map (amino3to1dict ).fillna (fillna )
443
+ )
405
444
406
445
return pd .concat ((tmp .iloc [indices ][chain_col ], transl ), axis = 1 )
407
446
@@ -486,7 +525,9 @@ def distance(self, xyz=(0.00, 0.00, 0.00), records=("ATOM", "HETATM")):
486
525
487
526
return np .sqrt (
488
527
np .sum (
489
- df [["Cartn_x" , "Cartn_y" , "Cartn_z" ]].subtract (xyz , axis = 1 ) ** 2 , axis = 1
528
+ df [["Cartn_x" , "Cartn_y" , "Cartn_z" ]].subtract (xyz , axis = 1 )
529
+ ** 2 ,
530
+ axis = 1 ,
490
531
)
491
532
)
492
533
@@ -512,7 +553,9 @@ def distance_df(df, xyz=(0.00, 0.00, 0.00)):
512
553
"""
513
554
return np .sqrt (
514
555
np .sum (
515
- df [["Cartn_x" , "Cartn_y" , "Cartn_z" ]].subtract (xyz , axis = 1 ) ** 2 , axis = 1
556
+ df [["Cartn_x" , "Cartn_y" , "Cartn_z" ]].subtract (xyz , axis = 1 )
557
+ ** 2 ,
558
+ axis = 1 ,
516
559
)
517
560
)
518
561
@@ -546,7 +589,11 @@ def read_mmcif_from_list(self, mmcif_lines):
546
589
self .code = self .data ["entry" ]["id" ][0 ].lower ()
547
590
return self
548
591
549
- def convert_to_pandas_pdb (self , offset_chains : bool = True , records : List [str ] = ["ATOM" , "HETATM" ]) -> PandasPdb :
592
+ def convert_to_pandas_pdb (
593
+ self ,
594
+ offset_chains : bool = True ,
595
+ records : List [str ] = ["ATOM" , "HETATM" ],
596
+ ) -> PandasPdb :
550
597
"""Returns a PandasPdb object with the same data as the PandasMmcif
551
598
object.
552
599
@@ -586,10 +633,15 @@ def convert_to_pandas_pdb(self, offset_chains: bool = True, records: List[str] =
586
633
587
634
# Update atom numbers
588
635
if offset_chains :
589
- offsets = pandaspdb .df ["ATOM" ]["chain_id" ].astype (
590
- "category" ).cat .codes
591
- pandaspdb .df ["ATOM" ]["atom_number" ] = pandaspdb .df ["ATOM" ]["atom_number" ] + offsets
636
+ offsets = (
637
+ pandaspdb .df ["ATOM" ]["chain_id" ].astype ("category" ).cat .codes
638
+ )
639
+ pandaspdb .df ["ATOM" ]["atom_number" ] = (
640
+ pandaspdb .df ["ATOM" ]["atom_number" ] + offsets
641
+ )
592
642
hetatom_offset = offsets .max () + 1
593
- pandaspdb .df ["HETATM" ]["atom_number" ] = pandaspdb .df ["HETATM" ]["atom_number" ] + hetatom_offset
643
+ pandaspdb .df ["HETATM" ]["atom_number" ] = (
644
+ pandaspdb .df ["HETATM" ]["atom_number" ] + hetatom_offset
645
+ )
594
646
595
647
return pandaspdb
0 commit comments