Skip to content

Commit 9e9a7c4

Browse files
authored
Merge branch 'main' into feat/mmcif_model_function
2 parents ee32e4e + 67aa2f2 commit 9e9a7c4

36 files changed

+1783
-505
lines changed

.appveyor.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -14,8 +14,8 @@ install:
1414
- conda config --set always_yes yes --set changeps1 no
1515
- conda update -q conda
1616
- conda info -a
17-
- conda create -q -n test-environment --channel=conda-forge mmtf-python numpy scipy pandas nose looseversion python=%PYTHON_VERSION%
17+
- conda create -q -n test-environment --channel=conda-forge mmtf-python numpy scipy pandas pytest looseversion python=%PYTHON_VERSION%
1818
- activate test-environment
1919

2020
test_script:
21-
- nosetests -s -v
21+
- pytest -s -v
Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,16 @@
1+
name: Changelog Enforcer
2+
3+
on: # yamllint disable-line rule:truthy
4+
pull_request:
5+
types: [opened, synchronize, reopened, ready_for_review, labeled, unlabeled]
6+
7+
jobs:
8+
9+
changelog:
10+
runs-on: ubuntu-latest
11+
12+
steps:
13+
- uses: actions/checkout@v3
14+
- uses: dangoslen/changelog-enforcer@v3
15+
with:
16+
skipLabels: 'skip-changelog'

biopandas/constants.py

Lines changed: 1039 additions & 174 deletions
Large diffs are not rendered by default.

biopandas/mmcif/mmcif_parser.py

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -22,19 +22,28 @@ def __init__(self, parser_obj):
2222
self.names_defined = False
2323

2424
def add_name(self, name):
25-
cat_name = type(name) == str and partition_string(name, ".") or ["", "", ""]
25+
cat_name = (
26+
isinstance(name, str) and partition_string(name, ".") or ["", "", ""]
27+
)
2628
if cat_name[1]:
2729
if cat_name[0] not in self.parser_obj.current_target[-2]:
2830
self.parser_obj.current_target[-2][cat_name[0]] = {}
29-
if cat_name[2] not in self.parser_obj.current_target[-2][cat_name[0]]:
30-
self.parser_obj.current_target[-2][cat_name[0]][cat_name[2]] = []
31+
if (
32+
cat_name[2]
33+
not in self.parser_obj.current_target[-2][cat_name[0]]
34+
):
35+
self.parser_obj.current_target[-2][cat_name[0]][
36+
cat_name[2]
37+
] = []
3138
self.ref_list.append(
3239
self.parser_obj.current_target[-2][cat_name[0]][cat_name[2]]
3340
)
3441
else:
3542
if cat_name[0] not in self.parser_obj.current_target[-2]:
3643
self.parser_obj.current_target[-2][cat_name[0]] = []
37-
self.ref_list.append(self.parser_obj.current_target[-2][cat_name[0]])
44+
self.ref_list.append(
45+
self.parser_obj.current_target[-2][cat_name[0]]
46+
)
3847
self.length = len(self.ref_list)
3948

4049
def push_value(self, value):
@@ -218,16 +227,16 @@ def __repr__(self):
218227
def __cif_float_range__(inp):
219228
try:
220229
pos = inp.index("-", 1)
221-
return (__CIFFloat__(inp[:pos]), __CIFFloat__(inp[pos + 1 :]))
222-
except:
230+
return (__CIFFloat__(inp[:pos]), __CIFFloat__(inp[pos + 1:]))
231+
except Exception:
223232
return (__CIFFloat__(inp),)
224233

225234

226235
def __cif_int_range__(inp):
227236
try:
228237
pos = inp.index("-", 1)
229-
return (__CIFInt__(inp[:pos]), __CIFInt__(inp[pos + 1 :]))
230-
except:
238+
return (__CIFInt__(inp[:pos]), __CIFInt__(inp[pos + 1:]))
239+
except Exception:
231240
return (__CIFInt__(inp),)
232241

233242

@@ -239,12 +248,12 @@ def __load_cif_dic__(dic_file, force=False):
239248
if force:
240249
throw
241250
dic = json.loads(open(jsf).read())
242-
except:
251+
except Exception:
243252
parser = CIFParser()
244253
parser.parse(open(dic_file))
245254
json.dump(parser.data, open(jsf_dic, "w"))
246255
for k, v in parser.data["data_mmcif_pdbx.dic"].items():
247-
if type(v) != dict or "item_type" not in v:
256+
if not isinstance(v, dict) or "item_type" not in v:
248257
continue
249258
name = partition_string(k[6:], ".")
250259
if name[0] not in dic:
@@ -285,11 +294,13 @@ def __dump_cif__(jso):
285294
def __dump_str__(inp):
286295
if inp is None:
287296
return "?"
288-
if type(inp) is not str:
297+
if not isinstance(inp, str):
289298
return str(inp)
290299
if re.search(__CIF_STR_NL_CHECK__, inp) is not None:
291300
return "\n;%s\n;" % inp
292-
return "'%s'" % inp if re.search(__CIF_STR_CHECK__, inp) is not None else inp
301+
return (
302+
"'%s'" % inp if re.search(__CIF_STR_CHECK__, inp) is not None else inp
303+
)
293304

294305

295306
def __pad_string__(inp, flength):
@@ -354,7 +365,7 @@ def __dump_part__(jso):
354365

355366
def load_cif_data(data, do_clean=True, do_type=True):
356367
parser = CIFParser()
357-
if type(data) == str:
368+
if isinstance(data, str):
358369
parser.parse_string(data)
359370
else:
360371
parser.parse(data) # fileobj

biopandas/mmcif/pandas_mmcif.py

Lines changed: 98 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""Class for working with MMCIF files."""
2+
23
# BioPandas
34
# Authors: Arian Jamasb <arian@jamasb.io>,
45
# Authors: Sebastian Raschka <mail@sebastianraschka.com>
@@ -130,56 +131,76 @@ def get_models(self, model_indices: List[int]) -> PandasMmcif:
130131
]
131132
return biopandas_structure
132133

133-
def fetch_mmcif(self, pdb_code: Optional[str] = None, uniprot_id: Optional[str] = None, source: str = "pdb"):
134+
def fetch_mmcif(
135+
self,
136+
pdb_code: Optional[str] = None,
137+
uniprot_id: Optional[str] = None,
138+
source: str = "pdb",
139+
):
134140
"""Fetches mmCIF file contents from the Protein Data Bank at rcsb.org or the AlphaFold database at https://alphafold.ebi.ac.uk/.
135-
.
141+
.
136142
137-
Parameters
138-
----------
139-
pdb_code : str, optional
140-
A 4-letter PDB code, e.g., `"3eiy"` to retrieve structures from the PDB. Defaults to `None`.
143+
Parameters
144+
----------
145+
pdb_code : str, optional
146+
A 4-letter PDB code, e.g., `"3eiy"` to retrieve structures from the PDB. Defaults to `None`.
141147
142-
uniprot_id : str, optional
143-
A UniProt Identifier, e.g., `"Q5VSL9"` to retrieve structures from the AF2 database. Defaults to `None`.
148+
uniprot_id : str, optional
149+
A UniProt Identifier, e.g., `"Q5VSL9"` to retrieve structures from the AF2 database. Defaults to `None`.
144150
145-
source : str
146-
The source to retrieve the structure from
147-
(`"pdb"`, `"alphafold2-v3"` or `"alphafold2-v4"`). Defaults to `"pdb"`.
151+
source : str
152+
The source to retrieve the structure from
153+
(`"pdb"`, `"alphafold2-v3"` or `"alphafold2-v4"`). Defaults to `"pdb"`.
148154
149-
Returns
150-
---------
151-
self
155+
Returns
156+
---------
157+
self
152158
153159
"""
154160
# Sanitize input
155161
invalid_input_identifier_1 = pdb_code is None and uniprot_id is None
156-
invalid_input_identifier_2 = pdb_code is not None and uniprot_id is not None
157-
invalid_input_combination_1 = uniprot_id is not None and source == "pdb"
162+
invalid_input_identifier_2 = (
163+
pdb_code is not None and uniprot_id is not None
164+
)
165+
invalid_input_combination_1 = (
166+
uniprot_id is not None and source == "pdb"
167+
)
158168
invalid_input_combination_2 = pdb_code is not None and source in {
159-
"alphafold2-v3", "alphafold2-v4"}
169+
"alphafold2-v3",
170+
"alphafold2-v4",
171+
}
160172

161173
if invalid_input_identifier_1 or invalid_input_identifier_2:
162174
raise ValueError(
163-
"Please provide either a PDB code or a UniProt ID.")
175+
"Please provide either a PDB code or a UniProt ID."
176+
)
164177

165178
if invalid_input_combination_1:
166179
raise ValueError(
167-
"Please use a 'pdb_code' instead of 'uniprot_id' for source='pdb'.")
180+
"Please use a 'pdb_code' instead of 'uniprot_id' for source='pdb'."
181+
)
168182
elif invalid_input_combination_2:
169183
raise ValueError(
170-
f"Please use a 'uniprot_id' instead of 'pdb_code' for source={source}.")
184+
f"Please use a 'uniprot_id' instead of 'pdb_code' for source={source}."
185+
)
171186

172187
if source == "pdb":
173188
self.mmcif_path, self.mmcif_text = self._fetch_mmcif(pdb_code)
174189
elif source == "alphafold2-v3":
175190
af2_version = 3
176-
self.mmcif_path, self.mmcif_text = self._fetch_af2(uniprot_id, af2_version)
191+
self.mmcif_path, self.mmcif_text = self._fetch_af2(
192+
uniprot_id, af2_version
193+
)
177194
elif source == "alphafold2-v4":
178195
af2_version = 4
179-
self.mmcif_path, self.mmcif_text = self._fetch_af2(uniprot_id, af2_version)
196+
self.mmcif_path, self.mmcif_text = self._fetch_af2(
197+
uniprot_id, af2_version
198+
)
180199
else:
181-
raise ValueError(f"Invalid source: {source}."
182-
" Please use one of 'pdb', 'alphafold2-v3' or 'alphafold2-v4'.")
200+
raise ValueError(
201+
f"Invalid source: {source}."
202+
" Please use one of 'pdb', 'alphafold2-v3' or 'alphafold2-v4'."
203+
)
183204

184205
self._df = self._construct_df(text=self.mmcif_text)
185206
return self
@@ -190,7 +211,8 @@ def _construct_df(self, text: str):
190211
self.data = data
191212
df: Dict[str, pd.DataFrame] = {}
192213
full_df = pd.DataFrame.from_dict(
193-
data["atom_site"], orient="index").transpose()
214+
data["atom_site"], orient="index"
215+
).transpose()
194216
full_df = full_df.astype(mmcif_col_types, errors="ignore")
195217
df["ATOM"] = pd.DataFrame(full_df[full_df.group_PDB == "ATOM"])
196218
df["HETATM"] = pd.DataFrame(full_df[full_df.group_PDB == "HETATM"])
@@ -209,8 +231,9 @@ def _fetch_mmcif(pdb_code):
209231
response = urlopen(url)
210232
txt = response.read()
211233
txt = (
212-
txt.decode(
213-
"utf-8") if sys.version_info[0] >= 3 else txt.encode("ascii")
234+
txt.decode("utf-8")
235+
if sys.version_info[0] >= 3
236+
else txt.encode("ascii")
214237
)
215238
except HTTPError as e:
216239
print(f"HTTP Error {e.code}")
@@ -227,11 +250,15 @@ def _fetch_af2(uniprot_id: str, af2_version: int = 3):
227250
try:
228251
response = urlopen(url)
229252
txt = response.read()
230-
txt = txt.decode('utf-8') if sys.version_info[0] >= 3 else txt.encode('ascii')
253+
txt = (
254+
txt.decode("utf-8")
255+
if sys.version_info[0] >= 3
256+
else txt.encode("ascii")
257+
)
231258
except HTTPError as e:
232-
print(f'HTTP Error {e.code}')
259+
print(f"HTTP Error {e.code}")
233260
except URLError as e:
234-
print(f'URL Error {e.args}')
261+
print(f"URL Error {e.args}")
235262
return url, txt
236263

237264
@staticmethod
@@ -245,7 +272,8 @@ def _read_mmcif(path):
245272
openf = gzip.open
246273
else:
247274
allowed_formats = ", ".join(
248-
(".cif", ".cif.gz", ".mmcif", ".mmcif.gz"))
275+
(".cif", ".cif.gz", ".mmcif", ".mmcif.gz")
276+
)
249277
raise ValueError(
250278
f"Wrong file format; allowed file formats are {allowed_formats}"
251279
)
@@ -255,8 +283,9 @@ def _read_mmcif(path):
255283

256284
if path.endswith(".gz"):
257285
txt = (
258-
txt.decode(
259-
"utf-8") if sys.version_info[0] >= 3 else txt.encode("ascii")
286+
txt.decode("utf-8")
287+
if sys.version_info[0] >= 3
288+
else txt.encode("ascii")
260289
)
261290
return path, txt
262291

@@ -332,14 +361,19 @@ def _get_mainchain(
332361
def _get_hydrogen(df, invert):
333362
"""Return only hydrogen atom entries from a DataFrame, or only non-hydrogen entries if `invert` is true."""
334363
return (
335-
df[(df["type_symbol"] != "H")] if invert else df[(
336-
df["type_symbol"] == "H")]
364+
df[(df["type_symbol"] != "H")]
365+
if invert
366+
else df[(df["type_symbol"] == "H")]
337367
)
338368

339369
@staticmethod
340370
def _get_heavy(df, invert):
341371
"""Return only heavy (non-hydrogen) atom entries from a DataFrame, or only hydrogen entries if `invert` is true."""
342-
return df[df["type_symbol"] == "H"] if invert else df[df["type_symbol"] != "H"]
372+
return (
373+
df[df["type_symbol"] == "H"]
374+
if invert
375+
else df[df["type_symbol"] != "H"]
376+
)
343377

344378
@staticmethod
345379
def _get_calpha(df, invert, atom_col: str = "auth_atom_id"):
@@ -349,7 +383,11 @@ def _get_calpha(df, invert, atom_col: str = "auth_atom_id"):
349383
@staticmethod
350384
def _get_carbon(df, invert):
351385
"""Return only carbon atom entries from a DataFrame, or only non-carbon entries if `invert` is true."""
352-
return df[df["type_symbol"] != "C"] if invert else df[df["type_symbol"] == "C"]
386+
return (
387+
df[df["type_symbol"] != "C"]
388+
if invert
389+
else df[df["type_symbol"] == "C"]
390+
)
353391

354392
def amino3to1(
355393
self,
@@ -400,8 +438,9 @@ def amino3to1(
400438
indices.append(ind)
401439
cmp = num
402440

403-
transl = tmp.iloc[indices][residue_col].map(
404-
amino3to1dict).fillna(fillna)
441+
transl = (
442+
tmp.iloc[indices][residue_col].map(amino3to1dict).fillna(fillna)
443+
)
405444

406445
return pd.concat((tmp.iloc[indices][chain_col], transl), axis=1)
407446

@@ -486,7 +525,9 @@ def distance(self, xyz=(0.00, 0.00, 0.00), records=("ATOM", "HETATM")):
486525

487526
return np.sqrt(
488527
np.sum(
489-
df[["Cartn_x", "Cartn_y", "Cartn_z"]].subtract(xyz, axis=1) ** 2, axis=1
528+
df[["Cartn_x", "Cartn_y", "Cartn_z"]].subtract(xyz, axis=1)
529+
** 2,
530+
axis=1,
490531
)
491532
)
492533

@@ -512,7 +553,9 @@ def distance_df(df, xyz=(0.00, 0.00, 0.00)):
512553
"""
513554
return np.sqrt(
514555
np.sum(
515-
df[["Cartn_x", "Cartn_y", "Cartn_z"]].subtract(xyz, axis=1) ** 2, axis=1
556+
df[["Cartn_x", "Cartn_y", "Cartn_z"]].subtract(xyz, axis=1)
557+
** 2,
558+
axis=1,
516559
)
517560
)
518561

@@ -546,7 +589,11 @@ def read_mmcif_from_list(self, mmcif_lines):
546589
self.code = self.data["entry"]["id"][0].lower()
547590
return self
548591

549-
def convert_to_pandas_pdb(self, offset_chains: bool = True, records: List[str] = ["ATOM", "HETATM"]) -> PandasPdb:
592+
def convert_to_pandas_pdb(
593+
self,
594+
offset_chains: bool = True,
595+
records: List[str] = ["ATOM", "HETATM"],
596+
) -> PandasPdb:
550597
"""Returns a PandasPdb object with the same data as the PandasMmcif
551598
object.
552599
@@ -586,10 +633,15 @@ def convert_to_pandas_pdb(self, offset_chains: bool = True, records: List[str] =
586633

587634
# Update atom numbers
588635
if offset_chains:
589-
offsets = pandaspdb.df["ATOM"]["chain_id"].astype(
590-
"category").cat.codes
591-
pandaspdb.df["ATOM"]["atom_number"] = pandaspdb.df["ATOM"]["atom_number"] + offsets
636+
offsets = (
637+
pandaspdb.df["ATOM"]["chain_id"].astype("category").cat.codes
638+
)
639+
pandaspdb.df["ATOM"]["atom_number"] = (
640+
pandaspdb.df["ATOM"]["atom_number"] + offsets
641+
)
592642
hetatom_offset = offsets.max() + 1
593-
pandaspdb.df["HETATM"]["atom_number"] = pandaspdb.df["HETATM"]["atom_number"] + hetatom_offset
643+
pandaspdb.df["HETATM"]["atom_number"] = (
644+
pandaspdb.df["HETATM"]["atom_number"] + hetatom_offset
645+
)
594646

595647
return pandaspdb

0 commit comments

Comments
 (0)