Skip to content

Commit caad170

Browse files
authored
Additional fixes for glycans (#221)
* Addtl glycan fixes
* Fix infinite loop
1 parent 3278943 commit caad170

File tree

2 files changed

+21
-11
lines changed

2 files changed

+21
-11
lines changed

chai_lab/data/parsing/glycans.py

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -45,26 +45,29 @@ def _glycan_string_to_sugars_and_bonds(
4545
glycan_string: str,
4646
) -> tuple[list[str], list[GlycosidicBond]]:
4747
"""Parses the glycan string to its constituent sugars and bonds."""
48+
glycan_string = glycan_string.strip() # Remove leading/trailing spaces
4849
sugars: list[str] = [] # Tracks all sugars
4950
parent_sugar_idx: list[int] = [] # Tracks the parent sugar for bond formation
5051
bonds: list[GlycosidicBond] = []
5152
open_count, closed_count = 0, 0
52-
for i in range(len(glycan_string)):
53+
i = 0 # We increment unevenly so manually handle
54+
while i < len(glycan_string):
5355
char = glycan_string[i]
54-
if char == " ":
56+
if char == " ": # Space; skip
57+
i += 1
5558
continue
56-
if char == "(":
59+
if char == "(": # Open bracket
60+
i += 1
5761
open_count += 1
5862
continue
59-
if char == ")":
63+
if char == ")": # Close bracket
6064
closed_count += 1
6165
parent_sugar_idx.pop() # Remove
66+
i += 1
6267
continue
68+
# Not a bracket or a space - should be either bond info or CCD
6369
chunk = glycan_string[i : i + 3]
64-
if re.match(r"[0-9A-Z]{3}", chunk): # Match CCD codes (3 char, alphanumeric)
65-
sugars.append(chunk)
66-
parent_sugar_idx.append(len(sugars) - 1) # latest sugar
67-
elif re.match(r"[1-6]{1}-[1-6]{1}", chunk):
70+
if re.match(r"[1-6]{1}-[1-6]{1}", chunk):
6871
s, d = chunk.split("-")
6972
assert parent_sugar_idx
7073
bonds.append(
@@ -75,6 +78,13 @@ def _glycan_string_to_sugars_and_bonds(
7578
dst_atom=int(d),
7679
)
7780
)
81+
i += 3
82+
elif re.match(r"[0-9A-Z]{3}", chunk):
83+
sugars.append(chunk)
84+
parent_sugar_idx.append(len(sugars) - 1) # latest sugar
85+
i += 3
86+
else:
87+
raise ValueError(f"Invalid glycan string: {glycan_string}")
7888
assert open_count == closed_count
7989
return sugars, bonds
8090

tests/test_glycans.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ def test_parsing_ccd_codes(ccd_code: str):
1414

1515

1616
def test_complex_parsing():
17-
glycan = "MAN(6-1 FUC)(4-1 MAN(6-1 MAN(6-1 MAN)))"
17+
glycan = "MAN(6-1 FUC)(4-1 MAN(6-1 MAN(6-1 MAN)))".replace(" ", "")
1818
sugars, bonds = _glycan_string_to_sugars_and_bonds(glycan)
1919
assert len(sugars) == 5
2020

@@ -32,8 +32,8 @@ def test_complex_parsing():
3232

3333

3434
def test_complex_parsing_2():
35-
glycan = (
36-
"MAN(4-1 FUC(4-1 MAN)(6-1 FUC(4-1 MAN)))(6-1 MAN(6-1 MAN(4-1 MAN)(6-1 FUC)))"
35+
glycan = "MAN(4-1 FUC(4-1 MAN)(6-1 FUC(4-1 MAN)))(6-1 MAN(6-1 MAN(4-1 MAN)(6-1 FUC)))".replace(
36+
" ", ""
3737
)
3838
sugars, bonds = _glycan_string_to_sugars_and_bonds(glycan)
3939
assert len(sugars) == 9

0 commit comments

Comments
 (0)