Commit 5426012

Feature: Start investigating noun phrase features

1 parent 5126ee1 commit 5426012
File tree

3 files changed: +268 −24 lines

ingredient_parser/dataclasses.py

Lines changed: 19 additions & 0 deletions

@@ -14,6 +14,25 @@
 from ._common import UREG


+@dataclass
+class TokenFeatures:
+    stem: str
+    shape: str
+    is_capitalised: bool
+    is_unit: bool
+    is_punc: bool
+    is_ambiguous_unit: bool
+
+
+@dataclass
+class Token:
+    index: int
+    text: str
+    feat_text: str
+    pos_tag: str
+    features: TokenFeatures
+
+
 @dataclass
 class IngredientAmount:
     """Dataclass for holding a parsed ingredient amount.

ingredient_parser/en/_phrases.py

Lines changed: 235 additions & 0 deletions

@@ -0,0 +1,235 @@
#!/usr/bin/env python3

import nltk

from ..dataclasses import Token
from ._constants import FLATTENED_UNITS_LIST


class MIP:
    """
    Multi-ingredient Phrases.

    This class handles the detection of multi-ingredient phrases in an ingredient
    sentence, and the generation of features for tokens within the multi-ingredient
    phrase.

    A multi-ingredient phrase is a phrase within an ingredient sentence that states a
    list of alternative ingredients for a given amount. For example:

    * 2 tbsp butter or olive oil
             ^^^^^^^^^^^^^^^^^^^
    * 1 cup vegetable, olive or sunflower oil
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    """

    mip_parser = nltk.RegexpParser(
        r"""
        # Extended multi-ingredient phrase consisting of 3 ingredients
        EMIP: {<NN.*|JJ.*>+<,><NN.*|JJ.*>+<CC><NN.*|JJ.*>*<NN.*>}
        # Multi-ingredient phrase consisting of 2 ingredients
        MIP: {<NN.*|JJ.*>+<CC><NN.*|JJ.*>*<NN.*>}
        """
    )
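    # Illustrative example, assuming standard nltk RegexpParser behaviour: for
    # the POS-tagged sentence "1 cup butter or olive oil",
    #
    #   tagged = [("1", "CD"), ("cup", "NN"), ("butter", "NN"),
    #             ("or", "CC"), ("olive", "JJ"), ("oil", "NN")]
    #   MIP.mip_parser.parse(tagged)
    #
    # the MIP rule chunks "cup butter or olive oil" into a single MIP subtree;
    # the leading unit "cup" matches <NN.*> and is stripped later in
    # detect_phrases().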
    def __init__(self, tokenized_sentence: list[Token]):
        self.tokenized_sentence = tokenized_sentence
        self.phrases: list[list[int]] = self.detect_phrases(tokenized_sentence)

    def _get_subtree_indices(
        self, parent_tree: nltk.Tree, subtree: nltk.Tree
    ) -> list[int]:
        """Get the indices of a subtree in the parent tree.

        Parameters
        ----------
        parent_tree : nltk.Tree
            Parent tree to find indices of subtree within.
        subtree : nltk.Tree
            Subtree to find within parent tree.

        Returns
        -------
        list[int]
            List of indices of subtree in parent tree.
            If not found, return empty list.
        """
        parent_leaves = parent_tree.leaves()
        subtree_leaves = subtree.leaves()

        subtree_len = len(subtree_leaves)
        for i in range(len(parent_leaves) - subtree_len + 1):
            if (
                parent_leaves[i] == subtree_leaves[0]
                and parent_leaves[i : i + subtree_len] == subtree_leaves
            ):
                return list(range(i, i + subtree_len))

        return []
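    # Continuing the illustrative example above: if the parent tree's leaves
    # are the six (word, tag) pairs and the subtree is the MIP chunk, then
    # _get_subtree_indices returns [1, 2, 3, 4, 5].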
    def _cc_is_not_or(
        self, text_pos: list[tuple[str, str]], indices: list[int]
    ) -> bool:
        """Return True if conjunction in phrase is not "or".

        Parameters
        ----------
        text_pos : list[tuple[str, str]]
            List of (text, pos) tuples.
        indices : list[int]
            Indices of tokens in phrase.

        Returns
        -------
        bool
            True if phrase conjunction is not "or".
        """
        text = [text_pos[i][0] for i in indices]
        pos = [text_pos[i][1] for i in indices]
        try:
            cc_index = pos.index("CC")
            return text[cc_index].lower() != "or"
        except ValueError:
            # Phrase contains no conjunction at all.
            return False
    def detect_phrases(self, tokenized_sentence: list[Token]) -> list[list[int]]:
        """Detect multi-ingredient phrases in tokenized sentence.

        Parameters
        ----------
        tokenized_sentence : list[Token]
            Tokenized sentence to detect phrases within.

        Returns
        -------
        list[list[int]]
            List of phrases. Each phrase is specified by the indices of the
            tokens in the tokenized sentence.
        """
        phrases = []

        text_pos = [(token.text, token.pos_tag) for token in self.tokenized_sentence]
        parsed = self.mip_parser.parse(text_pos)
        for subtree in parsed.subtrees(filter=lambda t: t.label() in ["EMIP", "MIP"]):
            indices = self._get_subtree_indices(parsed, subtree)
            # If the conjunction is not "or", skip.
            if self._cc_is_not_or(text_pos, indices):
                continue

            # If first item in list is a known unit, remove it.
            first_idx = indices[0]
            if self.tokenized_sentence[first_idx].text.lower() in FLATTENED_UNITS_LIST:
                indices = indices[1:]
                # If first index is now a conjunction, skip.
                if self.tokenized_sentence[indices[0]].pos_tag == "CC":
                    continue

            phrases.append(indices)

        return phrases
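    # Continuing the illustrative example: the MIP subtree covers indices
    # [1, 2, 3, 4, 5]; assuming "cup" appears in FLATTENED_UNITS_LIST, the
    # leading unit is dropped and detect_phrases returns [[2, 3, 4, 5]]
    # ("butter or olive oil").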
    def token_features(self, index: int, prefix: str) -> dict[str, str | bool]:
        """Return dict of features for token at index.

        Features:
            "mip": True if index in phrase.
            "cc_distance": Distance between index and conjunction in phrase.
            "mip_start": True if index at start of multi-ingredient phrase.
            "mip_end": True if index at end of multi-ingredient phrase.

        Note: the features listed above are commented out below while under
        investigation; only "name_mod_candidate" is currently emitted.

        Parameters
        ----------
        index : int
            Index of token to return features for.
        prefix : str
            Feature label prefix.

        Returns
        -------
        dict[str, str | bool]
            Dict of features.
            If index is not in phrase, return empty dict.
        """
        features = {}
        for phrase in self.phrases:
            if index not in phrase:
                continue

            # features[prefix + "mip"] = True
            # features[prefix + "cc_distance"] = str(
            #     self._get_distance_from_cc(phrase, index)
            # )

            # if index == phrase[0]:
            #     features[prefix + "mip_start"] = True
            # if index == phrase[-1]:
            #     features[prefix + "mip_end"] = True

            if self._candidate_name_mod(phrase, index):
                features["name_mod_candidate"] = True

        return features
    def _get_distance_from_cc(self, phrase: list[int], index: int) -> int:
        """Calculate distance of index from index of conjunction ("CC") in phrase.

        Parameters
        ----------
        phrase : list[int]
            Indices of phrase tokens.
        index : int
            Index to calculate distance for.

        Returns
        -------
        int
            Distance from conjunction.
            If index occurs before conjunction, this value is negative.
        """
        phrase_pos_tags = [self.tokenized_sentence[i].pos_tag for i in phrase]
        cc_index = phrase_pos_tags.index("CC") + phrase[0]
        return index - cc_index
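    # For the illustrative phrase [2, 3, 4, 5], the conjunction "or" sits at
    # index 3, so "butter" (index 2) has distance -1 and "oil" (index 5) has
    # distance 2.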
    def _candidate_name_mod(self, phrase: list[int], index: int) -> bool:
        """Return True if the token at index is a candidate name modifier.

        A token is a candidate if it is the first token of the phrase's first
        chunk and that chunk contains more than one token, e.g. "chicken" in
        "chicken stock or broth", which modifies both alternatives.
        """
        split_phrase_tokens = list(self._split_phrase(self.tokenized_sentence, phrase))
        if len(split_phrase_tokens[0]) > 1:
            return split_phrase_tokens[0][0].index == index

        return False
    def _split_phrase(self, tokenized_sentence: list[Token], phrase: list[int]):
        """Split the phrase tokens at commas and conjunctions.

        Sublists of tokens are yielded in order, with each comma or
        conjunction token yielded as a single-item list of its own.
        For example, the phrase tokens of
        "vegetable , olive or sunflower oil" are yielded as
        [vegetable], [,], [olive], [or], [sunflower, oil].

        Parameters
        ----------
        tokenized_sentence : list[Token]
            Tokenized sentence the phrase indices refer to.
        phrase : list[int]
            Indices of phrase tokens.

        Yields
        ------
        list[Token]
            Sublists of phrase tokens.
        """
        phrase_tokens = [tokenized_sentence[i] for i in phrase]

        buf = []
        for token in phrase_tokens:
            if token.text == "," or token.pos_tag == "CC":
                yield buf
                yield [token]
                buf = []
            else:
                buf.append(token)
        yield buf
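
Putting the pieces together, a minimal usage sketch. The make_token helper is
hypothetical (it fills TokenFeatures with placeholder values, as in the earlier
sketch), and the expected output assumes "cup" appears in FLATTENED_UNITS_LIST:

    from ingredient_parser.dataclasses import Token, TokenFeatures
    from ingredient_parser.en._phrases import MIP

    def make_token(index: int, text: str, pos_tag: str) -> Token:
        # Hypothetical helper: real tokens are built elsewhere in the package.
        features = TokenFeatures(
            stem=text, shape="x" * len(text), is_capitalised=False,
            is_unit=False, is_punc=False, is_ambiguous_unit=False,
        )
        return Token(index=index, text=text, feat_text=text,
                     pos_tag=pos_tag, features=features)

    tagged = [("1", "CD"), ("cup", "NN"), ("butter", "NN"),
              ("or", "CC"), ("olive", "JJ"), ("oil", "NN")]
    tokens = [make_token(i, text, tag) for i, (text, tag) in enumerate(tagged)]

    mip = MIP(tokens)
    print(mip.phrases)                # [[2, 3, 4, 5]]
    print(mip.token_features(2, ""))  # {} -- most features are still commented out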
