
Commit d660dc9

Merge pull request 'noun-phrase-features' (#221) from noun-phrase-features into develop
2 parents 94f34d2 + 4ce0507

File tree

5 files changed: +354 -39 lines changed


ingredient_parser/dataclasses.py

Lines changed: 19 additions & 0 deletions
@@ -14,6 +14,25 @@
 from ._common import UREG


+@dataclass
+class TokenFeatures:
+    stem: str
+    shape: str
+    is_capitalised: bool
+    is_unit: bool
+    is_punc: bool
+    is_ambiguous_unit: bool
+
+
+@dataclass
+class Token:
+    index: int
+    text: str
+    feat_text: str
+    pos_tag: str
+    features: TokenFeatures
+
+
 @dataclass
 class IngredientAmount:
     """Dataclass for holding a parsed ingredient amount.

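The two new dataclasses are plain containers: Token is consumed by the new _phrases.py module below, and TokenFeatures is carried on each Token. A minimal sketch of how they fit together (assuming this branch is importable; the field values, including the "shape" string, are illustrative rather than output from the real tokenizer):

    from ingredient_parser.dataclasses import Token, TokenFeatures

    # Illustrative values only; the real pipeline populates these fields itself.
    features = TokenFeatures(
        stem="tablespoon",
        shape="xxxx",
        is_capitalised=False,
        is_unit=True,
        is_punc=False,
        is_ambiguous_unit=False,
    )
    token = Token(
        index=1,
        text="tablespoons",
        feat_text="tablespoon",
        pos_tag="NNS",
        features=features,
    )
    print(token.features.is_unit)  # True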
ingredient_parser/en/_constants.py

Lines changed: 30 additions & 18 deletions
@@ -123,24 +123,6 @@
 # since we need this in a few places
 FLATTENED_UNITS_LIST = set(chain.from_iterable(UNITS.items()))

-# Words that can modify a unit
-UNIT_MODIFIERS = [
-    "big",
-    "fat",
-    "generous",
-    "healthy",
-    "heaped",
-    "heaping",
-    "large",
-    "medium",
-    "medium-size",
-    "medium-sized",
-    "scant",
-    "small",
-    "thick",
-    "thin",
-]
-
 # Units that can be part of the name
 # e.g. 1 teaspoon ground cloves, or 5 bay leaves
 AMBIGUOUS_UNITS = [
@@ -163,6 +145,36 @@

 AMBIGUOUS_UNITS.extend(_ambiguous_units_alt_forms)

+# Words that indicate ingredient size
+SIZES = [
+    "big",
+    "bite-size",
+    "bite-sized",
+    "extra-large",
+    "jumbo",
+    "large",
+    "lg",
+    "little",
+    "md",
+    "medium",
+    "medium-large",
+    "medium-size",
+    "medium-sized",
+    "medium-small",
+    "medium-to-large",
+    "miniature",
+    "regular",
+    "slim",
+    "sm",
+    "small",
+    "small-to-medium",
+    "smaller",
+    "smallest",
+    "thick",
+    "thin",
+    "tiny",
+]
+

 # Strings and their numeric representation
 STRING_NUMBERS = {

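The new SIZES list overlaps with several entries of the removed UNIT_MODIFIERS list and is consumed by the new _phrases.py module below, where size and unit tokens are stripped from the start of a candidate phrase. A small illustrative check (import path taken from this diff; whether any particular unit string is present depends on the UNITS table):

    from ingredient_parser.en._constants import FLATTENED_UNITS_LIST, SIZES

    # Mirror the discard list built in _phrases.py.
    tokens_to_discard = [*FLATTENED_UNITS_LIST, *SIZES]
    print("large" in tokens_to_discard)   # True - listed in SIZES
    print("butter" in tokens_to_discard)  # False - neither a size nor a unit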
ingredient_parser/en/_phrases.py

Lines changed: 229 additions & 0 deletions
@@ -0,0 +1,229 @@
#!/usr/bin/env python3

import nltk

from ..dataclasses import Token
from ._constants import FLATTENED_UNITS_LIST, SIZES


class MIP:
    """
    Multi-ingredient Phrases.

    This class handles the detection of multi-ingredient phrases in an ingredient
    sentence, and the generation of features for tokens within the multi-ingredient
    phrase.

    A multi-ingredient phrase is a phrase within an ingredient sentence that states a
    list of alternative ingredients for a given amount. For example
    * 2 tbsp butter or olive oil
             ^^^^^^^^^^^^^^^^^^^
    * 1 cup vegetable, olive or sunflower oil
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    """

    mip_parser = nltk.RegexpParser(
        r"""
        # Extended multi-ingredient phrase consisting of 3 ingredients
        EMIP: {<NN.*|JJ.*>+<,><NN.*|JJ.*>+<CC><NN.*|JJ.*>*<NN.*>}
        # Multi-ingredient phrase consisting of 2 ingredients
        MIP: {<NN.*|JJ.*>+<CC><NN.*|JJ.*>*<NN.*>}
        """
    )

    def __init__(self, tokenized_sentence: list[Token]):
        self.tokenized_sentence = tokenized_sentence
        self.phrases: list[list[int]] = self.detect_phrases(tokenized_sentence)

    def _get_subtree_indices(
        self, parent_tree: nltk.Tree, subtree: nltk.Tree
    ) -> list[int]:
        """Get the indices of a subtree in the parent tree.

        Parameters
        ----------
        parent_tree : nltk.Tree
            Parent tree to find indices of subtree within.
        subtree : nltk.Tree
            Subtree to find within parent tree.

        Returns
        -------
        list[int]
            List of indices of subtree in parent tree.
            If not found, return empty list.
        """
        parent_leaves = parent_tree.leaves()
        subtree_leaves = subtree.leaves()

        subtree_len = len(subtree_leaves)
        for i in range(len(parent_leaves) - subtree_len + 1):
            if (
                parent_leaves[i] == subtree_leaves[0]
                and parent_leaves[i : i + subtree_len] == subtree_leaves
            ):
                return list(range(i, i + subtree_len))

        return []

    def _cc_is_not_or(
        self, text_pos: list[tuple[str, str]], indices: list[int]
    ) -> bool:
        """Return True if the conjunction in the phrase is not "or".

        Parameters
        ----------
        text_pos : list[tuple[str, str]]
            List of (text, pos) tuples.
        indices : list[int]
            Indices of tokens in phrase.

        Returns
        -------
        bool
            True if the phrase conjunction is not "or".
            False if it is "or", or if the phrase contains no conjunction.
        """
        text = [text_pos[i][0] for i in indices]
        pos = [text_pos[i][1] for i in indices]
        try:
            cc_index = pos.index("CC")
            return text[cc_index].lower() != "or"
        except ValueError:
            return False

    def detect_phrases(self, tokenized_sentence: list[Token]) -> list[list[int]]:
        """Detect multi-ingredient phrases in tokenized sentence.

        Parameters
        ----------
        tokenized_sentence : list[Token]
            Tokenized sentence to detect phrases within.

        Returns
        -------
        list[list[int]]
            List of phrases. Each phrase is specified by the indices of the tokens in
            the tokenized sentence.
        """
        phrases = []

        text_pos = [(token.text, token.pos_tag) for token in self.tokenized_sentence]
        parsed = self.mip_parser.parse(text_pos)
        for subtree in parsed.subtrees(filter=lambda t: t.label() in ["EMIP", "MIP"]):
            indices = self._get_subtree_indices(parsed, subtree)
            # If the conjunction is not "or", skip.
            if self._cc_is_not_or(text_pos, indices):
                continue

            # Remove any units or sizes from the beginning of the phrase.
            tokens_to_discard = [*FLATTENED_UNITS_LIST, *SIZES]
            while (
                indices
                and self.tokenized_sentence[indices[0]].text.lower() in tokens_to_discard
            ):
                indices = indices[1:]

            # If phrase is empty, skip.
            if not indices:
                continue

            # If first token is now a conjunction, skip.
            if self.tokenized_sentence[indices[0]].pos_tag == "CC":
                continue

            phrases.append(indices)

        return phrases

    def token_features(self, index: int, prefix: str) -> dict[str, str | bool]:
        """Return dict of features for token at index.

        Features:
            "mip_start": True if index at start of multi-ingredient phrase.
            "mip_end": True if index at end of multi-ingredient phrase.

        Parameters
        ----------
        index : int
            Index of token to return features for.
        prefix : str
            Feature label prefix.

        Returns
        -------
        dict[str, str | bool]
            Dict of features.
            If index is not in a phrase, return empty dict.
        """
        features = {}
        for phrase in self.phrases:
            if index not in phrase:
                continue

            if index == phrase[0]:
                features[prefix + "mip_start"] = True

            if index == phrase[-1]:
                features[prefix + "mip_end"] = True

        return features

    def _candidate_name_mod(self, phrase: list[int], index: int) -> bool:
        """Return True if token at index in phrase is a candidate for the NAME_MOD label.

        A token is a candidate for NAME_MOD if it is the first token of the phrase and
        the first comma- or conjunction-delimited part of the phrase contains more than
        one token.

        Parameters
        ----------
        phrase : list[int]
            List of token indices for phrase.
        index : int
            Index of token to consider.

        Returns
        -------
        bool
            True if token is a NAME_MOD candidate.
        """
        split_phrase_tokens = list(self._split_phrase(self.tokenized_sentence, phrase))
        if len(split_phrase_tokens[0]) > 1:
            return split_phrase_tokens[0][0].index == index

        return False

    def _split_phrase(self, tokenized_sentence: list[Token], phrase: list[int]):
        """Split the phrase tokens into sub-lists, delimited by commas and conjunctions.

        Each delimiting token is yielded as a single-item list between the sub-lists.
        For example, the phrase "vegetable , olive or sunflower oil" yields
        [vegetable], [,], [olive], [or], [sunflower, oil].

        Parameters
        ----------
        tokenized_sentence : list[Token]
            Tokenized sentence containing the phrase.
        phrase : list[int]
            List of token indices for phrase.

        Yields
        ------
        list[Token]
            Sub-lists of the phrase tokens.
        """
        phrase_tokens = [tokenized_sentence[i] for i in phrase]

        buf = []
        for token in phrase_tokens:
            if token.text == "," or token.pos_tag == "CC":
                yield buf
                yield [token]
                buf = []
            else:
                buf.append(token)
        yield buf

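A usage sketch of the new MIP class, with Token objects and POS tags assembled by hand (in the real pipeline they come from the tokenizer and POS tagger); make_token and the hand-assigned tags are illustrative helpers, not part of the library, and the noted outputs assume "tbsp" appears in FLATTENED_UNITS_LIST:

    from ingredient_parser.dataclasses import Token, TokenFeatures
    from ingredient_parser.en._phrases import MIP

    def make_token(index: int, text: str, pos_tag: str) -> Token:
        # Placeholder features: MIP only reads index, text and pos_tag.
        feats = TokenFeatures(
            stem=text,
            shape="x",
            is_capitalised=False,
            is_unit=False,
            is_punc=False,
            is_ambiguous_unit=False,
        )
        return Token(
            index=index, text=text, feat_text=text, pos_tag=pos_tag, features=feats
        )

    # "2 tbsp butter or olive oil" with hand-assigned POS tags.
    tagged = [("2", "CD"), ("tbsp", "NN"), ("butter", "NN"),
              ("or", "CC"), ("olive", "JJ"), ("oil", "NN")]
    tokens = [make_token(i, text, pos) for i, (text, pos) in enumerate(tagged)]

    mip = MIP(tokens)
    print(mip.phrases)                # likely [[2, 3, 4, 5]]: "butter or olive oil"
    print(mip.token_features(2, ""))  # likely {'mip_start': True}
    print(mip.token_features(5, ""))  # likely {'mip_end': True}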