
Commit 34713c1

Merge branch 'develop' into label-constraints

2 parents 7b92034 + d2ff5c0

7 files changed (+442, -39 lines)

ingredient_parser/dataclasses.py
Lines changed: 19 additions & 0 deletions

@@ -14,6 +14,25 @@
 from ._common import UREG


+@dataclass
+class TokenFeatures:
+    stem: str
+    shape: str
+    is_capitalised: bool
+    is_unit: bool
+    is_punc: bool
+    is_ambiguous_unit: bool
+
+
+@dataclass
+class Token:
+    index: int
+    text: str
+    feat_text: str
+    pos_tag: str
+    features: TokenFeatures
+
+
 @dataclass
 class IngredientAmount:
     """Dataclass for holding a parsed ingredient amount.

ingredient_parser/en/_constants.py
Lines changed: 30 additions & 18 deletions

@@ -123,24 +123,6 @@
 # since we need this in a few places
 FLATTENED_UNITS_LIST = set(chain.from_iterable(UNITS.items()))

-# Words that can modify a unit
-UNIT_MODIFIERS = [
-    "big",
-    "fat",
-    "generous",
-    "healthy",
-    "heaped",
-    "heaping",
-    "large",
-    "medium",
-    "medium-size",
-    "medium-sized",
-    "scant",
-    "small",
-    "thick",
-    "thin",
-]
-
 # Units that can be part of the name
 # e.g. 1 teaspoon ground cloves, or 5 bay leaves
 AMBIGUOUS_UNITS = [

@@ -163,6 +145,36 @@

 AMBIGUOUS_UNITS.extend(_ambiguous_units_alt_forms)

+# Words that indicate ingredient size
+SIZES = [
+    "big",
+    "bite-size",
+    "bite-sized",
+    "extra-large",
+    "jumbo",
+    "large",
+    "lg",
+    "little",
+    "md",
+    "medium",
+    "medium-large",
+    "medium-size",
+    "medium-sized",
+    "medium-small",
+    "medium-to-large",
+    "miniature",
+    "regular",
+    "slim",
+    "sm",
+    "small",
+    "small-to-medium",
+    "smaller",
+    "smallest",
+    "thick",
+    "thin",
+    "tiny",
+]
+

 # Strings and their numeric representation
 STRING_NUMBERS = {
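For context on the FLATTENED_UNITS_LIST line above: chaining over dict.items() puts both the keys and the values of UNITS into a single set. A minimal sketch with an invented two-entry stand-in for UNITS (the real mapping is defined elsewhere in _constants.py and its exact contents are not shown in this diff):

from itertools import chain

# Invented stand-in for the real UNITS mapping.
UNITS = {"cup": "cups", "tablespoon": "tablespoons"}

# items() yields (key, value) pairs; from_iterable flattens them.
FLATTENED_UNITS_LIST = set(chain.from_iterable(UNITS.items()))
# {'cup', 'cups', 'tablespoon', 'tablespoons'}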
Lines changed: 223 additions & 0 deletions

@@ -0,0 +1,223 @@
+#!/usr/bin/env python3
+
+import nltk
+
+from ..dataclasses import Token
+from ._constants import FLATTENED_UNITS_LIST, SIZES
+
+
+class SentenceStrucureFeatures:
+    """
+    Sentence structure features.
+
+    This class handles the detection and feature generation related to the structure
+    of the ingredient sentence.
+
+    * Multi-ingredient phrases
+      A multi-ingredient phrase is a phrase within an ingredient sentence that
+      states a list of alternative ingredients for a given amount. For example:
+        * 2 tbsp butter or olive oil
+                 ^^^^^^^^^^^^^^^^^^^
+        * 1 cup vegetable, olive or sunflower oil
+                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+    * Compound sentences containing multiple ingredients of different amounts
+      A compound sentence is a sentence that includes more than one subject.
+      For example:
+        * 1 tablespoon chopped fresh sage or 1 teaspoon dried sage
+                                          ^^^^^^^^^^^^^^^^^^^^^^^^
+    """
+
+    mip_parser = nltk.RegexpParser(
+        r"""
+        # Extended multi-ingredient phrase consisting of 3 ingredients
+        EMIP: {<NN.*|JJ.*>+<,><NN.*|JJ.*>+<CC><NN.*|JJ.*>*<NN.*>}
+        # Multi-ingredient phrase consisting of 2 ingredients
+        MIP: {<NN.*|JJ.*>+<CC><NN.*|JJ.*>*<NN.*>}
+        """
+    )
+
+    # RegexpParser to detect the start of the subject phrase.
+    # UNIT and SIZE are custom tags, based on the FLATTENED_UNITS_LIST and SIZES
+    # constants.
+    compound_parser = nltk.RegexpParser(r"CS: {<CC><CD>+<NN.*|UNIT|SIZE>}")
+
+    def __init__(self, tokenized_sentence: list[Token]):
+        self.tokenized_sentence = tokenized_sentence
+        self.phrases = self.detect_phrases(tokenized_sentence)
+        self.sentence_splits = self.detect_sentences_splits(tokenized_sentence)
+
+    def _get_subtree_indices(
+        self, parent_tree: nltk.Tree, subtree: nltk.Tree
+    ) -> list[int]:
+        """Get the indices of a subtree in the parent tree.
+
+        Parameters
+        ----------
+        parent_tree : nltk.Tree
+            Parent tree to find indices of subtree within.
+        subtree : nltk.Tree
+            Subtree to find within parent tree.
+
+        Returns
+        -------
+        list[int]
+            List of indices of subtree in parent tree.
+            If not found, return empty list.
+        """
+        parent_leaves = parent_tree.leaves()
+        subtree_leaves = subtree.leaves()
+
+        # Slide a window over the parent leaves until it matches the subtree.
+        subtree_len = len(subtree_leaves)
+        for i in range(len(parent_leaves) - subtree_len + 1):
+            if parent_leaves[i : i + subtree_len] == subtree_leaves:
+                return list(range(i, i + subtree_len))
+
+        return []
+
+    def _cc_is_not_or(
+        self, text_pos: list[tuple[str, str]], indices: list[int]
+    ) -> bool:
+        """Return True if the conjunction in the phrase is not "or".
+
+        Parameters
+        ----------
+        text_pos : list[tuple[str, str]]
+            List of (text, pos) tuples.
+        indices : list[int]
+            Indices of tokens in phrase.
+
+        Returns
+        -------
+        bool
+            True if phrase conjunction is not "or".
+        """
+        text = [text_pos[i][0] for i in indices]
+        pos = [text_pos[i][1] for i in indices]
+        try:
+            cc_index = pos.index("CC")
+        except ValueError:
+            # No conjunction in the phrase at all.
+            return False
+
+        return text[cc_index].lower() != "or"
+
+    def detect_phrases(self, tokenized_sentence: list[Token]) -> list[list[int]]:
+        """Detect multi-ingredient phrases in tokenized sentence.
+
+        Parameters
+        ----------
+        tokenized_sentence : list[Token]
+            Tokenized sentence to detect phrases within.
+
+        Returns
+        -------
+        list[list[int]]
+            List of phrases. Each phrase is specified by the indices of the tokens
+            in the tokenized sentence.
+        """
+        phrases = []
+
+        text_pos = [(token.text, token.pos_tag) for token in tokenized_sentence]
+        parsed = self.mip_parser.parse(text_pos)
+        for subtree in parsed.subtrees(filter=lambda t: t.label() in ["EMIP", "MIP"]):
+            indices = self._get_subtree_indices(parsed, subtree)
+            # If the conjunction is not "or", skip.
+            if self._cc_is_not_or(text_pos, indices):
+                continue
+
+            # Remove any units or sizes from the beginning of the phrase.
+            # Guard on indices being non-empty so the loop cannot index into an
+            # empty list if every token in the phrase is discarded.
+            tokens_to_discard = [*FLATTENED_UNITS_LIST, *SIZES]
+            while indices and (
+                tokenized_sentence[indices[0]].text.lower() in tokens_to_discard
+            ):
+                indices = indices[1:]
+
+            # If the phrase is now empty, skip.
+            if not indices:
+                continue
+
+            # If the phrase now starts with a conjunction, skip.
+            if tokenized_sentence[indices[0]].pos_tag == "CC":
+                continue
+
+            phrases.append(indices)
+
+        return phrases
+
+    def detect_sentences_splits(self, tokenized_sentence: list[Token]) -> list[int]:
+        """Return indices of tokens that mark a split in sentence subject.
+
+        Parameters
+        ----------
+        tokenized_sentence : list[Token]
+            Tokenized sentence to detect splits within.
+
+        Returns
+        -------
+        list[int]
+            List of indices.
+        """
+        split_indices = []
+
+        # Re-tag unit and size tokens with the custom UNIT and SIZE tags used by
+        # the compound_parser grammar.
+        text_pos = []
+        for t in tokenized_sentence:
+            if t.text.lower() in FLATTENED_UNITS_LIST:
+                pos = "UNIT"
+            elif t.text.lower() in SIZES:
+                pos = "SIZE"
+            else:
+                pos = t.pos_tag
+
+            text_pos.append((t.feat_text, pos))
+
+        parsed = self.compound_parser.parse(text_pos)
+        for subtree in parsed.subtrees(filter=lambda t: t.label() == "CS"):
+            indices = self._get_subtree_indices(parsed, subtree)
+            # If the conjunction is not "or", skip.
+            if self._cc_is_not_or(text_pos, indices):
+                continue
+
+            split_indices.append(indices[0])
+
+        return split_indices
+
+    def token_features(self, index: int, prefix: str) -> dict[str, bool]:
+        """Return dict of features for token at index.
+
+        Features:
+            "mip_start": True if index at start of multi-ingredient phrase.
+            "mip_end": True if index at end of multi-ingredient phrase.
+            "after_sentence_split": True if index at or after a sentence split.
+
+        Parameters
+        ----------
+        index : int
+            Index of token to return features for.
+        prefix : str
+            Feature label prefix.
+
+        Returns
+        -------
+        dict[str, bool]
+            Dict of features.
+        """
+        features = {
+            prefix + "mip_start": False,
+            prefix + "mip_end": False,
+            prefix + "after_sentence_split": False,
+        }
+        for phrase in self.phrases:
+            if index not in phrase:
+                continue
+
+            if index == phrase[0]:
+                features[prefix + "mip_start"] = True
+
+            if index == phrase[-1]:
+                features[prefix + "mip_end"] = True
+
+        for split_index in self.sentence_splits:
+            if index >= split_index:
+                features[prefix + "after_sentence_split"] = True
+
+        return features
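
To illustrate how the MIP grammar above chunks a sentence, here is a small standalone sketch. The tokens are tagged by hand with Penn Treebank POS tags so that no NLTK tagger models need to be downloaded; in the class itself the tags come from each Token's pos_tag:

import nltk

# The MIP rule from SentenceStrucureFeatures, reproduced standalone.
mip_parser = nltk.RegexpParser(r"MIP: {<NN.*|JJ.*>+<CC><NN.*|JJ.*>*<NN.*>}")

# "2 tbsp butter or olive oil", tagged by hand.
text_pos = [
    ("2", "CD"),
    ("tbsp", "NN"),
    ("butter", "NN"),
    ("or", "CC"),
    ("olive", "JJ"),
    ("oil", "NN"),
]

parsed = mip_parser.parse(text_pos)
for subtree in parsed.subtrees(filter=lambda t: t.label() == "MIP"):
    print(subtree.leaves())
# [('tbsp', 'NN'), ('butter', 'NN'), ('or', 'CC'), ('olive', 'JJ'), ('oil', 'NN')]

detect_phrases would then strip the leading ('tbsp', 'NN') token from this chunk, because "tbsp" appears in FLATTENED_UNITS_LIST, leaving "butter or olive oil" as the multi-ingredient phrase.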
