
Commit 4ce0507

Feature: Remove size tokens from the start of multi-ingredient phrases and add tests for multi-ingredient phrases

1 parent 5975572 · commit 4ce0507

3 files changed (+120, -51 lines)

ingredient_parser/en/_constants.py
Lines changed: 30 additions & 18 deletions
@@ -123,24 +123,6 @@
 # since we need this in a few places
 FLATTENED_UNITS_LIST = set(chain.from_iterable(UNITS.items()))

-# Words that can modify a unit
-UNIT_MODIFIERS = [
-    "big",
-    "fat",
-    "generous",
-    "healthy",
-    "heaped",
-    "heaping",
-    "large",
-    "medium",
-    "medium-size",
-    "medium-sized",
-    "scant",
-    "small",
-    "thick",
-    "thin",
-]
-
 # Units that can be part of the name
 # e.g. 1 teaspoon ground cloves, or 5 bay leaves
 AMBIGUOUS_UNITS = [

@@ -163,6 +145,36 @@

 AMBIGUOUS_UNITS.extend(_ambiguous_units_alt_forms)

+# Words that indicate ingredient size
+SIZES = [
+    "big",
+    "bite-size",
+    "bite-sized",
+    "extra-large",
+    "jumbo",
+    "large",
+    "lg",
+    "little",
+    "md",
+    "medium",
+    "medium-large",
+    "medium-size",
+    "medium-sized",
+    "medium-small",
+    "medium-to-large",
+    "miniature",
+    "regular",
+    "slim",
+    "sm",
+    "small",
+    "small-to-medium",
+    "smaller",
+    "smallest",
+    "thick",
+    "thin",
+    "tiny",
+]
+

 # Strings and their numeric representation
 STRING_NUMBERS = {
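
The removed UNIT_MODIFIERS list is superseded by the broader SIZES list, which is consulted as a plain case-insensitive membership lookup. A minimal sketch of that usage follows; the `is_size` helper is hypothetical, not part of the library, and the list is abridged:

```python
# Hypothetical sketch, not library code: SIZES is consulted via a simple
# case-insensitive membership test. List abridged from the commit above.
SIZES = ["big", "bite-size", "extra-large", "jumbo", "large", "medium",
         "small", "thick", "thin", "tiny"]

def is_size(token: str) -> bool:
    """Return True if the token is a size descriptor."""
    return token.lower() in SIZES

assert is_size("Large")
assert not is_size("stock")
```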

ingredient_parser/en/_phrases.py
Lines changed: 25 additions & 33 deletions
@@ -3,7 +3,7 @@
 import nltk

 from ..dataclasses import Token
-from ._constants import FLATTENED_UNITS_LIST
+from ._constants import FLATTENED_UNITS_LIST, SIZES


 class MIP:

@@ -117,14 +117,20 @@ def detect_phrases(self, tokenized_sentence: list[Token]) -> list[list[int]]:
            if self._cc_is_not_or(text_pos, indices):
                continue

-            # If first item in list is a known unit, remove it.
+            # Remove any units or sizes from the beginning of the phrase
            first_idx = indices[0]
-            # TODO: also exclude sizes e.g. large, small. Needs a list of them.
-            if self.tokenized_sentence[first_idx].text.lower() in FLATTENED_UNITS_LIST:
+            tokens_to_discard = [*FLATTENED_UNITS_LIST, *SIZES]
+            while self.tokenized_sentence[first_idx].text.lower() in tokens_to_discard:
                indices = indices[1:]
-                # If first index is now a conjunction, skip.
-                if self.tokenized_sentence[indices[0]].pos_tag == "CC":
-                    continue
+                first_idx = indices[0]
+
+            # If phrase is empty, skip.
+            if not indices:
+                continue
+
+            # If first index is now a conjunction, skip.
+            if self.tokenized_sentence[indices[0]].pos_tag == "CC" or not indices:
+                continue

            phrases.append(indices)
@@ -134,8 +140,6 @@ def token_features(self, index: int, prefix: str) -> dict[str, str | bool]:
        """Return dict of features for token at index.

        Features:
-            "mip": True if index in phrase.
-            "cc_distance": Distance between index and conjunction in phrase.
            "mip_start": True if index at start of multi-ingredient phrase.
            "mip_end": True if index at end of multi-ingredient phrase.

@@ -157,43 +161,31 @@ def token_features(self, index: int, prefix: str) -> dict[str, str | bool]:
            if index not in phrase:
                continue

-            # features[prefix + "mip"] = True
-            # features[prefix + "cc_distance"] = str(
-            #     self._get_distance_from_cc(phrase, index)
-            # )
-
-            # if index == phrase[0]:
-            #     features[prefix + "mip_start"] = True
-            # if index == phrase[-1]:
-            #     features[prefix + "mip_end"] = True
+            if index == phrase[0]:
+                features[prefix + "mip_start"] = True

-            if self._candidate_name_mod(phrase, index):
-                # Token is first element of first subsection of phrase.
-                features[prefix + "name_mod_candidate"] = True
+            if index == phrase[-1]:
+                features[prefix + "mip_end"] = True

        return features

-    def _get_distance_from_cc(self, phrase: list[int], index: int) -> int:
-        """Calculate distance of index from index of conjunction ("CC") in phrase.
+    def _candidate_name_mod(self, phrase: list[int], index: int) -> bool:
+        """Return True if token at index in phrase is candidate for NAME_MOD label.
+
+        A token is a candidate for NAME_MOD if it is the first element of the phrase.

        Parameters
        ----------
        phrase : list[int]
-            Indices of phrase tokens.
+            List of token indices for phrase.
        index : int
-            Index to calculate distance for.
+            Index of token to consider.

        Returns
        -------
-        int
-            Distance from conjunction.
-            If index occurs before conjunction, this value is negative.
+        bool
+            True, if token is first in phrase.
        """
-        phrase_pos_tags = [self.tokenized_sentence[i].pos_tag for i in phrase]
-        cc_index = phrase_pos_tags.index("CC") + phrase[0]
-        return index - cc_index
-
-    def _candidate_name_mod(self, phrase: list[int], index: int) -> bool:
        split_phrase_tokens = list(self._split_phrase(self.tokenized_sentence, phrase))
        if len(split_phrase_tokens[0]) > 1:
            return split_phrase_tokens[0][0].index == index
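
For reference, `_candidate_name_mod` treats a token as a NAME_MOD candidate only when it is the first token of the phrase's first subsection and that subsection has more than one token. `_split_phrase` is not shown in this diff, so the sketch below assumes it splits on commas and conjunctions; the names and splitting rules are illustrative only:

```python
# Hedged sketch of the NAME_MOD candidate check. _split_phrase is not shown in
# this diff, so splitting on commas/conjunctions here is an assumption.
def candidate_name_mod(tokens: list[str], index: int) -> bool:
    """True if index is the first token of a multi-token first subsection."""
    subsections: list[list[int]] = [[]]
    for i, tok in enumerate(tokens):
        if tok in {",", "or", "and"}:
            subsections.append([])  # start a new subsection at each separator
        else:
            subsections[-1].append(i)
    first = subsections[0]
    return len(first) > 1 and first[0] == index

# "hot" begins the two-token subsection "hot chicken", so it is a candidate.
print(candidate_name_mod(["hot", "chicken", "or", "beef", "stock"], 0))  # True
print(candidate_name_mod(["chicken", "or", "beef", "stock"], 0))         # False
```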
New test file
Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
+from ingredient_parser.en import PreProcessor
+
+
+class Test_multi_ingredient_phrase_features:
+    def test_multi_ingredient_phrase_detection(self):
+        """
+        Test that multi ingredient phrase is correctly identified.
+        """
+        p = PreProcessor("2 tbsp chicken or beef stock")
+        assert p.mip.phrases == [[2, 3, 4, 5]]
+
+    def test_multi_ingredient_phrase_detection_with_name_mod(self):
+        """
+        Test that multi ingredient phrase with name modifier is correctly identified.
+        """
+        p = PreProcessor("2 tbsp hot chicken or beef stock")
+        assert p.mip.phrases == [[2, 3, 4, 5, 6]]
+
+    def test_extended_multi_ingredient_phrase_detection(self):
+        """
+        Test that extended multi ingredient phrase is correctly identified.
+        """
+        p = PreProcessor("2 tbsp olive, vegetable or sunflower oil")
+        assert p.mip.phrases == [[2, 3, 4, 5, 6, 7]]
+
+    def test_mip_start_feature_unit(self):
+        """
+        Test that the start of the multi ingredient phrase is correctly identified by
+        ignoring the units.
+        """
+        p = PreProcessor("2 tbsp olive, vegetable or sunflower oil")
+
+        # Assert that only the 3rd token has the `mip_start` feature.
+        for i, token_features in enumerate(p.sentence_features()):
+            if i == 2:
+                assert token_features.get("mip_start", False)
+            else:
+                assert not token_features.get("mip_start", False)
+
+    def test_mip_start_feature_size(self):
+        """
+        Test that the start of the multi ingredient phrase is correctly identified by
+        ignoring the size.
+        """
+        p = PreProcessor("1 large sweet or Yukon Gold potato")
+
+        # Assert that only the 3rd token has the `mip_start` feature.
+        for i, token_features in enumerate(p.sentence_features()):
+            if i == 2:
+                assert token_features.get("mip_start", False)
+            else:
+                assert not token_features.get("mip_start", False)
+
+    def test_mip_end_feature(self):
+        """
+        Test that the end of the multi ingredient phrase is correctly identified.
+        """
+        p = PreProcessor("2 tbsp hot chicken or beef stock")
+
+        # Assert that only the last token has the `mip_end` feature.
+        for i, token_features in enumerate(p.sentence_features()):
+            if i == len(p.sentence_features()) - 1:
+                assert token_features.get("mip_end", False)
+            else:
+                assert not token_features.get("mip_end", False)
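
Taken together, these tests double as a usage reference. A quick check of the new behaviour, with expected output inferred from the assertions above (the phrase list for this particular sentence is not asserted verbatim in the tests, so the values are an expectation rather than a recorded result):

```python
from ingredient_parser.en import PreProcessor

p = PreProcessor("1 large sweet or Yukon Gold potato")
# "1" and "large" are discarded, so the phrase starts at "sweet" (index 2).
print(p.mip.phrases)  # expected, per the tests above: [[2, 3, 4, 5, 6]]

features = p.sentence_features()
print(features[2].get("mip_start", False))  # expected: True ("sweet")
print(features[-1].get("mip_end", False))   # expected: True ("potato")
```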
