Skip to content

Commit dfff322

Browse files
jmt-gh and strangetom authored
Process ingredients with prices properly (#33)
This commit adds support for removing price annotations from ingredient sentences during preprocessing. Co-authored-by: tom <tpstrange@gmail.com> Co-authored-by: Tom Strange <strangetom@users.noreply.github.com>
1 parent 480e1b6 commit dfff322

File tree

3 files changed

+143
-0
lines changed

3 files changed

+143
-0
lines changed

ingredient_parser/en/preprocess.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@ def _normalise(self, sentence: str) -> str:
169169
# List of functions to apply to sentence
170170
# Note that the order matters
171171
funcs = [
172+
self._remove_price_annotations,
172173
self._replace_en_em_dash,
173174
self._replace_html_fractions,
174175
self._replace_unicode_fractions,
@@ -187,6 +188,25 @@ def _normalise(self, sentence: str) -> str:
187188
logger.debug(f"{func.__name__}: {sentence}")
188189

189190
return sentence.strip()
191+
192+
def _remove_price_annotations(self, sentence: str) -> str:
193+
"""Remove price annotations like ($0.20), (£1.50), etc. from the sentence.
194+
195+
Parameters
196+
----------
197+
sentence : str
198+
Ingredient sentence
199+
200+
Returns
201+
-------
202+
str
203+
Ingredient sentence with price annotations removed
204+
"""
205+
currencies = ["$", "£", "€", "¥", "₹"]
206+
currency_pattern = "|".join(re.escape(c) for c in currencies)
207+
# Allow optional whitespace after opening parenthesis and before currency, and after currency
208+
pattern = rf"\(\s*(?:{currency_pattern})\s*[0-9.,]+\s*\)"
209+
return re.sub(pattern, "", sentence)
190210

191211
def _replace_en_em_dash(self, sentence: str) -> str:
192212
"""Replace en-dashes and em-dashes with hyphens.

tests/preprocess/test_preprocess.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,28 @@ def test__repr__(self):
2222
p = PreProcessor("1/2 cup chicken broth")
2323
assert repr(p) == 'PreProcessor("1/2 cup chicken broth")'
2424

25+
def test_debug_output(self, capsys):
    """
    Test printed debug output: one "<function name>: <sentence>" line is
    expected on stdout for each normalisation function listed below
    """
    expected_lines = [
        "_remove_price_annotations: 1/2 cup chicken broth",
        "_replace_en_em_dash: 1/2 cup chicken broth",
        "_replace_html_fractions: 1/2 cup chicken broth",
        "_replace_unicode_fractions: 1/2 cup chicken broth",
        "combine_quantities_split_by_and: 1/2 cup chicken broth",
        "_identify_fractions: #1$2 cup chicken broth",
        "_split_quantity_and_units: #1$2 cup chicken broth",
        "_remove_unit_trailing_period: #1$2 cup chicken broth",
        "replace_string_range: #1$2 cup chicken broth",
        "_replace_dupe_units_ranges: #1$2 cup chicken broth",
        "_merge_quantity_x: #1$2 cup chicken broth",
        "_collapse_ranges: #1$2 cup chicken broth",
    ]
    # Every line, including the last one, is terminated by a newline
    expected_output = "".join(f"{line}\n" for line in expected_lines)

    # Constructing the object is what produces the output; the instance
    # itself is not needed afterwards
    PreProcessor("1/2 cup chicken broth", show_debug_output=True)

    printed = capsys.readouterr().out
    assert printed == expected_output
2547

2648
def normalise_test_cases() -> list[tuple[str, ...]]:
2749
"""
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
import pytest
2+
3+
from ingredient_parser.en import PreProcessor
4+
5+
@pytest.fixture
def p():
    """Provide a PreProcessor created from an empty sentence.

    Intended for tests that call PreProcessor methods directly and pass the
    text to process as an argument.
    """
    empty_preprocessor = PreProcessor("")
    return empty_preprocessor
9+
10+
class TestPreProcessor_remove_price_annotations:
    """Tests for PreProcessor._remove_price_annotations.

    The method removes only the parenthesised price annotation and leaves the
    surrounding whitespace untouched. Removing an annotation that has a space
    on both sides therefore leaves two consecutive spaces in the result, and
    removing one at the start or end of the sentence leaves a leading or
    trailing space.
    """

    def test_remove_dollar_price(self, p):
        """Dollar price annotation is removed"""
        input_sentence = "1 cup flour ($0.20)"
        assert p._remove_price_annotations(input_sentence) == "1 cup flour "

    def test_remove_pound_price(self, p):
        """Pound price annotation is removed"""
        input_sentence = "2 eggs (£1.50)"
        assert p._remove_price_annotations(input_sentence) == "2 eggs "

    def test_remove_euro_price(self, p):
        """Euro price annotation is removed"""
        input_sentence = "3 tomatoes (€2.00)"
        assert p._remove_price_annotations(input_sentence) == "3 tomatoes "

    def test_remove_yen_price(self, p):
        """Yen price annotation without a decimal part is removed"""
        input_sentence = "1 onion (¥100)"
        assert p._remove_price_annotations(input_sentence) == "1 onion "

    def test_remove_rupee_price(self, p):
        """Rupee price annotation is removed"""
        input_sentence = "1 potato (₹10.50)"
        assert p._remove_price_annotations(input_sentence) == "1 potato "

    def test_multiple_prices(self, p):
        """Every price annotation in the sentence is removed"""
        input_sentence = "1 apple ($0.50) and 1 orange (£0.30)"
        # Two spaces remain where the first annotation was removed
        assert (
            p._remove_price_annotations(input_sentence) == "1 apple  and 1 orange "
        )

    def test_no_price_annotation(self, p):
        """Sentence without a price annotation is unchanged"""
        input_sentence = "1 cup sugar"
        assert p._remove_price_annotations(input_sentence) == "1 cup sugar"

    def test_malformed_price_annotation(self, p):
        """Price without a closing parenthesis is not removed"""
        input_sentence = "1 cup flour ($0.20"
        assert p._remove_price_annotations(input_sentence) == "1 cup flour ($0.20"

    def test_price_with_comma(self, p):
        """Price using a comma as thousands separator is removed"""
        input_sentence = "1 steak (€1,200.00)"
        assert p._remove_price_annotations(input_sentence) == "1 steak "

    def test_price_with_multiple_decimals(self, p):
        """Several decimal prices in one sentence are all removed"""
        input_sentence = "1 cheese ($1.99) and 1 bread ($2.49)"
        # Two spaces remain where the first annotation was removed
        assert (
            p._remove_price_annotations(input_sentence) == "1 cheese  and 1 bread "
        )

    def test_price_annotation_at_start(self, p):
        """Price annotation at the start of the sentence is removed"""
        input_sentence = "($0.20) 1 cup flour"
        assert p._remove_price_annotations(input_sentence) == " 1 cup flour"

    def test_price_annotation_in_middle(self, p):
        """Price annotation in the middle of the sentence is removed"""
        input_sentence = "1 cup ($0.20) flour"
        # The spaces on both sides of the annotation are kept
        assert p._remove_price_annotations(input_sentence) == "1 cup  flour"

    def test_price_annotation_at_end(self, p):
        """Price annotation at the end of the sentence is removed"""
        input_sentence = "1 cup flour ($0.20)"
        assert p._remove_price_annotations(input_sentence) == "1 cup flour "

    def test_price_annotation_with_leading_space(self, p):
        """Space between the opening parenthesis and the currency is allowed"""
        input_sentence = "1 cup flour ( $0.20)"
        assert p._remove_price_annotations(input_sentence) == "1 cup flour "

    def test_price_annotation_with_inner_spaces(self, p):
        """Single spaces around the currency and the amount are allowed"""
        input_sentence = "1 cup flour ( $ 0.20 )"
        assert p._remove_price_annotations(input_sentence) == "1 cup flour "

    def test_price_annotation_with_multiple_spaces(self, p):
        """Runs of several spaces inside the parentheses are allowed"""
        input_sentence = "1 cup flour (  $  0.20  )"
        assert p._remove_price_annotations(input_sentence) == "1 cup flour "

    def test_price_annotation_with_tab_spaces(self, p):
        """Tabs inside the parentheses are allowed"""
        input_sentence = "1 cup flour (\t$0.20\t)"
        assert p._remove_price_annotations(input_sentence) == "1 cup flour "

    def test_price_annotation_with_mixed_whitespace(self, p):
        """A mixture of spaces and tabs inside the parentheses is allowed"""
        input_sentence = "1 cup flour ( \t $ 0.20 )"
        assert p._remove_price_annotations(input_sentence) == "1 cup flour "

    def test_non_price_parenthetical_remains(self, p):
        """Parenthesised text that is not a price is kept"""
        input_sentence = "1 cup flour (organic)"
        assert p._remove_price_annotations(input_sentence) == "1 cup flour (organic)"

    def test_multiple_non_price_parentheticals(self, p):
        """Several non-price parentheticals are all kept"""
        input_sentence = "2 eggs (free-range) (large)"
        assert (
            p._remove_price_annotations(input_sentence)
            == "2 eggs (free-range) (large)"
        )

    def test_mixed_price_and_non_price_parentheticals(self, p):
        """Only the price is removed when both kinds of parenthetical occur"""
        input_sentence = "1 cup flour ($0.20) (organic)"
        # The spaces on both sides of the removed annotation are kept
        assert (
            p._remove_price_annotations(input_sentence) == "1 cup flour  (organic)"
        )

    def test_non_price_parenthetical_with_spaces(self, p):
        """Non-price parenthetical padded with spaces is kept"""
        input_sentence = "1 cup flour ( see note )"
        assert (
            p._remove_price_annotations(input_sentence) == "1 cup flour ( see note )"
        )

    def test_non_price_parenthetical_with_numbers(self, p):
        """Parenthetical containing digits but no currency symbol is kept"""
        input_sentence = "1 cup flour (2nd batch)"
        assert (
            p._remove_price_annotations(input_sentence) == "1 cup flour (2nd batch)"
        )

0 commit comments

Comments
 (0)