Skip to content

Commit 5ab0c06

Browse files
committed
Feature: If an ingredient name ends with a DT, IN or JJ part of speech token, merge the name with the next name.
1 parent 2c1ade4 commit 5ab0c06

File tree

2 files changed

+173
-23
lines changed

2 files changed

+173
-23
lines changed

ingredient_parser/en/postprocess.py

Lines changed: 90 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -271,31 +271,18 @@ def _postprocess_names(self) -> tuple[list[IngredientText], list[FoundationFood]
271271

272272
name_labels = [self.labels[i] for i in name_idx]
273273
bio_groups = self._group_name_labels(name_labels)
274-
constructed_names = self._construct_names(bio_groups)
275-
276-
names = []
277-
foundation_foods = set() # Use a set to avoid duplicates
278-
for group in constructed_names:
279-
# Convert from name_label indices to token indices
280-
token_idx = [name_idx[idx] for idx in group]
281-
ing_text = self._postprocess_indices(token_idx, "NAME")
282-
if ing_text is not None:
283-
names.append(ing_text)
284-
285-
if self.foundation_foods:
286-
tokens = [self.tokens[i] for i in token_idx]
287-
ff = match_foundation_foods(tokens)
288-
if ff:
289-
foundation_foods.add(ff)
290-
291-
return self._deduplicate_names(names), list(foundation_foods)
274+
constructed_names = self._construct_names_from_bio_groups(bio_groups)
275+
names, foundation_foods = self._convert_name_indices_to_object(
276+
name_idx, constructed_names
277+
)
278+
return names, foundation_foods
292279

293280
def _deduplicate_names(self, names: list[IngredientText]) -> list[IngredientText]:
294281
"""Deduplicate list of names.
295282
296283
Where the same name text appears in multiple IngredientText objects, the
297284
confidence values are averaged, and the minimum starting_index is kept for the
298-
dedeuplicated names.
285+
deduplicated names.
299286
300287
Parameters
301288
----------
@@ -305,7 +292,7 @@ def _deduplicate_names(self, names: list[IngredientText]) -> list[IngredientText
305292
Returns
306293
-------
307294
list[IngredientText]
308-
Deduplicaed list of names.
295+
Deduplicated list of names.
309296
"""
310297
name_dict = defaultdict(list)
311298
for name in names:
@@ -381,7 +368,7 @@ def _group_name_labels(self, name_labels: list[str]) -> list[list[tuple[int, str
381368

382369
return name_groups
383370

384-
def _construct_names(
371+
def _construct_names_from_bio_groups(
385372
self, name_groups: list[list[tuple[int, str]]]
386373
) -> list[list[int]]:
387374
"""Construct names from BIO groups.
@@ -435,7 +422,7 @@ def _construct_names(
435422
last_encountered_name_used = True
436423
else:
437424
# If we are here, then we've come across a VAR group that does not
438-
# preceed a TOK group, so the model has made an error in it's
425+
# precede a TOK group, so the model has made an error in its
439426
# labelling. Add this VAR group anyway.
440427
constructed_names.append(current_group_idx)
441428

@@ -480,6 +467,87 @@ def _get_name_group_label(self, labels: tuple[str]) -> str:
480467

481468
return ""
482469

470+
def _convert_name_indices_to_object(
    self, name_idx: list[int], name_indices: list[list[int]]
) -> tuple[list[IngredientText], list[FoundationFood]]:
    """Convert grouped name token indices into IngredientText objects.

    If foundation foods are enabled, determine the matching foundation food for
    each name.

    If an ingredient name ends with a token with POS tag of DT, IN or JJ, merge it
    with the next name group, if there is one. This is to avoid cases in a sentence
    like "5 fresh large basil leaves" where "large" is given the SIZE label,
    resulting in two separate names: "fresh" and "basil leaves". Instead, we want to
    return a single name: "fresh basil leaves".

    When two names are merged, the confidence of the merged name is the mean of the
    two confidences and the starting_index is the smaller of the two.

    Parameters
    ----------
    name_idx : list[int]
        List of indices of NAME tokens.
    name_indices : list[list[int]]
        List of groups of indices corresponding to ingredient names. Each index
        refers to a position in name_idx, not directly to a token.

    Returns
    -------
    tuple[list[IngredientText], list[FoundationFood]]
        List of deduplicated IngredientText objects and FoundationFoods objects.
    """
    # A name whose last token has one of these POS tags is held back and joined
    # to the next name instead of being emitted on its own.
    mergeable_pos_tags = {"DT", "IN", "JJ"}

    # Finished names, each paired with the token indices it was built from.
    completed: list[tuple[IngredientText, list[int]]] = []
    # Name (and its token indices) waiting to be merged with the next name.
    pending: tuple[IngredientText, list[int]] | None = None

    for group in name_indices:
        # Convert from name_label indices to token indices
        token_idx = [name_idx[idx] for idx in group]
        ing_text = self._postprocess_indices(token_idx, "NAME")
        if ing_text is None:
            # Nothing usable in this group; any pending name stays pending.
            continue

        if pending is not None:
            # Prepend the held-back name to this one.
            previous_text, previous_idx = pending
            ing_text = IngredientText(
                text=previous_text.text + " " + ing_text.text,
                confidence=(previous_text.confidence + ing_text.confidence) / 2,
                starting_index=min(
                    previous_text.starting_index, ing_text.starting_index
                ),
            )
            token_idx = [*previous_idx, *token_idx]
            pending = None

        if self.pos_tags[token_idx[-1]] in mergeable_pos_tags:
            # Hold this (possibly already merged) name back for the next group.
            pending = (ing_text, token_idx)
        else:
            completed.append((ing_text, token_idx))

    if pending is not None:
        # The last name was marked for merging but no further name followed, so
        # keep it as it is.
        completed.append(pending)

    names = []
    foundation_foods = set()  # Use a set to avoid duplicates
    for ing_text, token_idx in completed:
        names.append(ing_text)
        if self.foundation_foods:
            # NOTE(review): earlier comments here flagged token_idx as wrong for
            # merged names. token_idx is rebuilt from both groups on a merge, so it
            # does cover the merged name, but it omits the tokens that sat between
            # the groups (e.g. "large") - confirm that is the intended input.
            ff = match_foundation_foods([self.tokens[i] for i in token_idx])
            if ff:
                foundation_foods.add(ff)

    return self._deduplicate_names(names), list(foundation_foods)
550+
483551
def _postprocess_indices(
484552
self, label_idx: list[int], selected_label: str
485553
) -> IngredientText | None:

tests/postprocess/test_postprocess.py

Lines changed: 83 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ def p():
4141
def p_string_numbers():
4242
"""Define a PostProcessor object with discard_isolated_stop_words set to True
4343
to use for testing the PostProcessor class methods.
44+
45+
This sentence includes numbers written as words.
4446
"""
4547
sentence = "2 butternut squash, about one and one-half pounds each"
4648
tokens = [
@@ -95,6 +97,8 @@ def p_string_numbers():
9597
def p_string_numbers_range():
9698
"""Define a PostProcessor object with discard_isolated_stop_words set to True
9799
to use for testing the PostProcessor class methods.
100+
101+
This sentence includes a number range written in words.
98102
"""
99103
sentence = "2 butternut squash, about one or two pounds each"
100104
tokens = [
@@ -149,6 +153,8 @@ def p_string_numbers_range():
149153
def p_postprep():
150154
"""Define a PostProcessor object with discard_isolated_stop_words set to False
151155
to use for testing the PostProcessor class methods.
156+
157+
This sentence has the name after the preparation instruction.
152158
"""
153159
sentence = "1 tbsp chopped pistachios"
154160
tokens = ["1", "tbsp", "chopped", "pistachios"]
@@ -204,6 +210,8 @@ def p_no_discard():
204210
def p_fraction_in_prep():
205211
"""Define a PostProcessor object for sentence with a fraction in prep
206212
to use for testing the PostProcessor class methods.
213+
214+
This sentence includes a fraction in the preparation instructions.
207215
"""
208216
sentence = "3 carrots, peeled and sliced into 5mm (¼in) coins"
209217
tokens = [
@@ -278,6 +286,8 @@ def p_fraction_in_prep():
278286
def p_fraction_range_in_prep():
279287
"""Define a PostProcessor object for sentence with a fraction range in prep
280288
to use for testing the PostProcessor class methods.
289+
290+
This sentence includes a number range in the preparation instructions.
281291
"""
282292
sentence = "3 carrots, peeled and sliced into 5-10mm (¼-½in) coins"
283293
tokens = [
@@ -348,6 +358,35 @@ def p_fraction_range_in_prep():
348358
return PostProcessor(sentence, tokens, pos_tags, labels, scores)
349359

350360

361+
@pytest.fixture
def p_split_name():
    """Define a PostProcessor object with discard_isolated_stop_words set to False
    to use for testing the PostProcessor class methods.

    In this sentence the tokens labelled as part of the name are separated by a
    token ("large") that carries the SIZE label.
    """
    sentence = "5 fresh large basil leaves"

    # One row per token: (token, POS tag, label, score).
    annotated_tokens = [
        ("5", "CD", "QTY", 0.99938548647492),
        ("fresh", "JJ", "B_NAME_TOK", 0.968725226931013),
        ("large", "JJ", "SIZE", 0.9588222550056443),
        ("basil", "NN", "B_NAME_TOK", 0.5092435116086577),
        ("leaves", "NN", "I_NAME_TOK", 0.9877923155569212),
    ]
    # Split the rows back into the four parallel lists PostProcessor is given.
    tokens, pos_tags, labels, scores = (
        list(column) for column in zip(*annotated_tokens)
    )

    return PostProcessor(
        sentence,
        tokens,
        pos_tags,
        labels,
        scores,
        discard_isolated_stop_words=False,
    )
388+
389+
351390
class TestPostProcessor__builtins__:
352391
def test__str__(self, p):
353392
"""
@@ -490,7 +529,10 @@ def test_string_numbers_range(self, p_string_numbers_range):
490529
assert p_string_numbers_range.parsed == expected
491530

492531
def test_postprep_amounts(self, p_postprep):
493-
""" """
532+
"""
533+
Test fixture returns expected ParsedIngredient object, with the preparation
534+
tokens before the ingredient name.
535+
"""
494536
expected = ParsedIngredient(
495537
name=[
496538
IngredientText(text="pistachios", confidence=0.998841, starting_index=3)
@@ -558,6 +600,10 @@ def test_no_discard_isolated_stop_words(self, p_no_discard):
558600
assert p_no_discard.parsed == expected
559601

560602
def test_fraction_in_prep(self, p_fraction_in_prep):
603+
"""
604+
Test fixture returns expected ParsedIngredient object, with the fraction in the
605+
preparation instruction retained.
606+
"""
561607
expected = ParsedIngredient(
562608
name=[
563609
IngredientText(text="carrots", confidence=0.998212, starting_index=1)
@@ -586,6 +632,10 @@ def test_fraction_in_prep(self, p_fraction_in_prep):
586632
assert p_fraction_in_prep.parsed == expected
587633

588634
def test_fraction_range_in_prep(self, p_fraction_range_in_prep):
635+
"""
636+
Test fixture returns expected ParsedIngredient object, with the fraction range
637+
in the preparation instruction retained.
638+
"""
589639
expected = ParsedIngredient(
590640
name=[
591641
IngredientText(text="carrots", confidence=0.998212, starting_index=1)
@@ -612,3 +662,35 @@ def test_fraction_range_in_prep(self, p_fraction_range_in_prep):
612662
)
613663

614664
assert p_fraction_range_in_prep.parsed == expected
665+
666+
def test_split_ingredient_name(self, p_split_name):
    """
    Test that the parsed fixture contains one merged name, "fresh basil leaves",
    even though the SIZE token "large" sits between the name tokens, and that
    "large" is still reported as the size.
    """
    merged_name = IngredientText(
        text="fresh basil leaves",
        confidence=0.8586214999999999,
        starting_index=1,
    )
    large_size = IngredientText(
        text="large", confidence=0.958822, starting_index=2
    )
    five = ingredient_amount_factory(
        quantity="5",
        unit="",
        text="5",
        confidence=0.999385,
        starting_index=0,
    )

    expected = ParsedIngredient(
        name=[merged_name],
        size=large_size,
        amount=[five],
        preparation=None,
        comment=None,
        purpose=None,
        foundation_foods=[],
        sentence="5 fresh large basil leaves",
    )

    assert p_split_name.parsed == expected

0 commit comments

Comments
 (0)