Feature: If an ingredient name ends with a token whose part of speech tag is DT, IN or JJ, merge the name with the next name.

strangetom · strangetom · commit 5ab0c0600e08 · 2025-08-09T19:31:49.000+01:00
diff --git a/ingredient_parser/en/postprocess.py b/ingredient_parser/en/postprocess.py
@@ -271,31 +271,18 @@ def _postprocess_names(self) -> tuple[list[IngredientText], list[FoundationFood]
 
         name_labels = [self.labels[i] for i in name_idx]
         bio_groups = self._group_name_labels(name_labels)
-        constructed_names = self._construct_names(bio_groups)
-
-        names = []
-        foundation_foods = set()  # Use a set to avoid duplicates
-        for group in constructed_names:
-            # Convert from name_label indices to token indices
-            token_idx = [name_idx[idx] for idx in group]
-            ing_text = self._postprocess_indices(token_idx, "NAME")
-            if ing_text is not None:
-                names.append(ing_text)
-
-                if self.foundation_foods:
-                    tokens = [self.tokens[i] for i in token_idx]
-                    ff = match_foundation_foods(tokens)
-                    if ff:
-                        foundation_foods.add(ff)
-
-        return self._deduplicate_names(names), list(foundation_foods)
+        constructed_names = self._construct_names_from_bio_groups(bio_groups)
+        names, foundation_foods = self._convert_name_indices_to_object(
+            name_idx, constructed_names
+        )
+        return names, foundation_foods
 
     def _deduplicate_names(self, names: list[IngredientText]) -> list[IngredientText]:
         """Deduplicate list of names.
 
         Where the same name text appears in multiple IngredientText objects, the
         confidence values are averaged, and the minimum starting_index is kept for the
-        dedeuplicated names.
+        deduplicated names.
 
         Parameters
         ----------
@@ -305,7 +292,7 @@ def _deduplicate_names(self, names: list[IngredientText]) -> list[IngredientText
         Returns
         -------
         list[IngredientText]
-            Deduplicaed list of names.
+            Deduplicated list of names.
         """
         name_dict = defaultdict(list)
         for name in names:
@@ -381,7 +368,7 @@ def _group_name_labels(self, name_labels: list[str]) -> list[list[tuple[int, str
 
         return name_groups
 
-    def _construct_names(
+    def _construct_names_from_bio_groups(
         self, name_groups: list[list[tuple[int, str]]]
     ) -> list[list[int]]:
         """Construct names from BIO groups.
@@ -435,7 +422,7 @@ def _construct_names(
                     last_encountered_name_used = True
                 else:
                     # If we are here, then we've come across a VAR group that does not
-                    # preceed a TOK group, so the model has made an error in it's
+                    # precede a TOK group, so the model has made an error in its
                     # labelling. Add this VAR group anyway.
                     constructed_names.append(current_group_idx)
 
@@ -480,6 +467,87 @@ def _get_name_group_label(self, labels: tuple[str]) -> str:
 
         return ""
 
+    def _convert_name_indices_to_object(
+        self, name_idx: list[int], name_indices: list[list[int]]
+    ) -> tuple[list[IngredientText], list[FoundationFood]]:
+        """Convert grouped indices for name tokens into IngredientText objects. If
+        foundation foods are enabled, determine matching foundation food for each name.
+
+        If an ingredient name ends with a token with POS tag of DT, IN or JJ, merge it
+        with the next name group, if there is one. This is to avoid cases in a sentence
+        like "5 fresh large basil leaves" where "large" is given the SIZE label,
+        resulting in two separate names: "fresh" and "basil leaves". Instead, we want to
+        return a single name: "fresh basil leaves".
+
+        Parameters
+        ----------
+        name_idx : list[int]
+            List of indices of NAME tokens.
+        name_indices : list[list[int]]
+            List of groups of indices corresponding to ingredient names.
+
+        Returns
+        -------
+        tuple[list[IngredientText], list[FoundationFood]]
+            List of deduplicated IngredientText objects and FoundationFoods objects.
+        """
+        names = []
+        foundation_foods = set()  # Use a set to avoid duplicates
+
+        # Keep track of IngredientText objects and indices to merge with next.
+        # We do the merge if the name ends with DT, IN, JJ part of speech tag.
+        merge_with_next: IngredientText | None = None
+        merge_with_next_idx: list[int] | None = None
+
+        for group in name_indices:
+            # Convert from name_label indices to token indices
+            token_idx = [name_idx[idx] for idx in group]
+            ing_text = self._postprocess_indices(token_idx, "NAME")
+            if ing_text is None:
+                continue
+
+            if merge_with_next and merge_with_next_idx:
+                # If we need to merge the previous name, do it now.
+                ing_text = IngredientText(
+                    text=merge_with_next.text + " " + ing_text.text,
+                    confidence=(merge_with_next.confidence + ing_text.confidence) / 2,
+                    starting_index=min(
+                        [merge_with_next.starting_index, ing_text.starting_index]
+                    ),
+                )
+                token_idx = [*merge_with_next_idx, *token_idx]
+
+            if self.pos_tags[token_idx[-1]] in {"DT", "IN", "JJ"}:
+                # Mark name for merging with next name.
+                merge_with_next = ing_text
+                merge_with_next_idx = token_idx
+                # Skip to next iteration
+                continue
+            else:
+                names.append(ing_text)
+                merge_with_next = None
+                merge_with_next_idx = None
+
+                if self.foundation_foods:
+                    # NOTE(review): token_idx already includes merged indices, if any.
+                    tokens = [self.tokens[i] for i in token_idx]
+                    ff = match_foundation_foods(tokens)
+                    if ff:
+                        foundation_foods.add(ff)
+
+        if merge_with_next and merge_with_next_idx:
+            # Catch any remaining IngredientText objects marked as needing to be merged
+            # but haven't been.
+            names.append(merge_with_next)
+            if self.foundation_foods:
+                # merge_with_next_idx holds the (possibly merged) token indices.
+                tokens = [self.tokens[i] for i in merge_with_next_idx]
+                ff = match_foundation_foods(tokens)
+                if ff:
+                    foundation_foods.add(ff)
+
+        return self._deduplicate_names(names), list(foundation_foods)
+
     def _postprocess_indices(
         self, label_idx: list[int], selected_label: str
     ) -> IngredientText | None:
diff --git a/tests/postprocess/test_postprocess.py b/tests/postprocess/test_postprocess.py
@@ -41,6 +41,8 @@ def p():
 def p_string_numbers():
     """Define a PostProcessor object with discard_isolated_stop_words set to True
     to use for testing the PostProcessor class methods.
+
+    This sentence includes numbers written as words.
     """
     sentence = "2 butternut squash, about one and one-half pounds each"
     tokens = [
@@ -95,6 +97,8 @@ def p_string_numbers():
 def p_string_numbers_range():
     """Define a PostProcessor object with discard_isolated_stop_words set to True
     to use for testing the PostProcessor class methods.
+
+    This sentence includes a number range written in words.
     """
     sentence = "2 butternut squash, about one or two pounds each"
     tokens = [
@@ -149,6 +153,8 @@ def p_string_numbers_range():
 def p_postprep():
     """Define a PostProcessor object with discard_isolated_stop_words set to False
     to use for testing the PostProcessor class methods.
+
+    This sentence has the name after the preparation instruction.
     """
     sentence = "1 tbsp chopped pistachios"
     tokens = ["1", "tbsp", "chopped", "pistachios"]
@@ -204,6 +210,8 @@ def p_no_discard():
 def p_fraction_in_prep():
     """Define a PostProcessor object for sentence with a fraction in prep
     to use for testing the PostProcessor class methods.
+
+    This sentence includes a fraction in the preparation instructions.
     """
     sentence = "3 carrots, peeled and sliced into 5mm (¼in) coins"
     tokens = [
@@ -278,6 +286,8 @@ def p_fraction_in_prep():
 def p_fraction_range_in_prep():
     """Define a PostProcessor object for sentence with a fraction range in prep
     to use for testing the PostProcessor class methods.
+
+    This sentence includes a number range in the preparation instructions.
     """
     sentence = "3 carrots, peeled and sliced into 5-10mm (¼-½in) coins"
     tokens = [
@@ -348,6 +358,35 @@ def p_fraction_range_in_prep():
     return PostProcessor(sentence, tokens, pos_tags, labels, scores)
 
 
+@pytest.fixture
+def p_split_name():
+    """Define a PostProcessor object with discard_isolated_stop_words set to False
+    to use for testing the PostProcessor class methods.
+
+    This sentence has the name split by a token with a non-name label.
+    """
+    sentence = "5 fresh large basil leaves"
+    tokens = ["5", "fresh", "large", "basil", "leaves"]
+    pos_tags = ["CD", "JJ", "JJ", "NN", "NN"]
+    labels = ["QTY", "B_NAME_TOK", "SIZE", "B_NAME_TOK", "I_NAME_TOK"]
+    scores = [
+        0.99938548647492,
+        0.968725226931013,
+        0.9588222550056443,
+        0.5092435116086577,
+        0.9877923155569212,
+    ]
+
+    return PostProcessor(
+        sentence,
+        tokens,
+        pos_tags,
+        labels,
+        scores,
+        discard_isolated_stop_words=False,
+    )
+
+
 class TestPostProcessor__builtins__:
     def test__str__(self, p):
         """
@@ -490,7 +529,10 @@ def test_string_numbers_range(self, p_string_numbers_range):
         assert p_string_numbers_range.parsed == expected
 
     def test_postprep_amounts(self, p_postprep):
-        """ """
+        """
+        Test fixture returns expected ParsedIngredient object, with the preparation
+        tokens before the ingredient name.
+        """
         expected = ParsedIngredient(
             name=[
                 IngredientText(text="pistachios", confidence=0.998841, starting_index=3)
@@ -558,6 +600,10 @@ def test_no_discard_isolated_stop_words(self, p_no_discard):
         assert p_no_discard.parsed == expected
 
     def test_fraction_in_prep(self, p_fraction_in_prep):
+        """
+        Test fixture returns expected ParsedIngredient object, with the fraction in the
+        preparation instruction retained.
+        """
         expected = ParsedIngredient(
             name=[
                 IngredientText(text="carrots", confidence=0.998212, starting_index=1)
@@ -586,6 +632,10 @@ def test_fraction_in_prep(self, p_fraction_in_prep):
         assert p_fraction_in_prep.parsed == expected
 
     def test_fraction_range_in_prep(self, p_fraction_range_in_prep):
+        """
+        Test fixture returns expected ParsedIngredient object, with the fraction range
+        in the preparation instruction retained.
+        """
         expected = ParsedIngredient(
             name=[
                 IngredientText(text="carrots", confidence=0.998212, starting_index=1)
@@ -612,3 +662,35 @@ def test_fraction_range_in_prep(self, p_fraction_range_in_prep):
         )
 
         assert p_fraction_range_in_prep.parsed == expected
+
+    def test_split_ingredient_name(self, p_split_name):
+        """
+        Test fixture returns expected ParsedIngredient object, with a single name
+        despite a SIZE token splitting the name.
+        """
+        expected = ParsedIngredient(
+            name=[
+                IngredientText(
+                    text="fresh basil leaves",
+                    confidence=0.8586214999999999,
+                    starting_index=1,
+                )
+            ],
+            size=IngredientText(text="large", confidence=0.958822, starting_index=2),
+            amount=[
+                ingredient_amount_factory(
+                    quantity="5",
+                    unit="",
+                    text="5",
+                    confidence=0.999385,
+                    starting_index=0,
+                )
+            ],
+            preparation=None,
+            comment=None,
+            purpose=None,
+            foundation_foods=[],
+            sentence="5 fresh large basil leaves",
+        )
+
+        assert p_split_name.parsed == expected