
Commit 4ce0507

Feature: Remove size tokens from the start of multi-ingredient phrases and add tests for multi-ingredient phrases

1 parent 5975572 · commit 4ce0507

3 files changed (+120, -51 lines)

ingredient_parser/en/_constants.py
Lines changed: 30 additions & 18 deletions
@@ -123,24 +123,6 @@
 # since we need this in a few places
 FLATTENED_UNITS_LIST = set(chain.from_iterable(UNITS.items()))

-# Words that can modify a unit
-UNIT_MODIFIERS = [
-    "big",
-    "fat",
-    "generous",
-    "healthy",
-    "heaped",
-    "heaping",
-    "large",
-    "medium",
-    "medium-size",
-    "medium-sized",
-    "scant",
-    "small",
-    "thick",
-    "thin",
-]
-
 # Units that can be part of the name
 # e.g. 1 teaspoon ground cloves, or 5 bay leaves
 AMBIGUOUS_UNITS = [

@@ -163,6 +145,36 @@

 AMBIGUOUS_UNITS.extend(_ambiguous_units_alt_forms)

+# Words that indicate ingredient size
+SIZES = [
+    "big",
+    "bite-size",
+    "bite-sized",
+    "extra-large",
+    "jumbo",
+    "large",
+    "lg",
+    "little",
+    "md",
+    "medium",
+    "medium-large",
+    "medium-size",
+    "medium-sized",
+    "medium-small",
+    "medium-to-large",
+    "miniature",
+    "regular",
+    "slim",
+    "sm",
+    "small",
+    "small-to-medium",
+    "smaller",
+    "smallest",
+    "thick",
+    "thin",
+    "tiny",
+]
+

 # Strings and their numeric representation
 STRING_NUMBERS = {
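
The removed UNIT_MODIFIERS list is superseded by the broader SIZES list, which is consulted as a plain case-insensitive membership lookup. A minimal sketch of that usage follows; the `is_size` helper is hypothetical, not part of the library, and the list is abridged:

```python
# Hypothetical sketch, not library code: SIZES is consulted via a simple
# case-insensitive membership test. List abridged from the commit above.
SIZES = ["big", "bite-size", "extra-large", "jumbo", "large", "medium",
         "small", "thick", "thin", "tiny"]

def is_size(token: str) -> bool:
    """Return True if the token is a size descriptor."""
    return token.lower() in SIZES

assert is_size("Large")
assert not is_size("stock")
```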

ingredient_parser/en/_phrases.py
Lines changed: 25 additions & 33 deletions
@@ -3,7 +3,7 @@
 import nltk

 from ..dataclasses import Token
-from ._constants import FLATTENED_UNITS_LIST
+from ._constants import FLATTENED_UNITS_LIST, SIZES


 class MIP:

@@ -117,14 +117,20 @@ def detect_phrases(self, tokenized_sentence: list[Token]) -> list[list[int]]:
            if self._cc_is_not_or(text_pos, indices):
                continue

-            # If first item in list is a known unit, remove it.
+            # Remove any units or sizes from the beginning of the phrase
            first_idx = indices[0]
-            # TODO: also exclude sizes e.g. large, small. Needs a list of them.
-            if self.tokenized_sentence[first_idx].text.lower() in FLATTENED_UNITS_LIST:
+            tokens_to_discard = [*FLATTENED_UNITS_LIST, *SIZES]
+            while self.tokenized_sentence[first_idx].text.lower() in tokens_to_discard:
                indices = indices[1:]
-                # If first index is now a conjunction, skip.
-                if self.tokenized_sentence[indices[0]].pos_tag == "CC":
-                    continue
+                first_idx = indices[0]
+
+            # If phrase is empty, skip.
+            if not indices:
+                continue
+
+            # If first index is now a conjunction, skip.
+            if self.tokenized_sentence[indices[0]].pos_tag == "CC" or not indices:
+                continue

            phrases.append(indices)
@@ -134,8 +140,6 @@ def token_features(self, index: int, prefix: str) -> dict[str, str | bool]:
        """Return dict of features for token at index.

        Features:
-            "mip": True if index in phrase.
-            "cc_distance": Distance between index and conjunction in phrase.
            "mip_start": True if index at start of multi-ingredient phrase.
            "mip_end": True if index at end of multi-ingredient phrase.

@@ -157,43 +161,31 @@ def token_features(self, index: int, prefix: str) -> dict[str, str | bool]:
            if index not in phrase:
                continue

-            # features[prefix + "mip"] = True
-            # features[prefix + "cc_distance"] = str(
-            #     self._get_distance_from_cc(phrase, index)
-            # )
-
-            # if index == phrase[0]:
-            #     features[prefix + "mip_start"] = True
-            # if index == phrase[-1]:
-            #     features[prefix + "mip_end"] = True
+            if index == phrase[0]:
+                features[prefix + "mip_start"] = True

-            if self._candidate_name_mod(phrase, index):
-                # Token is first element of first subsection of phrase.
-                features[prefix + "name_mod_candidate"] = True
+            if index == phrase[-1]:
+                features[prefix + "mip_end"] = True

        return features

-    def _get_distance_from_cc(self, phrase: list[int], index: int) -> int:
-        """Calculate distance of index from index of conjunction ("CC") in phrase.
+    def _candidate_name_mod(self, phrase: list[int], index: int) -> bool:
+        """Return True if token at index in phrase is candidate for NAME_MOD label.
+
+        A token is a candidate for NAME_MOD if it is the first element of the phrase.

        Parameters
        ----------
        phrase : list[int]
-            Indices of phrase tokens.
+            List of token indices for phrase.
        index : int
-            Index to calculate distance for.
+            Index of token to consider.

        Returns
        -------
-        int
-            Distance from conjunction.
-            If index occurs before conjunction, this value is negative.
+        bool
+            True, if token is first in phrase.
        """
-        phrase_pos_tags = [self.tokenized_sentence[i].pos_tag for i in phrase]
-        cc_index = phrase_pos_tags.index("CC") + phrase[0]
-        return index - cc_index
-
-    def _candidate_name_mod(self, phrase: list[int], index: int) -> bool:
        split_phrase_tokens = list(self._split_phrase(self.tokenized_sentence, phrase))
        if len(split_phrase_tokens[0]) > 1:
            return split_phrase_tokens[0][0].index == index
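
For reference, `_candidate_name_mod` treats a token as a NAME_MOD candidate only when it is the first token of the phrase's first subsection and that subsection has more than one token. `_split_phrase` is not shown in this diff, so the sketch below assumes it splits on commas and conjunctions; the names and splitting rules are illustrative only:

```python
# Hedged sketch of the NAME_MOD candidate check. _split_phrase is not shown in
# this diff, so splitting on commas/conjunctions here is an assumption.
def candidate_name_mod(tokens: list[str], index: int) -> bool:
    """True if index is the first token of a multi-token first subsection."""
    subsections: list[list[int]] = [[]]
    for i, tok in enumerate(tokens):
        if tok in {",", "or", "and"}:
            subsections.append([])  # start a new subsection at each separator
        else:
            subsections[-1].append(i)
    first = subsections[0]
    return len(first) > 1 and first[0] == index

# "hot" begins the two-token subsection "hot chicken", so it is a candidate.
print(candidate_name_mod(["hot", "chicken", "or", "beef", "stock"], 0))  # True
print(candidate_name_mod(["chicken", "or", "beef", "stock"], 0))         # False
```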
New test file
Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
+from ingredient_parser.en import PreProcessor
+
+
+class Test_multi_ingredient_phrase_features:
+    def test_multi_ingredient_phrase_detection(self):
+        """
+        Test that multi ingredient phrase is correctly identified.
+        """
+        p = PreProcessor("2 tbsp chicken or beef stock")
+        assert p.mip.phrases == [[2, 3, 4, 5]]
+
+    def test_multi_ingredient_phrase_detection_with_name_mod(self):
+        """
+        Test that multi ingredient phrase with name modifier is correctly identified.
+        """
+        p = PreProcessor("2 tbsp hot chicken or beef stock")
+        assert p.mip.phrases == [[2, 3, 4, 5, 6]]
+
+    def test_extended_multi_ingredient_phrase_detection(self):
+        """
+        Test that extended multi ingredient phrase is correctly identified.
+        """
+        p = PreProcessor("2 tbsp olive, vegetable or sunflower oil")
+        assert p.mip.phrases == [[2, 3, 4, 5, 6, 7]]
+
+    def test_mip_start_feature_unit(self):
+        """
+        Test that the start of the multi ingredient phrase is correctly identified by
+        ignoring the units.
+        """
+        p = PreProcessor("2 tbsp olive, vegetable or sunflower oil")
+
+        # Assert that only the 3rd token has the `mip_start` feature.
+        for i, token_features in enumerate(p.sentence_features()):
+            if i == 2:
+                assert token_features.get("mip_start", False)
+            else:
+                assert not token_features.get("mip_start", False)
+
+    def test_mip_start_feature_size(self):
+        """
+        Test that the start of the multi ingredient phrase is correctly identified by
+        ignoring the size.
+        """
+        p = PreProcessor("1 large sweet or Yukon Gold potato")
+
+        # Assert that only the 3rd token has the `mip_start` feature.
+        for i, token_features in enumerate(p.sentence_features()):
+            if i == 2:
+                assert token_features.get("mip_start", False)
+            else:
+                assert not token_features.get("mip_start", False)
+
+    def test_mip_end_feature(self):
+        """
+        Test that the end of the multi ingredient phrase is correctly identified.
+        """
+        p = PreProcessor("2 tbsp hot chicken or beef stock")
+
+        # Assert that only the last token has the `mip_end` feature.
+        for i, token_features in enumerate(p.sentence_features()):
+            if i == len(p.sentence_features()) - 1:
+                assert token_features.get("mip_end", False)
+            else:
+                assert not token_features.get("mip_end", False)
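
Taken together, these tests double as a usage reference. A quick check of the new behaviour, with expected output inferred from the assertions above (the phrase list for this particular sentence is not asserted verbatim in the tests, so the values are an expectation rather than a recorded result):

```python
from ingredient_parser.en import PreProcessor

p = PreProcessor("1 large sweet or Yukon Gold potato")
# "1" and "large" are discarded, so the phrase starts at "sweet" (index 2).
print(p.mip.phrases)  # expected, per the tests above: [[2, 3, 4, 5, 6]]

features = p.sentence_features()
print(features[2].get("mip_start", False))  # expected: True ("sweet")
print(features[-1].get("mip_end", False))   # expected: True ("potato")
```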
