#!/usr/bin/env python3

from collections.abc import Iterator

import nltk

from ..dataclasses import Token
from ._constants import FLATTENED_UNITS_LIST


class MIP:
    """
    Multi-ingredient Phrases.

    This class handles the detection of multi-ingredient phrases in an ingredient
    sentence, and the generation of features for tokens within the multi-ingredient
    phrase.

    A multi-ingredient phrase is a phrase within an ingredient sentence that states a
    list of alternative ingredients for a given amount. For example:
    * 2 tbsp butter or olive oil
             ^^^^^^^^^^^^^^^^^^^
    * 1 cup vegetable, olive or sunflower oil
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    """

    mip_parser = nltk.RegexpParser(
        r"""
        # Extended multi-ingredient phrase consisting of 3 ingredients
        EMIP: {<NN.*|JJ.*>+<,><NN.*|JJ.*>+<CC><NN.*|JJ.*>*<NN.*>}
        # Multi-ingredient phrase consisting of 2 ingredients
        MIP: {<NN.*|JJ.*>+<CC><NN.*|JJ.*>*<NN.*>}
        """
    )
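    # As a rough illustration (actual tags depend on the POS tagger):
    # "butter or olive oil", tagged NN CC JJ NN, matches the MIP pattern;
    # "vegetable, olive or sunflower oil", tagged NN , JJ CC NN NN, matches EMIP.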

    def __init__(self, tokenized_sentence: list[Token]):
        self.tokenized_sentence = tokenized_sentence
        self.phrases: list[list[int]] = self.detect_phrases(tokenized_sentence)

    def _get_subtree_indices(
        self, parent_tree: nltk.Tree, subtree: nltk.Tree
    ) -> list[int]:
        """Get the indices of a subtree in the parent tree.

        Parameters
        ----------
        parent_tree : nltk.Tree
            Parent tree to find indices of subtree within.
        subtree : nltk.Tree
            Subtree to find within parent tree.

        Returns
        -------
        list[int]
            List of indices of subtree in parent tree.
            If not found, return empty list.
        """
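        # Slide a window the length of the subtree over the parent's leaves;
        # leaves are (text, pos) tuples, so a match requires both the token
        # text and its POS tag to agree.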
        parent_leaves = parent_tree.leaves()
        subtree_leaves = subtree.leaves()

        subtree_len = len(subtree_leaves)
        for i in range(len(parent_leaves) - subtree_len + 1):
            if (
                parent_leaves[i] == subtree_leaves[0]
                and parent_leaves[i : i + subtree_len] == subtree_leaves
            ):
                return list(range(i, i + subtree_len))

        return []

    def _cc_is_not_or(
        self, text_pos: list[tuple[str, str]], indices: list[int]
    ) -> bool:
        """Return True if the conjunction in the phrase is not "or".

        Parameters
        ----------
        text_pos : list[tuple[str, str]]
            List of (text, pos) tuples.
        indices : list[int]
            Indices of tokens in phrase.

        Returns
        -------
        bool
            True if phrase conjunction is not "or".
        """
        text = [text_pos[i][0] for i in indices]
        pos = [text_pos[i][1] for i in indices]
        try:
            cc_index = pos.index("CC")
            return text[cc_index].lower() != "or"
        except ValueError:
            return False

    def detect_phrases(self, tokenized_sentence: list[Token]) -> list[list[int]]:
        """Detect multi-ingredient phrases in tokenized sentence.

        Parameters
        ----------
        tokenized_sentence : list[Token]
            Tokenized sentence to detect phrases within.

        Returns
        -------
        list[list[int]]
            List of phrases. Each phrase is specified by the indices of the tokens in
            the tokenized sentence.
        """
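        # e.g. for "1 cup vegetable, olive or sunflower oil" the EMIP pattern
        # chunks "cup vegetable, olive or sunflower oil"; the unit "cup" is
        # then stripped below, leaving the indices of
        # "vegetable, olive or sunflower oil".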
        phrases = []

        text_pos = [(token.text, token.pos_tag) for token in tokenized_sentence]
        parsed = self.mip_parser.parse(text_pos)
        for subtree in parsed.subtrees(filter=lambda t: t.label() in ["EMIP", "MIP"]):
            indices = self._get_subtree_indices(parsed, subtree)
            # If the subtree was not found, skip.
            if not indices:
                continue

            # If the conjunction is not "or", skip.
            if self._cc_is_not_or(text_pos, indices):
                continue

            # If the first item in the phrase is a known unit, remove it.
            first_idx = indices[0]
            if tokenized_sentence[first_idx].text.lower() in FLATTENED_UNITS_LIST:
                indices = indices[1:]
                # If the first index is now a conjunction, skip.
                if tokenized_sentence[indices[0]].pos_tag == "CC":
                    continue

            phrases.append(indices)

        return phrases

    def token_features(self, index: int, prefix: str) -> dict[str, str | bool]:
        """Return dict of features for token at index.

        Features:
        "name_mod_candidate": True if the token is the first token of a
            multi-token first alternative in the phrase, e.g. "olive" in
            "olive oil or butter".

        The "mip", "cc_distance", "mip_start" and "mip_end" features are
        currently disabled; see the commented-out code below.

        Parameters
        ----------
        index : int
            Index of token to return features for.
        prefix : str
            Feature label prefix.

        Returns
        -------
        dict[str, str | bool]
            Dict of features.
            If index is not in phrase, return empty dict.
        """
        features = {}
        for phrase in self.phrases:
            if index not in phrase:
                continue

            # features[prefix + "mip"] = True
            # features[prefix + "cc_distance"] = str(
            #    self._get_distance_from_cc(phrase, index)
            # )

            # if index == phrase[0]:
            #    features[prefix + "mip_start"] = True
            # if index == phrase[-1]:
            #    features[prefix + "mip_end"] = True

            if self._candidate_name_mod(phrase, index):
                features["name_mod_candidate"] = True

        return features

    def _get_distance_from_cc(self, phrase: list[int], index: int) -> int:
        """Calculate distance of index from index of conjunction ("CC") in phrase.

        Parameters
        ----------
        phrase : list[int]
            Indices of phrase tokens.
        index : int
            Index to calculate distance for.

        Returns
        -------
        int
            Distance from conjunction.
            If index occurs before conjunction, this value is negative.
        """
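        # e.g. in the phrase "butter or olive oil", "butter" is at distance -1
        # from the conjunction and "oil" at distance +2.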
        phrase_pos_tags = [self.tokenized_sentence[i].pos_tag for i in phrase]
        cc_index = phrase_pos_tags.index("CC") + phrase[0]
        return index - cc_index

    def _candidate_name_mod(self, phrase: list[int], index: int) -> bool:
        """Return True if the token at index is the first token of the
        phrase's first alternative and that alternative has more than one
        token, e.g. "olive" in "olive oil or butter".
        """
        split_phrase_tokens = list(self._split_phrase(self.tokenized_sentence, phrase))
        if len(split_phrase_tokens[0]) > 1:
            return split_phrase_tokens[0][0].index == index

        return False

    def _split_phrase(
        self, tokenized_sentence: list[Token], phrase: list[int]
    ) -> Iterator[list[Token]]:
        """Split phrase tokens into alternatives, delimited by commas and
        conjunctions.

        Yield lists of tokens from the phrase, split wherever a token is a
        comma or has the "CC" part of speech tag. The delimiting tokens are
        included in the output as single-item lists.

        For example, the phrase "vegetable, olive or sunflower oil" yields
        (as lists of tokens):
            [vegetable], [,], [olive], [or], [sunflower, oil]

        Parameters
        ----------
        tokenized_sentence : list[Token]
            Tokenized sentence.
        phrase : list[int]
            Indices of phrase tokens.

        Yields
        ------
        list[Token]
            Lists of tokens, split at commas and conjunctions.
        """
        phrase_tokens = [tokenized_sentence[i] for i in phrase]

        buf = []
        for token in phrase_tokens:
            if token.text == "," or token.pos_tag == "CC":
                yield buf
                yield [token]
                buf = []
            else:
                buf.append(token)
        yield buf