33import  nltk 
44
55from  ..dataclasses  import  Token 
6- from  ._constants  import  FLATTENED_UNITS_LIST 
6+ from  ._constants  import  FLATTENED_UNITS_LIST ,  SIZES 
77
88
99class  MIP :
@@ -117,14 +117,20 @@ def detect_phrases(self, tokenized_sentence: list[Token]) -> list[list[int]]:
117117            if  self ._cc_is_not_or (text_pos , indices ):
118118                continue 
119119
120-             # If first item in list is a known unit, remove it.  
120+             # Remove any units or sizes from the beginning of the phrase  
121121            first_idx  =  indices [0 ]
122-             # TODO: also exclude sizes e.g. large, small. Needs a list of them. 
123-             if  self .tokenized_sentence [first_idx ].text .lower () in  FLATTENED_UNITS_LIST :
122+             tokens_to_discard   =  [ * FLATTENED_UNITS_LIST ,  * SIZES ] 
123+             while  self .tokenized_sentence [first_idx ].text .lower () in  tokens_to_discard :
124124                indices  =  indices [1 :]
125-                 # If first index is now a conjunction, skip. 
126-                 if  self .tokenized_sentence [indices [0 ]].pos_tag  ==  "CC" :
127-                     continue 
125+                 first_idx  =  indices [0 ]
126+ 
127+             # If phrase is empty, skip. 
128+             if  not  indices :
129+                 continue 
130+ 
131+             # If first index is now a conjunction, skip. 
132+             if  self .tokenized_sentence [indices [0 ]].pos_tag  ==  "CC"  or  not  indices :
133+                 continue 
128134
129135            phrases .append (indices )
130136
@@ -134,8 +140,6 @@ def token_features(self, index: int, prefix: str) -> dict[str, str | bool]:
134140        """Return dict of features for token at index. 
135141
136142        Features: 
137-         "mip": True if index in phrase. 
138-         "cc_distance": Distance between index and conjunction in phrase. 
139143        "mip_start": True if index at start of multi-ingredient phrase. 
140144        "mip_end": True if index at end of multi-ingredient phrase. 
141145
@@ -157,43 +161,31 @@ def token_features(self, index: int, prefix: str) -> dict[str, str | bool]:
157161            if  index  not  in   phrase :
158162                continue 
159163
160-             # features[prefix + "mip"] = True 
161-             # features[prefix + "cc_distance"] = str( 
162-             #    self._get_distance_from_cc(phrase, index) 
163-             # ) 
164- 
165-             # if index == phrase[0]: 
166-             #    features[prefix + "mip_start"] = True 
167-             # if index == phrase[-1]: 
168-             #    features[prefix + "mip_end"] = True 
164+             if  index  ==  phrase [0 ]:
165+                 features [prefix  +  "mip_start" ] =  True 
169166
170-             if  self ._candidate_name_mod (phrase , index ):
171-                 # Token is first element of first subsection of phrase. 
172-                 features [prefix  +  "name_mod_candidate" ] =  True 
167+             if  index  ==  phrase [- 1 ]:
168+                 features [prefix  +  "mip_end" ] =  True 
173169
174170        return  features 
175171
176-     def  _get_distance_from_cc (self , phrase : list [int ], index : int ) ->  int :
177-         """Calculate distance of index from index of conjunction ("CC") in phrase. 
172+     def  _candidate_name_mod (self , phrase : list [int ], index : int ) ->  bool :
173+         """Return True if token at index in phrase is candidate for NAME_MOD label. 
174+ 
174+         A token is a candidate for NAME_MOD if it is the first token of the first subsection of the phrase and that subsection has more than one token.
178176
179177        Parameters 
180178        ---------- 
181179        phrase : list[int] 
182-             Indices  of phrase tokens . 
180+             List  of token indices for phrase . 
183181        index : int 
184-             Index to calculate distance for . 
182+             Index of token to consider . 
185183
186184        Returns 
187185        ------- 
188-         int 
189-             Distance from conjunction. 
190-             If index occurs before conjunction, this value is negative. 
186+         bool 
187+             True, if token is first in phrase. 
191188        """ 
192-         phrase_pos_tags  =  [self .tokenized_sentence [i ].pos_tag  for  i  in  phrase ]
193-         cc_index  =  phrase_pos_tags .index ("CC" ) +  phrase [0 ]
194-         return  index  -  cc_index 
195- 
196-     def  _candidate_name_mod (self , phrase : list [int ], index : int ) ->  bool :
197189        split_phrase_tokens  =  list (self ._split_phrase (self .tokenized_sentence , phrase ))
198190        if  len (split_phrase_tokens [0 ]) >  1 :
199191            return  split_phrase_tokens [0 ][0 ].index  ==  index 
0 commit comments