@@ -72,26 +72,25 @@ class FDCIngredientMatch:
7272
7373
7474@lru_cache  
75- def  load_fdc_ingredients () ->  dict [ str ,  list [FDCIngredient ] ]:
75+ def  load_fdc_ingredients () ->  list [FDCIngredient ]:
7676    """Cached function for loading FDC ingredients from CSV. 
7777
7878    Returns 
7979    ------- 
80-     dict[str,  list[FDCIngredient] ] 
81-         List of FDC ingredients, grouped by data type . 
80+     list[FDCIngredient] 
81+         List of FDC ingredients. 
8282    """ 
83-     foundation_foods  =  defaultdict ( list ) 
83+     foundation_foods  =  [] 
8484    with  as_file (files (__package__ ) /  "fdc_ingredients.csv.gz" ) as  p :
8585        with  gzip .open (p , "rt" ) as  f :
8686            reader  =  csv .DictReader (f )
8787            for  row  in  reader :
88-                 data_type  =  row ["data_type" ]
8988                tokens  =  tuple (tokenize (row ["description" ]))
9089                prepared_tokens  =  prepare_embeddings_tokens (tokens )
91-                 foundation_foods [ data_type ] .append (
90+                 foundation_foods .append (
9291                    FDCIngredient (
9392                        fdc_id = int (row ["fdc_id" ]),
94-                         data_type = data_type ,
 93+                         data_type = row ["data_type" ],
9594                        description = row ["description" ],
9695                        category = row ["category" ],
9796                        tokens = prepared_tokens ,
@@ -133,11 +132,11 @@ class uSIF:
133132        Dictionary of token probabilities. 
134133    """ 
135134
136-     def  __init__ (self , embeddings , fdc_ingredients : dict [ str ,  list [FDCIngredient ] ]):
135+     def  __init__ (self , embeddings , fdc_ingredients : list [FDCIngredient ]):
137136        self .embeddings  =  embeddings 
138137        self .embeddings_dimension : int  =  embeddings .get_dimension ()
139138
140-         self .fdc_ingredients : dict [ str ,  list [FDCIngredient ] ] =  fdc_ingredients 
139+         self .fdc_ingredients : list [FDCIngredient ] =  fdc_ingredients 
141140        self .token_prob : dict [str , float ] =  self ._estimate_token_probability (
142141            self .fdc_ingredients 
143142        )
@@ -147,14 +146,14 @@ def __init__(self, embeddings, fdc_ingredients: dict[str, list[FDCIngredient]]):
147146        self .fdc_vectors  =  self ._embed_fdc_ingredients ()
148147
149148    def  _estimate_token_probability (
150-         self , fdc_ingredients : dict [ str ,  list [FDCIngredient ] ]
149+         self , fdc_ingredients : list [FDCIngredient ]
151150    ) ->  dict [str , float ]:
152151        """Estimate word probability from the frequency of occurrence of token in FDC 
153152        ingredient descriptions. 
154153
155154        Parameters 
156155        ---------- 
157-         fdc_ingredients : dict[str,  list[FDCIngredient] ] 
156+         fdc_ingredients : list[FDCIngredient] 
158157            List of FDC ingredient objects. 
159158
160159        Returns 
@@ -163,10 +162,9 @@ def _estimate_token_probability(
163162            Dict of token: probability. 
164163        """ 
165164        token_counts  =  defaultdict (int )
166-         for  data_type  in  PREFERRED_DATATYPES :
167-             for  ingredient  in  fdc_ingredients [data_type ]:
168-                 for  token  in  ingredient .tokens :
169-                     token_counts [token ] +=  1 
165+         for  ingredient  in  fdc_ingredients :
166+             for  token  in  ingredient .tokens :
167+                 token_counts [token ] +=  1 
170168
171169        total  =  sum (token_counts .values ())
172170        return  {token : count  /  total  for  token , count  in  token_counts .items ()}
@@ -181,10 +179,9 @@ def _average_sentence_length(self) -> int:
181179        """ 
182180        token_count  =  0 
183181        sentence_count  =  0 
184-         for  data_type  in  PREFERRED_DATATYPES :
185-             for  fdc  in  self .fdc_ingredients [data_type ]:
186-                 token_count  +=  len (fdc .tokens )
187-                 sentence_count  +=  1 
182+         for  fdc  in  self .fdc_ingredients :
183+             token_count  +=  len (fdc .tokens )
184+             sentence_count  +=  1 
188185
189186        return  int (token_count  /  sentence_count )
190187
@@ -222,18 +219,17 @@ def _weight(self, token: str) -> float:
222219        """ 
223220        return  self .a  /  (0.5  *  self .a  +  self .token_prob .get (token , self .min_prob ))
224221
225-     def  _embed_fdc_ingredients (self ) ->  dict [ str ,  list [np .ndarray ] ]:
222+     def  _embed_fdc_ingredients (self ) ->  list [np .ndarray ]:
226223        """Calculate embedding vectors for all FDC ingredients. 
227224
228225        Returns 
229226        ------- 
230-         dict[str, list[np.ndarray]] 
231-             Dict of embedding vectors for FDC ingredients, grouped by data type. 
227+         list[np.ndarray] 
228+             List of embedding vectors for FDC ingredients. 
232229        """ 
233-         vectors  =  defaultdict (list )
234-         for  data_type  in  PREFERRED_DATATYPES :
235-             for  fdc  in  self .fdc_ingredients [data_type ]:
236-                 vectors [data_type ].append (self ._embed (fdc .tokens ))
230+         vectors  =  []
231+         for  fdc  in  self .fdc_ingredients :
232+             vectors .append (self ._embed (fdc .tokens ))
237233
238234        return  vectors 
239235
@@ -309,16 +305,15 @@ def find_candidate_matches(
309305        input_token_vector  =  self ._embed (prepared_tokens )
310306
311307        candidates  =  []
312-         for  data_type  in  PREFERRED_DATATYPES :
313-             for  idx , vec  in  enumerate (self .fdc_vectors [data_type ]):
314-                 score  =  self ._cosine_similarity (input_token_vector , vec )
315-                 if  score  <=  cutoff :
316-                     candidates .append (
317-                         FDCIngredientMatch (
318-                             fdc = self .fdc_ingredients [data_type ][idx ],
319-                             score = score ,
320-                         )
308+         for  idx , vec  in  enumerate (self .fdc_vectors ):
309+             score  =  self ._cosine_similarity (input_token_vector , vec )
310+             if  score  <=  cutoff :
311+                 candidates .append (
312+                     FDCIngredientMatch (
313+                         fdc = self .fdc_ingredients [idx ],
314+                         score = score ,
321315                    )
316+                 )
322317
323318        return  candidates 
324319
0 commit comments