22
33import  csv 
44import  gzip 
5- import  string 
65from  collections  import  defaultdict 
76from  dataclasses  import  dataclass 
87from  functools  import  lru_cache 
2827        "Dairy and Egg Products" ,
2928        "foundation_food" ,
3029    ),
31-     ("eggs" ,): FoundationFood (
32-         "Eggs, Grade A, Large, egg whole" ,
33-         1 ,
34-         748967 ,
35-         "Dairy and Egg Products" ,
36-         "foundation_food" ,
37-     ),
3830    ("butter" ,): FoundationFood (
3931        "Butter, stick, unsalted" ,
4032        1 ,
4941        "Vegetables and Vegetable Products" ,
5042        "foundation_food" ,
5143    ),
44+     ("garlic" ,): FoundationFood (
45+         "Garlic, raw" ,
46+         1 ,
47+         1104647 ,
48+         "Vegetables and Vegetable Products" ,
49+         "foundation_food" ,
50+     ),
5251}
5352
5453# List of preferred FDC data types. 
@@ -299,15 +298,14 @@ def find_best_match(self, tokens: list[str]) -> list[FDCIngredientMatch]:
299298        Parameters 
300299        ---------- 
301300        tokens : list[str] 
302-             List of tokens. 
301+             List of tokens, prepared for use with embeddings.
303302
304303        Returns 
305304        ------- 
306305        list[FDCIngredientMatch] 
307306            List of best matching FDC ingredient for each data type. 
308307        """ 
309-         prepared_tokens  =  prepare_embeddings_tokens (tuple (tokens ))
310-         input_token_vector  =  self ._embed (prepared_tokens )
308+         input_token_vector  =  self ._embed (tokens )
311309
312310        best_scores  =  []
313311        for  data_type  in  PREFERRED_DATATYPES :
@@ -518,18 +516,13 @@ def find_best_match(
518516        Parameters 
519517        ---------- 
520518        ingredient_name_tokens : list[str] 
521-             Token  for ingredient name. 
519+             Tokens for ingredient name, prepared for use with embeddings.
522520        fdc_ingredients : list[FDCIngredient] 
523521            List of candidate FDC ingredients. 
524522        """ 
525-         prepared_ingredient_name_tokens  =  prepare_embeddings_tokens (
526-             tuple (ingredient_name_tokens )
527-         )
528523        scored : list [FDCIngredientMatch ] =  []
529524        for  fdc  in  fdc_ingredients :
530-             score  =  self ._fuzzy_document_distance (
531-                 prepared_ingredient_name_tokens , fdc .tokens 
532-             )
525+             score  =  self ._fuzzy_document_distance (ingredient_name_tokens , fdc .tokens )
533526            scored .append (FDCIngredientMatch (fdc = fdc , score = score ))
534527
535528        sorted_matches  =  sorted (scored , key = lambda  x : x .score )
@@ -573,6 +566,10 @@ def match_foundation_foods(tokens: list[str]) -> FoundationFood | None:
573566    The second stage selects the best of these candidates using a fuzzy embedding 
574567    document metric. 
575568
569+     Two stages are needed because the ingredient embeddings do not seem to be as 
570+     accurate as off-the-shelf pre-trained general embeddings are for general tasks. 
571+     Improving the quality of the embeddings might remove the need for the second stage. 
572+ 
576573    Parameters 
577574    ---------- 
578575    tokens : list[str] 
@@ -583,15 +580,18 @@ def match_foundation_foods(tokens: list[str]) -> FoundationFood | None:
583580    FoundationFood | None 
584581        Matching foundation food, or None if no match can be found. 
585582    """ 
586-     override_name  =  tuple (t .lower () for  t  in  tokens  if  t  not  in   string .punctuation )
587-     if  override_name  in  FOUNDATION_FOOD_OVERRIDES :
588-         return  FOUNDATION_FOOD_OVERRIDES [override_name ]
583+     prepared_tokens  =  prepare_embeddings_tokens (tuple (tokens ))
584+ 
585+     if  tuple (prepared_tokens ) in  FOUNDATION_FOOD_OVERRIDES :
586+         return  FOUNDATION_FOOD_OVERRIDES [tuple (prepared_tokens )]
589587
590588    u  =  get_usif_matcher ()
591-     candidate_matches  =  u .find_candidate_matches (tokens )
589+     candidate_matches  =  u .find_candidate_matches (prepared_tokens )
592590
593591    fuzzy  =  get_fuzzy_matcher ()
594-     best_match  =  fuzzy .find_best_match (tokens , [m .fdc  for  m  in  candidate_matches ])
592+     best_match  =  fuzzy .find_best_match (
593+         prepared_tokens , [m .fdc  for  m  in  candidate_matches ]
594+     )
595595
596596    if  best_match .score  <=  0.35 :
597597        return  FoundationFood (
0 commit comments