| 
77 | 77 |     ),  | 
78 | 78 | }  | 
79 | 79 | 
 
  | 
80 |  | -# List of preferred FDC data types.  | 
81 |  | -# Increasing value indicates decreasing preference.  | 
82 |  | -PREFERRED_DATATYPES = {  | 
83 |  | -    "foundation_food": 0,  #  Most preferred  | 
84 |  | -    "survey_fndds_food": 1,  | 
85 |  | -    "sr_legacy_food": 2,  | 
86 |  | -}  | 
87 |  | - | 
88 | 80 | # Verb stems, the presence of which indicates the food is not raw and therefore should  | 
89 | 81 | # not be biased towards a raw food.  | 
90 | 82 | NON_RAW_FOOD_VERB_STEMS = {  | 
@@ -565,53 +557,6 @@ def _fuzzy_document_distance(  | 
565 | 557 | 
 
  | 
566 | 558 |         return 1 - res  | 
567 | 559 | 
 
  | 
568 |  | -    def _select_best_match(  | 
569 |  | -        self, matches: list[FDCIngredientMatch]  | 
570 |  | -    ) -> FDCIngredientMatch:  | 
571 |  | -        """Select the best match from the list of sorted candidate matches, accounting  | 
572 |  | -        for data type preferences.  | 
573 |  | -
  | 
574 |  | -        Select all matches with scores within 20% of the best score, then iterate  | 
575 |  | -        through the data types in order of preference and select the best match from  | 
576 |  | -        the most preferred data type where there is a match.  | 
577 |  | -
  | 
578 |  | -        Parameters  | 
579 |  | -        ----------  | 
580 |  | -        matches : list[FDCIngredientMatch]  | 
581 |  | -            Sorted list of candidate FDC matches.  | 
582 |  | -
  | 
583 |  | -        Returns  | 
584 |  | -        -------  | 
585 |  | -        FDCIngredientMatch  | 
586 |  | -            Selected FDC ingredient.  | 
587 |  | -        """  | 
588 |  | -        if len(matches) == 1:  | 
589 |  | -            return matches[0]  | 
590 |  | - | 
591 |  | -        best_score = matches[0].score  | 
592 |  | -        if best_score == 0:  | 
593 |  | -            # Exact match  | 
594 |  | -            return matches[0]  | 
595 |  | - | 
596 |  | -        # Find other matches with score within 20% of best  | 
597 |  | -        alternatives = [  | 
598 |  | -            match for match in matches if (match.score - best_score) / best_score <= 0.2  | 
599 |  | -        ]  | 
600 |  | -        if alternatives:  | 
601 |  | -            logger.debug(  | 
602 |  | -                f"Selecting best match from {len(alternatives)} candidates based on preferred FDC datatype."  # noqa  | 
603 |  | -            )  | 
604 |  | -        for data_type in PREFERRED_DATATYPES:  | 
605 |  | -            # Note that these are sorted in order of best score first because the  | 
606 |  | -            # alternatives list is sorted.  | 
607 |  | -            data_type_matches = [  | 
608 |  | -                m for m in alternatives if m.fdc.data_type == data_type  | 
609 |  | -            ]  | 
610 |  | -            if data_type_matches and data_type_matches[0].score <= 0.4:  | 
611 |  | -                return data_type_matches[0]  | 
612 |  | - | 
613 |  | -        return matches[0]  | 
614 |  | - | 
615 | 560 |     def find_best_match(  | 
616 | 561 |         self, ingredient_name_tokens: list[str], fdc_ingredients: list[FDCIngredient]  | 
617 | 562 |     ) -> FDCIngredientMatch:  | 
@@ -646,7 +591,7 @@ def find_best_match(  | 
646 | 591 |             scored.append(FDCIngredientMatch(fdc=fdc, score=score))  | 
647 | 592 | 
 
  | 
648 | 593 |         sorted_matches = sorted(scored, key=lambda x: x.score)  | 
649 |  | -        return self._select_best_match(sorted_matches)  | 
 | 594 | +        return sorted_matches[0]  | 
650 | 595 | 
 
  | 
651 | 596 | 
 
  | 
652 | 597 | @lru_cache  | 
 | 
0 commit comments