Skip to content

Commit c0d315e

Browse files
committed
Refactor: Remove selection of best match using preferred FDC dataset from FuzzyEmbeddingMatcher because the biasing towards "raw" foods achieves the same end goal.
1 parent 2d34345 commit c0d315e

File tree

1 file changed

+1
-56
lines changed

1 file changed

+1
-56
lines changed

ingredient_parser/en/_foundationfoods.py

Lines changed: 1 addition & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -77,14 +77,6 @@
7777
),
7878
}
7979

80-
# List of preferred FDC data types.
81-
# Increasing value indicates decreasing preference.
82-
PREFERRED_DATATYPES = {
83-
"foundation_food": 0, # Most preferred
84-
"survey_fndds_food": 1,
85-
"sr_legacy_food": 2,
86-
}
87-
8880
# Verb stems, the presence of which indicates the food is not raw and therefore should
8981
# not be biased towards a raw food.
9082
NON_RAW_FOOD_VERB_STEMS = {
@@ -565,53 +557,6 @@ def _fuzzy_document_distance(
565557

566558
return 1 - res
567559

568-
def _select_best_match(
569-
self, matches: list[FDCIngredientMatch]
570-
) -> FDCIngredientMatch:
571-
"""Select the best match from the list of sorted candidate matches, accounting
572-
for data type preferences.
573-
574-
Select all matches with scores within 20% of the best score, then iterate
575-
through the data types in order of preference and select the best match from
576-
the most preferred data type where there is a match.
577-
578-
Parameters
579-
----------
580-
matches : list[FDCIngredientMatch]
581-
Sorted list of candidate FDC matches.
582-
583-
Returns
584-
-------
585-
FDCIngredientMatch
586-
Selected FDC ingredient.
587-
"""
588-
if len(matches) == 1:
589-
return matches[0]
590-
591-
best_score = matches[0].score
592-
if best_score == 0:
593-
# Exact match
594-
return matches[0]
595-
596-
# Find other matches with score within 20% of best
597-
alternatives = [
598-
match for match in matches if (match.score - best_score) / best_score <= 0.2
599-
]
600-
if alternatives:
601-
logger.debug(
602-
f"Selecting best match from {len(alternatives)} candidates based on preferred FDC datatype." # noqa
603-
)
604-
for data_type in PREFERRED_DATATYPES:
605-
# Note that these are sorted in order of best score first because the
606-
# alternatives list is sorted.
607-
data_type_matches = [
608-
m for m in alternatives if m.fdc.data_type == data_type
609-
]
610-
if data_type_matches and data_type_matches[0].score <= 0.4:
611-
return data_type_matches[0]
612-
613-
return matches[0]
614-
615560
def find_best_match(
616561
self, ingredient_name_tokens: list[str], fdc_ingredients: list[FDCIngredient]
617562
) -> FDCIngredientMatch:
@@ -646,7 +591,7 @@ def find_best_match(
646591
scored.append(FDCIngredientMatch(fdc=fdc, score=score))
647592

648593
sorted_matches = sorted(scored, key=lambda x: x.score)
649-
return self._select_best_match(sorted_matches)
594+
return sorted_matches[0]
650595

651596

652597
@lru_cache

0 commit comments

Comments
 (0)