Skip to content

Commit 46b79c2

Browse files
committed
Feature: Only call prepare_embeddings_tokens once at the start of the foundation food matching process, so we can use the prepared tokens to simplify the override matching too.
1 parent e8d6f2c commit 46b79c2

File tree

1 file changed

+23
-23
lines changed

1 file changed

+23
-23
lines changed

ingredient_parser/en/_foundationfoods.py

Lines changed: 23 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
import csv
44
import gzip
5-
import string
65
from collections import defaultdict
76
from dataclasses import dataclass
87
from functools import lru_cache
@@ -28,13 +27,6 @@
2827
"Dairy and Egg Products",
2928
"foundation_food",
3029
),
31-
("eggs",): FoundationFood(
32-
"Eggs, Grade A, Large, egg whole",
33-
1,
34-
748967,
35-
"Dairy and Egg Products",
36-
"foundation_food",
37-
),
3830
("butter",): FoundationFood(
3931
"Butter, stick, unsalted",
4032
1,
@@ -49,6 +41,13 @@
4941
"Vegetables and Vegetable Products",
5042
"foundation_food",
5143
),
44+
("garlic",): FoundationFood(
45+
"Garlic, raw",
46+
1,
47+
1104647,
48+
"Vegetables and Vegetable Products",
49+
"foundation_food",
50+
),
5251
}
5352

5453
# List of preferred FDC data types.
@@ -299,15 +298,14 @@ def find_best_match(self, tokens: list[str]) -> list[FDCIngredientMatch]:
299298
Parameters
300299
----------
301300
tokens : list[str]
302-
List of tokens.
301+
List of tokens, prepared for use with embeddings.
303302
304303
Returns
305304
-------
306305
list[FDCIngredientMatch]
307306
List of best matching FDC ingredient for each data type.
308307
"""
309-
prepared_tokens = prepare_embeddings_tokens(tuple(tokens))
310-
input_token_vector = self._embed(prepared_tokens)
308+
input_token_vector = self._embed(tokens)
311309

312310
best_scores = []
313311
for data_type in PREFERRED_DATATYPES:
@@ -518,18 +516,13 @@ def find_best_match(
518516
Parameters
519517
----------
520518
ingredient_name_tokens : list[str]
521-
Token for ingredient name.
519+
Tokens for ingredient name, prepared for use with embeddings.
522520
fdc_ingredients : list[FDCIngredient]
523521
List of candidate FDC ingredients.
524522
"""
525-
prepared_ingredient_name_tokens = prepare_embeddings_tokens(
526-
tuple(ingredient_name_tokens)
527-
)
528523
scored: list[FDCIngredientMatch] = []
529524
for fdc in fdc_ingredients:
530-
score = self._fuzzy_document_distance(
531-
prepared_ingredient_name_tokens, fdc.tokens
532-
)
525+
score = self._fuzzy_document_distance(ingredient_name_tokens, fdc.tokens)
533526
scored.append(FDCIngredientMatch(fdc=fdc, score=score))
534527

535528
sorted_matches = sorted(scored, key=lambda x: x.score)
@@ -573,6 +566,10 @@ def match_foundation_foods(tokens: list[str]) -> FoundationFood | None:
573566
The second stage selects the best of these candidates using a fuzzy embedding
574567
document metric.
575568
569+
Two stages are needed because the ingredient embeddings do not seem to be as
570+
accurate as off-the-shelf pre-trained general embeddings are for general tasks.
571+
Improving the quality of the embeddings might remove the need for the second stage.
572+
576573
Parameters
577574
----------
578575
tokens : list[str]
@@ -583,15 +580,18 @@ def match_foundation_foods(tokens: list[str]) -> FoundationFood | None:
583580
FoundationFood | None
584581
Matching foundation food, or None if no match can be found.
585582
"""
586-
override_name = tuple(t.lower() for t in tokens if t not in string.punctuation)
587-
if override_name in FOUNDATION_FOOD_OVERRIDES:
588-
return FOUNDATION_FOOD_OVERRIDES[override_name]
583+
prepared_tokens = prepare_embeddings_tokens(tuple(tokens))
584+
585+
if tuple(prepared_tokens) in FOUNDATION_FOOD_OVERRIDES:
586+
return FOUNDATION_FOOD_OVERRIDES[tuple(prepared_tokens)]
589587

590588
u = get_usif_matcher()
591-
candidate_matches = u.find_candidate_matches(tokens)
589+
candidate_matches = u.find_candidate_matches(prepared_tokens)
592590

593591
fuzzy = get_fuzzy_matcher()
594-
best_match = fuzzy.find_best_match(tokens, [m.fdc for m in candidate_matches])
592+
best_match = fuzzy.find_best_match(
593+
prepared_tokens, [m.fdc for m in candidate_matches]
594+
)
595595

596596
if best_match.score <= 0.35:
597597
return FoundationFood(

0 commit comments

Comments
 (0)