Skip to content

Commit 9a1b59a

Browse files
committed
Refactor: When loading FDC ingredients from csv, don't group by data type, just load into a list.
1 parent 34bd8a4 commit 9a1b59a

File tree

1 file changed

+28
-33
lines changed

1 file changed

+28
-33
lines changed

ingredient_parser/en/_foundationfoods.py

Lines changed: 28 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -72,26 +72,25 @@ class FDCIngredientMatch:
7272

7373

7474
@lru_cache
75-
def load_fdc_ingredients() -> dict[str, list[FDCIngredient]]:
75+
def load_fdc_ingredients() -> list[FDCIngredient]:
7676
"""Cached function for loading FDC ingredients from CSV.
7777
7878
Returns
7979
-------
80-
dict[str, list[FDCIngredient]]
81-
List of FDC ingredients, grouped by data type.
80+
list[FDCIngredient]
81+
List of FDC ingredients.
8282
"""
83-
foundation_foods = defaultdict(list)
83+
foundation_foods = []
8484
with as_file(files(__package__) / "fdc_ingredients.csv.gz") as p:
8585
with gzip.open(p, "rt") as f:
8686
reader = csv.DictReader(f)
8787
for row in reader:
88-
data_type = row["data_type"]
8988
tokens = tuple(tokenize(row["description"]))
9089
prepared_tokens = prepare_embeddings_tokens(tokens)
91-
foundation_foods[data_type].append(
90+
foundation_foods.append(
9291
FDCIngredient(
9392
fdc_id=int(row["fdc_id"]),
94-
data_type=data_type,
93+
data_type=row["data_type"],
9594
description=row["description"],
9695
category=row["category"],
9796
tokens=prepared_tokens,
@@ -133,11 +132,11 @@ class uSIF:
133132
Dictionary of token probabilities.
134133
"""
135134

136-
def __init__(self, embeddings, fdc_ingredients: dict[str, list[FDCIngredient]]):
135+
def __init__(self, embeddings, fdc_ingredients: list[FDCIngredient]):
137136
self.embeddings = embeddings
138137
self.embeddings_dimension: int = embeddings.get_dimension()
139138

140-
self.fdc_ingredients: dict[str, list[FDCIngredient]] = fdc_ingredients
139+
self.fdc_ingredients: list[FDCIngredient] = fdc_ingredients
141140
self.token_prob: dict[str, float] = self._estimate_token_probability(
142141
self.fdc_ingredients
143142
)
@@ -147,14 +146,14 @@ def __init__(self, embeddings, fdc_ingredients: dict[str, list[FDCIngredient]]):
147146
self.fdc_vectors = self._embed_fdc_ingredients()
148147

149148
def _estimate_token_probability(
150-
self, fdc_ingredients: dict[str, list[FDCIngredient]]
149+
self, fdc_ingredients: list[FDCIngredient]
151150
) -> dict[str, float]:
152151
"""Estimate word probability from the frequency of occurrence of token in FDC
153152
ingredient descriptions.
154153
155154
Parameters
156155
----------
157-
fdc_ingredients : dict[str, list[FDCIngredient]]
156+
fdc_ingredients : list[FDCIngredient]
158157
List of FDC ingredient objects.
159158
160159
Returns
@@ -163,10 +162,9 @@ def _estimate_token_probability(
163162
Dict of token: probability.
164163
"""
165164
token_counts = defaultdict(int)
166-
for data_type in PREFERRED_DATATYPES:
167-
for ingredient in fdc_ingredients[data_type]:
168-
for token in ingredient.tokens:
169-
token_counts[token] += 1
165+
for ingredient in fdc_ingredients:
166+
for token in ingredient.tokens:
167+
token_counts[token] += 1
170168

171169
total = sum(token_counts.values())
172170
return {token: count / total for token, count in token_counts.items()}
@@ -181,10 +179,9 @@ def _average_sentence_length(self) -> int:
181179
"""
182180
token_count = 0
183181
sentence_count = 0
184-
for data_type in PREFERRED_DATATYPES:
185-
for fdc in self.fdc_ingredients[data_type]:
186-
token_count += len(fdc.tokens)
187-
sentence_count += 1
182+
for fdc in self.fdc_ingredients:
183+
token_count += len(fdc.tokens)
184+
sentence_count += 1
188185

189186
return int(token_count / sentence_count)
190187

@@ -222,18 +219,17 @@ def _weight(self, token: str) -> float:
222219
"""
223220
return self.a / (0.5 * self.a + self.token_prob.get(token, self.min_prob))
224221

225-
def _embed_fdc_ingredients(self) -> dict[str, list[np.ndarray]]:
222+
def _embed_fdc_ingredients(self) -> list[np.ndarray]:
226223
"""Calculate embedding vectors for all FDC ingredients.
227224
228225
Returns
229226
-------
230227
list[np.ndarray]
231228
List of embedding vectors for FDC ingredients.
232229
"""
233-
vectors = defaultdict(list)
234-
for data_type in PREFERRED_DATATYPES:
235-
for fdc in self.fdc_ingredients[data_type]:
236-
vectors[data_type].append(self._embed(fdc.tokens))
230+
vectors = []
231+
for fdc in self.fdc_ingredients:
232+
vectors.append(self._embed(fdc.tokens))
237233

238234
return vectors
239235

@@ -309,16 +305,15 @@ def find_candidate_matches(
309305
input_token_vector = self._embed(prepared_tokens)
310306

311307
candidates = []
312-
for data_type in PREFERRED_DATATYPES:
313-
for idx, vec in enumerate(self.fdc_vectors[data_type]):
314-
score = self._cosine_similarity(input_token_vector, vec)
315-
if score <= cutoff:
316-
candidates.append(
317-
FDCIngredientMatch(
318-
fdc=self.fdc_ingredients[data_type][idx],
319-
score=score,
320-
)
308+
for idx, vec in enumerate(self.fdc_vectors):
309+
score = self._cosine_similarity(input_token_vector, vec)
310+
if score <= cutoff:
311+
candidates.append(
312+
FDCIngredientMatch(
313+
fdc=self.fdc_ingredients[idx],
314+
score=score,
321315
)
316+
)
322317

323318
return candidates
324319

0 commit comments

Comments
 (0)