Skip to content

Commit 693201e

Browse files
committed
Feature: Initial work to make use of embedding with bigrams.
1 parent ab33061 commit 693201e

File tree

5 files changed

+75
-2
lines changed

5 files changed

+75
-2
lines changed

ingredient_parser/en/_foundationfoods.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,12 @@ def load_fdc_ingredients() -> list[FDCIngredient]:
181181
for row in reader:
182182
tokens = tuple(tokenize(row["description"]))
183183
prepared_tokens = prepare_embeddings_tokens(tokens)
184+
if not prepared_tokens:
185+
logger.debug(
186+
f"'{row['description']}' has no tokens in embedding vocabulary."
187+
)
188+
continue
189+
184190
foundation_foods.append(
185191
FDCIngredient(
186192
fdc_id=int(row["fdc_id"]),

ingredient_parser/en/_loaders.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
#!/usr/bin/env python3
22

3+
import csv
4+
import gzip
35
import logging
46
from functools import lru_cache
57
from importlib.resources import as_file, files
@@ -44,3 +46,25 @@ def load_embeddings_model() -> GloVeModel: # type: ignore
4446
"""
4547
logger.debug("Loading embeddings model: ingredient_embeddings.25d.glove.txt.gz")
4648
return GloVeModel("data/ingredient_embeddings.25d.glove.txt.gz")
49+
50+
51+
@lru_cache
def load_embeddings_bigrams() -> set[tuple[str, str]]:
    """Load embeddings bigrams from csv file.

    The bigrams are stored in pairs in a gzip-compressed csv file, one pair per
    row. The result is cached by ``lru_cache``, so the file is only read and parsed
    on the first call.

    Returns
    -------
    set[tuple[str, str]]
        Set of bigram tuples.
    """
    logger.debug("Loading embeddings bigrams: bigrams.csv.gz")
    bigrams: set[tuple[str, str]] = set()
    with as_file(files(__package__) / "data/bigrams.csv.gz") as p:
        # Decode with an explicit encoding so the loaded tokens do not depend on the
        # platform's locale (assumes the data file is UTF-8 - TODO confirm against
        # the script that generates it). newline="" lets the csv module handle line
        # endings itself, as its documentation recommends.
        with gzip.open(p, "rt", encoding="utf-8", newline="") as f:
            for row in csv.reader(f):
                # Blank lines are read as [] and malformed rows may have a different
                # number of fields; neither can ever match a two-token lookup, so
                # only keep well-formed pairs.
                if len(row) == 2:
                    bigrams.add((row[0], row[1]))

    return bigrams

ingredient_parser/en/_utils.py

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import nltk.stem.porter as nsp
1010
import pint
1111

12-
from ingredient_parser.en._loaders import load_embeddings_model
12+
from ingredient_parser.en._loaders import load_embeddings_bigrams, load_embeddings_model
1313

1414
from .._common import UREG, consume, download_nltk_resources, is_float, is_range
1515
from ..dataclasses import IngredientAmount
@@ -557,7 +557,7 @@ def prepare_embeddings_tokens(tokens: tuple[str, ...]) -> list[str]:
557557
"""
558558
embeddings = load_embeddings_model()
559559

560-
return [
560+
prepared_tokens = [
561561
stem(token.lower())
562562
for token in tokens
563563
if stem(token.lower()) in embeddings
@@ -569,3 +569,46 @@ def prepare_embeddings_tokens(tokens: tuple[str, ...]) -> list[str]:
569569
and token not in STOP_WORDS
570570
and len(token) > 1
571571
]
572+
return join_bigrams(prepared_tokens)
573+
574+
575+
def join_bigrams(tokens: list[str]) -> list[str]:
    """Insert underscore-joined bigrams into a list of tokens.

    Every input token is kept, in order. Whenever a token and the token that
    follows it form a pair found in the embeddings bigrams set, the joined form
    "<first>_<second>" is inserted directly after the first token of the pair.
    The second token of a matched pair is not used to start another bigram, so
    matched pairs never overlap.

    The tokens passed in should already have been stemmed, with stop words,
    numeric tokens, punctuation and single character tokens removed.
    They should only be nouns, verbs, adjectives, adverbs or foreign words.

    Parameters
    ----------
    tokens : list[str]
        List of tokens.

    Returns
    -------
    list[str]
        List of tokens, with bigrams joined by underscore

    Examples
    --------
    Assuming ("confectioners", "sugar") is one of the loaded bigrams:

    >>> join_bigrams(["cup", "confectioners", "sugar"])
    ['cup', 'confectioners', 'confectioners_sugar', 'sugar']
    """
    known_pairs = load_embeddings_bigrams()

    result = []
    final_index = len(tokens) - 1
    # True when the current token is the second half of a pair matched on the
    # previous iteration; such a token is emitted but may not begin a new pair.
    closes_pair = False
    for position, current in enumerate(tokens):
        result.append(current)

        if closes_pair:
            closes_pair = False
            continue

        # The last token has no successor to pair with.
        if position >= final_index:
            continue

        pair = (current, tokens[position + 1])
        if pair in known_pairs:
            result.append("_".join(pair))
            closes_pair = True

    return result
42.6 KB
Binary file not shown.
1.11 MB
Binary file not shown.

0 commit comments

Comments
 (0)