
Commit a119c07

Merge pull request 'Glove embeddings' (#210) from glove-embeddings into develop
2 parents 9a1b59a + 106e5ed commit a119c07

16 files changed: +325 −419 lines

.pre-commit-config.yaml

Lines changed: 2 additions & 1 deletion
@@ -5,6 +5,7 @@ repos:
     rev: v5.0.0
     hooks:
       - id: check-added-large-files
+        args: ['--maxkb=5000']
         stages: [pre-commit]
       - id: check-ast
         stages: [pre-commit]
@@ -15,7 +16,7 @@ repos:
       - id: debug-statements
         stages: [pre-commit]
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.9.6
+    rev: v0.11.6
     hooks:
       - id: ruff
         args: ["--fix"]

README.md

Lines changed: 5 additions & 5 deletions
@@ -48,13 +48,13 @@ The model has the following accuracy on a test data set of 20% of the total data
 
 ```
 Sentence-level results:
-    Accuracy: 94.50%
+    Accuracy: 94.56%
 
 Word-level results:
-    Accuracy 97.78%
-    Precision (micro) 97.76%
-    Recall (micro) 97.78%
-    F1 score (micro) 97.76%
+    Accuracy 97.77%
+    Precision (micro) 97.75%
+    Recall (micro) 97.77%
+    F1 score (micro) 97.75%
 ```
 
 ## Development

ingredient_parser/en/ModelCard.en.md

Lines changed: 1 addition & 1 deletion
@@ -124,7 +124,7 @@ The model has the following performance metrics:
 
 | Word level accuracy | Sentence level accuracy |
 | ------------------- | ----------------------- |
-| 97.78 ± 0.18%       | 94.50 ± 0.42%           |
+| 97.77 ± 0.18%       | 94.56 ± 0.44%           |
 
 These metrics were determined by executing 20 training/evaluation cycles and calculating the mean and standard deviation for the two metrics across all cycles. The uncertainty values provided represent the 99.7% confidence bounds (i.e. 3x standard deviation). The uncertainty is due to the randomisation of the selection of training and evaluation data whenever the model is trained.
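The quoted uncertainties are simply three standard deviations over the per-cycle results. A minimal sketch of that calculation, with invented per-cycle accuracies purely for illustration (not the project's actual numbers):

```python
import numpy as np

# Hypothetical sentence-level accuracies from 20 train/evaluate cycles
# (illustrative values only, not the project's actual results).
accuracies = np.array([
    94.3, 94.7, 94.5, 94.6, 94.4, 94.8, 94.5, 94.6, 94.7, 94.5,
    94.6, 94.4, 94.7, 94.5, 94.6, 94.5, 94.8, 94.4, 94.6, 94.5,
])

mean = accuracies.mean()
bound = 3 * accuracies.std()  # 99.7% confidence bounds = 3x standard deviation
print(f"{mean:.2f} ± {bound:.2f}%")
```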

ingredient_parser/en/_embeddings.py

Lines changed: 78 additions & 0 deletions

@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+
+import gzip
+from importlib.resources import as_file, files
+from typing import Any
+
+import numpy as np
+
+
+class GloVeModel:
+    def __init__(self, vec_file: str):
+        self.vec_file = vec_file
+        self._load_vectors_from_file(vec_file)
+
+    def __repr__(self) -> str:
+        return f"GloVeModel(vec_file={self.vec_file})"
+
+    def __str__(self) -> str:
+        return f"GloVeModel(vocab_size={self.vocab_size}, dimensions={self.dimension})"
+
+    def __len__(self) -> int:
+        return self.vocab_size
+
+    def __contains__(self, token: str) -> bool:
+        return token in self.vectors
+
+    def __getitem__(self, token: str) -> np.ndarray:
+        return self.vectors[token]
+
+    def get(self, token: str, default: Any) -> Any:
+        """If token in vector keys, return vector, otherwise return default.
+
+        Parameters
+        ----------
+        token : str
+            Token to return vector for.
+        default : Any
+            Default value if token not in vector keys.
+
+        Returns
+        -------
+        Any
+            Vector, or default value.
+        """
+        if token in self.vectors:
+            return self.vectors[token]
+        else:
+            return default
+
+    def _load_vectors_from_file(self, vec_file: str) -> None:
+        """Load vectors from gzipped txt file in word2vec format.
+
+        The first line of the file contains the header, which is the vocabulary size
+        (i.e. number of vectors) and the dimensions of the vectors.
+
+        All remaining rows contain the token followed by the numeric elements of the
+        vector, separated by a space.
+
+        Parameters
+        ----------
+        vec_file : str
+            File to load vectors from.
+        """
+        vectors = {}
+        with as_file(files(__package__) / vec_file) as p:
+            with gzip.open(p, "rt") as f:
+                # Read first line as header
+                header = f.readline().rstrip()
+                self.vocab_size, self.dimension = map(int, header.split())
+
+                # Read remaining lines and load vectors
+                for line in f:
+                    parts = line.rstrip().split()
+                    token = parts[0]
+                    vector = np.array([float(v) for v in parts[1:]], dtype=np.float32)
+                    vectors[token] = vector
+
+        self.vectors = vectors
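For reference, a minimal sketch of the word2vec-style text format this loader expects, using a standalone re-implementation and a throwaway two-token file (the file name and vector values are invented for illustration; the real model ships as package data and is resolved via importlib.resources):

```python
import gzip

import numpy as np

# Write a tiny gzipped vector file in the format described above:
# a "vocab_size dimension" header, then one token and its elements per line.
with gzip.open("tiny.vec.gz", "wt") as f:
    f.write("2 3\n")
    f.write("onion 0.1 0.2 0.3\n")
    f.write("garlic 0.4 0.5 0.6\n")

# Standalone version of _load_vectors_from_file, minus the package-data lookup.
vectors: dict[str, np.ndarray] = {}
with gzip.open("tiny.vec.gz", "rt") as f:
    vocab_size, dimension = map(int, f.readline().rstrip().split())
    for line in f:
        parts = line.rstrip().split()
        vectors[parts[0]] = np.array([float(v) for v in parts[1:]], dtype=np.float32)

print(vocab_size, dimension)   # 2 3
print("onion" in vectors)      # True (mirrors GloVeModel.__contains__)
print(vectors["garlic"])       # [0.4 0.5 0.6] (mirrors GloVeModel.__getitem__)
```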

ingredient_parser/en/_foundationfoods.py

Lines changed: 23 additions & 21 deletions
@@ -10,6 +10,7 @@
 import numpy as np
 
 from ..dataclasses import FoundationFood
+from ._embeddings import GloVeModel
 from ._loaders import load_embeddings_model
 from ._utils import prepare_embeddings_tokens, tokenize
 
@@ -47,8 +48,8 @@
 # Increasing value indicates decreasing preference.
 PREFERRED_DATATYPES = {
     "foundation_food": 0,  # Most preferred
-    "sr_legacy_food": 1,
-    "survey_fndds_food": 2,
+    "survey_fndds_food": 1,
+    "sr_legacy_food": 2,
 }
 
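The reordering above only swaps the ranks of the two fallback data types; foundation_food stays most preferred. A ranking dict like this is presumably consumed as a sort key; a minimal sketch of that pattern (the candidate list here is invented for illustration):

```python
# Mirrors the mapping above: lower value = more preferred.
PREFERRED_DATATYPES = {
    "foundation_food": 0,
    "survey_fndds_food": 1,
    "sr_legacy_food": 2,
}

# Hypothetical unsorted data types of candidate matches.
data_types = ["sr_legacy_food", "foundation_food", "survey_fndds_food"]

print(sorted(data_types, key=PREFERRED_DATATYPES.get))
# ['foundation_food', 'survey_fndds_food', 'sr_legacy_food']
```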

@@ -118,8 +119,8 @@ class uSIF:
     ----------
     a : float
         'a' parameter.
-    embeddings : floret.floret._floret
-        Floret embeddings model.
+    embeddings : GloVeModel
+        GloVe embeddings model.
     embeddings_dimension : int
         Dimension of embeddings model.
     fdc_ingredients : dict[str, list[FDCIngredient]]
@@ -132,9 +133,9 @@ class uSIF:
         Dictionary of token probabilities.
     """
 
-    def __init__(self, embeddings, fdc_ingredients: list[FDCIngredient]):
+    def __init__(self, embeddings: GloVeModel, fdc_ingredients: list[FDCIngredient]):
         self.embeddings = embeddings
-        self.embeddings_dimension: int = embeddings.get_dimension()
+        self.embeddings_dimension: int = embeddings.dimension
 
         self.fdc_ingredients: list[FDCIngredient] = fdc_ingredients
         self.token_prob: dict[str, float] = self._estimate_token_probability(
@@ -284,7 +285,7 @@ def _cosine_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> float:
         )
 
     def find_candidate_matches(
-        self, tokens: list[str], cutoff: float = 0.3
+        self, tokens: list[str], n: int
     ) -> list[FDCIngredientMatch]:
         """Find best candidate matches between input token and FDC ingredients with a
         cosine similarity of no more than cutoff.
@@ -293,29 +294,29 @@ def find_candidate_matches(
         ----------
         tokens : list[str]
             List of tokens.
-        cutoff : float
-            Maximum allowable score of returned matches.
+        n : int
+            Number of matches to return, sorted by score.
 
         Returns
         -------
         list[FDCIngredientMatch]
-            List of candidate matching FDC ingredient.
+            List of candidate matching FDC ingredients.
         """
         prepared_tokens = prepare_embeddings_tokens(tuple(tokens))
         input_token_vector = self._embed(prepared_tokens)
 
         candidates = []
         for idx, vec in enumerate(self.fdc_vectors):
             score = self._cosine_similarity(input_token_vector, vec)
-            if score <= cutoff:
-                candidates.append(
-                    FDCIngredientMatch(
-                        fdc=self.fdc_ingredients[idx],
-                        score=score,
-                    )
-                )
+            candidates.append(
+                FDCIngredientMatch(
+                    fdc=self.fdc_ingredients[idx],
+                    score=score,
+                )
+            )
 
-        return candidates
+        sorted_candidates = sorted(candidates, key=lambda x: x.score)
+        return sorted_candidates[:n]
 
 
 class FuzzyEmbeddingMatcher:
@@ -332,11 +333,11 @@ class FuzzyEmbeddingMatcher:
 
     Attributes
     ----------
-    embeddings : floret.floret._floret
+    embeddings : GloVeModel
         Floret embeddings model.
     """
 
-    def __init__(self, embeddings):
+    def __init__(self, embeddings: GloVeModel):
         self.embeddings = embeddings
 
     @lru_cache
@@ -536,7 +537,6 @@ def find_best_match(
 
         sorted_matches = sorted(scored, key=lambda x: x.score)
         return self._select_best_match(sorted_matches)
-        # return sorted_matches[0]
 
 
 @lru_cache
@@ -591,12 +591,14 @@ def match_foundation_foods(tokens: list[str]) -> FoundationFood | None:
         Matching foundation food, or None if no match can be found.
     """
     prepared_tokens = prepare_embeddings_tokens(tuple(tokens))
+    if not prepared_tokens:
+        return None
 
     if tuple(prepared_tokens) in FOUNDATION_FOOD_OVERRIDES:
         return FOUNDATION_FOOD_OVERRIDES[tuple(prepared_tokens)]
 
     u = get_usif_matcher()
-    candidate_matches = u.find_candidate_matches(prepared_tokens)
+    candidate_matches = u.find_candidate_matches(prepared_tokens, n=50)
     if not candidate_matches:
         return None
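The substantive change in find_candidate_matches is the move from an absolute score cutoff to keeping the n best-scoring candidates, with n=50 at the call site above. Since candidates are sorted ascending, a lower score means a better match here. A self-contained sketch contrasting the two strategies, with invented names and scores:

```python
from dataclasses import dataclass


@dataclass
class Match:
    name: str
    score: float  # lower is better, matching the ascending sort above


# Invented candidate scores for illustration.
candidates = [
    Match("onion", 0.12),
    Match("shallot", 0.28),
    Match("leek", 0.41),
]

# Old strategy: keep anything scoring at or below an absolute cutoff.
# Returns nothing at all if no candidate clears the threshold.
cutoff = 0.3
by_cutoff = [c for c in candidates if c.score <= cutoff]

# New strategy: always keep the n best-scoring candidates.
n = 2
by_top_n = sorted(candidates, key=lambda c: c.score)[:n]

print([c.name for c in by_cutoff])  # ['onion', 'shallot']
print([c.name for c in by_top_n])   # ['onion', 'shallot']
```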

ingredient_parser/en/_loaders.py

Lines changed: 6 additions & 6 deletions
@@ -3,9 +3,10 @@
 from functools import lru_cache
 from importlib.resources import as_file, files
 
-import floret
 import pycrfsuite
 
+from ._embeddings import GloVeModel
+
 
 @lru_cache
 def load_parser_model() -> pycrfsuite.Tagger:  # type: ignore
@@ -26,16 +27,15 @@ def load_parser_model() -> pycrfsuite.Tagger:  # type: ignore
 
 
 @lru_cache
-def load_embeddings_model() -> floret.floret._floret:  # type: ignore
+def load_embeddings_model() -> GloVeModel:  # type: ignore
     """Load embeddings model.
 
     This function is cached so that when the model has been loaded once, it does not
     need to be loaded again; the cached model is returned.
 
     Returns
     -------
-    floret.floret._floret
-        Embeddigns model.
+    GloVeModel
+        Embeddings model.
     """
-    with as_file(files(__package__) / "ingredient_embeddings.25d.floret.bin") as p:
-        return floret.load_model(str(p))
+    return GloVeModel("ingredient_embeddings.25d.glove.txt.gz")
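The @lru_cache on the zero-argument loader is what backs the "loaded once" promise in the docstring: every call after the first returns the cached object. A minimal sketch of the pattern, with a stand-in loader body rather than the real model load:

```python
from functools import lru_cache


@lru_cache
def load_model() -> dict:
    # Stand-in for an expensive model load; this body runs only once.
    print("loading model...")
    return {"dimension": 25}


first = load_model()    # prints "loading model..."
second = load_model()   # returns the cached result; nothing is printed
assert first is second  # the very same object, not a copy
```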

ingredient_parser/en/_utils.py

Lines changed: 6 additions & 1 deletion
@@ -9,6 +9,8 @@
 import nltk.stem.porter as nsp
 import pint
 
+from ingredient_parser.en._loaders import load_embeddings_model
+
 from .._common import UREG, consume, download_nltk_resources, is_float, is_range
 from ..dataclasses import IngredientAmount
 from ._constants import (
@@ -553,10 +555,13 @@ def prepare_embeddings_tokens(tokens: tuple[str, ...]) -> list[str]:
     list[str]
         Prepared tokens.
     """
+    embeddings = load_embeddings_model()
+
     return [
         stem(token.lower())
         for token in tokens
-        if not token.isnumeric()
+        if stem(token.lower()) in embeddings
+        and not token.isnumeric()
         and not token.isdigit()
         and not token.isdecimal()
         and not token.isspace()
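With the new membership check, prepare_embeddings_tokens drops any token whose stem is out of the embeddings vocabulary, rather than only filtering numeric and whitespace tokens. A standalone sketch of the filter, using a toy stemmer and vocabulary as stand-ins for the Porter stemmer and the loaded GloVeModel:

```python
def stem(token: str) -> str:
    # Crude stand-in for the Porter stemmer used in _utils.py.
    return token.rstrip("s")


# Toy vocabulary standing in for GloVeModel.__contains__.
vocabulary = {"onion", "chop"}

tokens = ("Chopped", "onions", "2", " ")
prepared = [
    stem(token.lower())
    for token in tokens
    if stem(token.lower()) in vocabulary
    and not token.isnumeric()
    and not token.isspace()
]
print(prepared)  # ['onion'] - 'chopped' is out-of-vocabulary, '2' is numeric
```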
Three binary files changed (contents not shown): −13.7 MB, 3.49 MB, −11.6 KB
