
Commit e56614d

Merge pull request '2.1.1' (#216) from develop into master

2 parents: ec619bb + 63e6530

File tree: 10 files changed (+117 −287 lines)

CHANGELOG.md

Lines changed: 5 additions & 1 deletion

@@ -1,6 +1,10 @@
 # Changelog
 
-## 2.10
+## 2.1.1
+
+* Pin Pint version to 0.24.4, as future versions intend to drop support for Python 3.10.
+
+## 2.1.0
 
 > [!WARNING]
 >

ingredient_parser/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -9,4 +9,4 @@
     "show_model_card",
 ]
 
-__version__ = "2.1.0"
+__version__ = "2.1.1"

ingredient_parser/en/_embeddings.py

Lines changed: 52 additions & 4 deletions

@@ -8,9 +8,22 @@
 
 
 class GloVeModel:
+    """Class to interact with GloVe embeddings.
+
+    Attributes
+    ----------
+    binarized_vectors : dict[str, list[str]]
+        Dict of word: binarized_vector pairs.
+    vec_file : str
+        Path to GloVe embeddings file.
+    vectors : dict[str, np.ndarray]
+        Dict of word: vector pairs.
+    """
+
     def __init__(self, vec_file: str):
         self.vec_file = vec_file
         self._load_vectors_from_file(vec_file)
+        self._binarize_vectors()
 
     def __repr__(self) -> str:
         return f"GloVeModel(vec_file={self.vec_file})"

@@ -51,7 +64,7 @@ def _load_vectors_from_file(self, vec_file: str) -> None:
         """Load vectors from gzipped txt file in word2vec format.
 
         The first line of the file contains the header which is the vocabulary size
-        (i.e. number of vectors) and the dimenisions of the vectors.
+        (i.e. number of vectors) and the dimensions of the vectors.
 
         All remaining rows contain the token followed by the numeric elements of the
         vector, separated by a space

@@ -61,7 +74,7 @@ def _load_vectors_from_file(self, vec_file: str) -> None:
         vec_file : str
             File to load vectors from.
         """
-        vectors = {}
+        self.vectors = {}
         with as_file(files(__package__) / vec_file) as p:
             with gzip.open(p, "rt") as f:
                 # Read first line as header

@@ -73,6 +86,41 @@ def _load_vectors_from_file(self, vec_file: str) -> None:
                     parts = line.rstrip().split()
                     token = parts[0]
                     vector = np.array([float(v) for v in parts[1:]], dtype=np.float32)
-                    vectors[token] = vector
+                    self.vectors[token] = vector
+
+    def _binarize_vectors(self):
+        """Binarize word vectors by converting continuous values into discrete values.
+
+        For each word vector, calculate the average value of the positive elements and
+        the negative elements. Replace each element of each word vector according to:
+            if value < negative_average:
+                "NEG"
+            elif value > positive_average
+                "POS"
+            else
+                "0"
 
-        self.vectors = vectors
+        The resulting word vectors are stored in the binarized_vectors attribute.
+
+        References
+        ----------
+        J. Guo, W. Che, H. Wang, and T. Liu, ‘Revisiting Embedding Features for Simple
+        Semi-supervised Learning’, in Proceedings of the 2014 Conference on Empirical
+        Methods in Natural Language Processing (EMNLP), Doha, Qatar: Association for
+        Computational Linguistics, 2014, pp. 110–120. doi: 10.3115/v1/D14-1012.
+        """
+        self.binarized_vectors = {}
+        for word, vec in self.vectors.items():
+            positive_avg = np.mean(vec[vec > 0])
+            negative_avg = np.mean(vec[vec < 0])
+
+            binarised_vec = []
+            for value in vec:
+                if value < negative_avg:
+                    binarised_vec.append("VNEG")
+                elif value > positive_avg:
+                    binarised_vec.append("VPOS")
+                else:
+                    binarised_vec.append("V0")
+
+            self.binarized_vectors[word] = binarised_vec
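The new `_binarize_vectors` method thresholds each element of a vector against that vector's own positive and negative means. A minimal sketch of the same scheme applied to a toy vector, outside the class (the values are made up for illustration):

```python
import numpy as np

# Toy 4-dimensional "word vector"; values chosen only for illustration.
vec = np.array([0.9, 0.1, -0.05, -0.8], dtype=np.float32)

positive_avg = np.mean(vec[vec > 0])  # mean of {0.9, 0.1} ≈ 0.5
negative_avg = np.mean(vec[vec < 0])  # mean of {-0.05, -0.8} ≈ -0.425

# Same thresholding as _binarize_vectors: below the negative mean -> "VNEG",
# above the positive mean -> "VPOS", everything in between -> "V0".
binarised = [
    "VNEG" if v < negative_avg else "VPOS" if v > positive_avg else "V0"
    for v in vec
]
print(binarised)  # ['VPOS', 'V0', 'V0', 'VNEG']
```

Elements exactly equal to either mean fall into the "V0" bucket, matching the strict inequalities in the diff.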

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -29,7 +29,7 @@ classifiers = [
 dependencies = [
     "nltk>=3.9.1",
     "python-crfsuite",
-    "pint>=0.24.4",
+    "pint==0.24.4",
     "numpy",
 ]

requirements.txt

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
 nltk>=3.9.1
 python-crfsuite
-pint>=0.24.4
+pint==0.24.4
 floret
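Both dependency files now pin Pint to an exact version instead of a minimum, matching the changelog entry. A quick, illustrative check (not part of the repository) that an installed environment honours the pin:

```python
from importlib.metadata import version

# The exact pin guards against future Pint releases dropping Python 3.10.
assert version("pint") == "0.24.4"
```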

train.py

Lines changed: 0 additions & 24 deletions

@@ -85,12 +85,6 @@ def __call__(self, parser, namespace, values, option_strings):
         action="store_true",
         help="Plot confusion matrix of token labels.",
     )
-    train_parser.add_argument(
-        "--model",
-        choices=["parser", "foundationfoods"],
-        required=True,
-        help="Specify which model to train.",
-    )
 
     multiple_parser_help = "Average CRF performance across multiple training cycles."
     multiple_parser = subparsers.add_parser("multiple", help=multiple_parser_help)

@@ -155,12 +149,6 @@ def __call__(self, parser, namespace, values, option_strings):
         type=int,
         help="Number of processes to spawn. Default to number of cpu cores.",
     )
-    multiple_parser.add_argument(
-        "--model",
-        choices=["parser", "foundationfoods"],
-        required=True,
-        help="Specify which model to train.",
-    )
 
     gridsearch_parser_help = (
         "Grid search over all combinations of model hyperparameters."

@@ -267,12 +255,6 @@ def __call__(self, parser, namespace, values, option_strings):
         action=ParseJsonArg,
         default=dict(),
     )
-    gridsearch_parser.add_argument(
-        "--model",
-        choices=["parser", "foundationfoods"],
-        required=True,
-        help="Specify which model to train.",
-    )
 
     featuresearch_parser_help = "Grid search over all sets of model features."
     featuresearch_parser = subparsers.add_parser(

@@ -329,12 +311,6 @@ def __call__(self, parser, namespace, values, option_strings):
         type=int,
         help="Seed value used for train/test split.",
     )
-    featuresearch_parser.add_argument(
-        "--model",
-        choices=["parser", "foundationfoods"],
-        required=True,
-        help="Specify which model to train.",
-    )
 
     utility_help = "Utilities to aid cleaning training data."
     utility_parser = subparsers.add_parser("utility", help=utility_help)
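With these deletions, none of the four subcommands accepts a `--model` flag any more. A minimal sketch of the surviving argparse layout; only the `action`/`help` values and subcommand names appear in the diff, so the flag name `--confusion` and the parser description are assumptions:

```python
import argparse

parser = argparse.ArgumentParser(description="Training utilities.")  # description assumed
subparsers = parser.add_subparsers(dest="command")

train_parser = subparsers.add_parser("train", help="Train a single model.")
train_parser.add_argument(
    "--confusion",  # hypothetical flag name; only its action/help are in the diff
    action="store_true",
    help="Plot confusion matrix of token labels.",
)

multiple_parser_help = "Average CRF performance across multiple training cycles."
multiple_parser = subparsers.add_parser("multiple", help=multiple_parser_help)

args = parser.parse_args()
```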

train/featuresearch.py

Lines changed: 11 additions & 12 deletions

@@ -13,7 +13,7 @@
 from tabulate import tabulate
 from tqdm import tqdm
 
-from .train_model import ModelType, get_model_type
+from .train_model import DEFAULT_MODEL_LOCATION
 from .training_utils import (
     DataVectors,
     evaluate,

@@ -71,7 +71,6 @@ def train_model_feature_search(
     save_model: str,
     seed: int,
     keep_model: bool,
-    model_type: ModelType,
 ) -> dict:
     """Train model using selected features returning model performance statistics,
     model parameters and elapsed training time.

@@ -91,8 +90,6 @@ def train_model_feature_search(
         testing sets.
     keep_model : bool
         If True, keep model after evaluation, otherwise delete it.
-    model_type : ModelType
-        Type of model gridsearch is being performed on.
 
     Returns
     -------

@@ -140,8 +137,8 @@ def train_model_feature_search(
         "feature.minfreq": 0,
         "feature.possible_states": True,
         "feature.possible_transitions": True,
-        "c1": 0.25,
-        "c2": 0.75,
+        "c1": 0.6,
+        "c2": 0.5,
         "max_linesearch": 5,
         "num_memories": 3,
         "period": 10,

@@ -157,7 +154,7 @@ def train_model_feature_search(
     tagger = pycrfsuite.Tagger()  # type: ignore
     tagger.open(str(save_model_path))
     labels_pred = [tagger.tag(X) for X in features_test]
-    stats = evaluate(labels_pred, truth_test, seed, model_type)
+    stats = evaluate(labels_pred, truth_test, seed)
 
     if not keep_model:
         save_model_path.unlink(missing_ok=True)

@@ -179,20 +176,22 @@ def feature_search(args: argparse.Namespace):
     args : argparse.Namespace
         Feature search configuration
     """
-    vectors = load_datasets(
-        args.database, args.table, args.datasets, get_model_type(args.model)
-    )
+    vectors = load_datasets(args.database, args.table, args.datasets)
+
+    if args.save_model is None:
+        save_model = DEFAULT_MODEL_LOCATION
+    else:
+        save_model = args.save_model
 
     argument_sets = []
     for feature_set in DISCARDED_FEATURES.keys():
         arguments = [
             feature_set,
             vectors,
             args.split,
-            args.save_model,
+            save_model,
             args.seed,
             args.keep_models,
-            get_model_type(args.model),
         ]
         argument_sets.append(arguments)
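The `c1`/`c2` change above adjusts the L1 and L2 regularisation weights used for the fixed-hyperparameter training runs during feature search. A minimal sketch of how this parameter block would be handed to python-crfsuite (the data wiring and model path are assumptions):

```python
import pycrfsuite

trainer = pycrfsuite.Trainer(verbose=False)  # lbfgs is the default algorithm
trainer.set_params(
    {
        "feature.minfreq": 0,
        "feature.possible_states": True,
        "feature.possible_transitions": True,
        "c1": 0.6,  # L1 regularisation weight, was 0.25
        "c2": 0.5,  # L2 regularisation weight, was 0.75
        "max_linesearch": 5,
        "num_memories": 3,
        "period": 10,
    }
)

# Hypothetical wiring: append (features, labels) sequences, then train.
# for X, y in zip(features_train, truth_train):
#     trainer.append(X, y)
# trainer.train("model.crfsuite")
```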

train/gridsearch.py

Lines changed: 4 additions & 10 deletions

@@ -14,7 +14,7 @@
 from tabulate import tabulate
 from tqdm import tqdm
 
-from .train_model import DEFAULT_MODEL_LOCATION, ModelType, get_model_type
+from .train_model import DEFAULT_MODEL_LOCATION
 from .training_utils import (
     DataVectors,
     evaluate,

@@ -318,9 +318,7 @@ def generate_argument_sets(args: argparse.Namespace) -> list[list]:
         list of lists, where each sublist is the arguments for training a model with
         one of the combinations of algorithms and parameters
     """
-    vectors = load_datasets(
-        args.database, args.table, args.datasets, get_model_type(args.model)
-    )
+    vectors = load_datasets(args.database, args.table, args.datasets)
 
     # Generate list of arguments for all combinations parameters for each algorithm
     argument_sets = []

@@ -345,7 +343,7 @@ def generate_argument_sets(args: argparse.Namespace) -> list[list]:
         params = params | args.global_params
 
         if args.save_model is None:
-            save_model = DEFAULT_MODEL_LOCATION[args.model]
+            save_model = DEFAULT_MODEL_LOCATION
         else:
             save_model = args.save_model
 

@@ -359,7 +357,6 @@ def generate_argument_sets(args: argparse.Namespace) -> list[list]:
             save_model,
             args.seed,
             args.keep_models,
-            get_model_type(args.model),
         ]
         argument_sets.append(arguments)
 

@@ -374,7 +371,6 @@ def train_model_grid_search(
     save_model: str,
     seed: int,
     keep_model: bool,
-    model_type: ModelType,
 ) -> dict:
     """Train model using given training algorithm and parameters,
     returning model performance statistics, model parameters and elapsed training time.

@@ -396,8 +392,6 @@ def train_model_grid_search(
         testing sets.
     keep_model : bool
         If True, keep model after evaluation, otherwise delete it.
-    model_type : ModelType
-        Type of model gridsearch is being performed on.
 
     Returns
     -------

@@ -446,7 +440,7 @@ def train_model_grid_search(
     tagger = pycrfsuite.Tagger()  # type: ignore
     tagger.open(str(save_model_path))
     labels_pred = [tagger.tag(X) for X in features_test]
-    stats = evaluate(labels_pred, truth_test, seed, model_type)
+    stats = evaluate(labels_pred, truth_test, seed)
 
     if not keep_model:
         save_model_path.unlink(missing_ok=True)
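Both search scripts now call `evaluate()` without the `model_type` argument. A minimal standalone sketch of that shared evaluation step (names other than those visible in the diff are assumptions):

```python
import pycrfsuite


def tag_and_evaluate(model_path, features_test, truth_test, seed, evaluate):
    """Open a saved CRF model, tag the held-out features, and score the output.

    Mirrors the post-commit evaluation step in featuresearch.py and
    gridsearch.py: evaluate() no longer receives a model_type argument.
    """
    tagger = pycrfsuite.Tagger()
    tagger.open(str(model_path))
    labels_pred = [tagger.tag(X) for X in features_test]
    return evaluate(labels_pred, truth_test, seed)
```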
