Commit 50c12f3

Drop tqdm in favor of logger; normalize use of tabulate (#35)

* Drop tqdm in favor of logger; normalize use of tabulate. Changes are to accommodate #25, as webtools will need basic logging to handle the websocket logging behavior required for the front-end
* Missed reference
* Preserve logging bypass from existing branch
* Remove flask web package dependencies until PR#25
* Introduce decorator fn for redirecting the log stream, to be used later in the web app/tool. Refer to notes on the other PR at 371a060
* Formatting fixes with pre-commit
1 parent cef3e59 commit 50c12f3

File tree

6 files changed, +162 −43 lines changed

  requirements-dev.txt
  train/__init__.py
  train/featuresearch.py
  train/gridsearch.py
  train/train_model.py
  train/training_utils.py


requirements-dev.txt

Lines changed: 1 addition & 2 deletions

@@ -6,8 +6,7 @@ twine
 pre-commit
 pytest
 coverage
-tqdm
-Flask
+flask
 tabulate
 matplotlib
 -e .

train/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -1,12 +1,13 @@
 from .clean__check_label_consistency import check_label_consistency
 from .featuresearch import feature_search
 from .gridsearch import grid_search
-from .train_model import train_multiple, train_single
+from .train_model import set_redirect_log_stream, train_multiple, train_single

 __all__ = [
     "check_label_consistency",
     "feature_search",
     "grid_search",
+    "set_redirect_log_stream",
     "train_multiple",
     "train_single",
 ]

train/featuresearch.py

Lines changed: 19 additions & 6 deletions

@@ -12,11 +12,11 @@
 import pycrfsuite
 from sklearn.model_selection import train_test_split
 from tabulate import tabulate
-from tqdm import tqdm

 from .train_model import DEFAULT_MODEL_LOCATION
 from .training_utils import (
     DataVectors,
+    convert_num_ordinal,
     evaluate,
     load_datasets,
 )
@@ -211,14 +211,17 @@ def feature_search(args: argparse.Namespace):
     logger.info(f"Grid search over {len(argument_sets)} feature sets.")
     logger.info(f"{args.seed} is the random seed used for the train/test split.")

+    eval_results = []
     with cf.ProcessPoolExecutor(max_workers=args.processes) as executor:
         futures = [
             executor.submit(train_model_feature_search, *a) for a in argument_sets
         ]
-        eval_results = [
-            future.result()
-            for future in tqdm(cf.as_completed(futures), total=len(futures))
-        ]
+        logger.info(
+            f"Queued for separate runs against {len(argument_sets)} feature sets"
+        )
+        for idx, future in enumerate(cf.as_completed(futures)):
+            logger.info(f"{convert_num_ordinal(idx + 1)} set completed")
+            eval_results.append(future.result())

     # Sort with highest sentence accuracy first
     eval_results = sorted(
@@ -248,4 +251,14 @@ def feature_search(args: argparse.Namespace):
         ]
     )

-    print(tabulate(table, headers=headers, tablefmt="simple_outline"))
+    print(
+        "\n"
+        + tabulate(
+            table,
+            headers=headers,
+            tablefmt="fancy_grid",
+            stralign="left",
+            numalign="right",
+        )
+        + "\n"
+    )
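For context, the normalized table output now follows a single pattern across the repo: fancy_grid format with left-aligned strings, right-aligned numbers, and a blank line of padding on each side. A minimal standalone sketch of that pattern (the header names and row values here are made-up placeholders, not results from this repo):

from tabulate import tabulate

headers = ["Feature set", "Sentence accuracy"]  # placeholder headers
table = [["base", "94.01%"], ["base+pos", "95.12%"]]  # placeholder rows

# fancy_grid renders box-drawing borders; stralign/numalign pin column alignment.
print(
    "\n"
    + tabulate(table, headers=headers, tablefmt="fancy_grid", stralign="left", numalign="right")
    + "\n"
)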

train/gridsearch.py

Lines changed: 11 additions & 6 deletions

@@ -13,11 +13,11 @@
 import pycrfsuite
 from sklearn.model_selection import train_test_split
 from tabulate import tabulate
-from tqdm import tqdm

 from .train_model import DEFAULT_MODEL_LOCATION
 from .training_utils import (
     DataVectors,
+    convert_num_ordinal,
     evaluate,
     load_datasets,
 )
@@ -499,12 +499,13 @@ def grid_search(args: argparse.Namespace):
     logger.info(f"Grid search over {len(arguments)} hyperparameters combinations.")
     logger.info(f"{args.seed} is the random seed used for the train/test split.")

+    eval_results = []
     with cf.ProcessPoolExecutor(max_workers=args.processes) as executor:
         futures = [executor.submit(train_model_grid_search, *a) for a in arguments]
-        eval_results = [
-            future.result()
-            for future in tqdm(cf.as_completed(futures), total=len(futures))
-        ]
+        logger.info(f"Queued for separate runs against {len(args.algos)} algorithms")
+        for idx, future in enumerate(cf.as_completed(futures)):
+            logger.info(f"{convert_num_ordinal(idx + 1)} algorithm completed")
+            eval_results.append(future.result())

     # Sort with highest sentence accuracy first
     eval_results = sorted(
@@ -538,10 +539,14 @@
     )

     print(
-        tabulate(
+        "\n"
+        + tabulate(
             table,
             headers=headers,
             tablefmt="fancy_grid",
             maxcolwidths=[None, 130, None, None, None, None],
+            stralign="left",
+            numalign="right",
         )
+        + "\n"
     )
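The tqdm replacement in both search modules shares one shape: submit all jobs, then emit one log line per cf.as_completed() result instead of updating a progress bar. A self-contained sketch of that shape (the work function below is a toy stand-in for train_model_grid_search / train_model_feature_search):

import concurrent.futures as cf
import logging

logging.basicConfig(level=logging.INFO, format="[%(levelname)s] (%(module)s) %(message)s")
logger = logging.getLogger(__name__)


def work(n: int) -> int:
    # Toy stand-in for a training job; must be module-level so it pickles.
    return n * n


if __name__ == "__main__":
    with cf.ProcessPoolExecutor(max_workers=2) as executor:
        futures = [executor.submit(work, n) for n in range(4)]
        logger.info(f"Queued {len(futures)} jobs")
        for idx, future in enumerate(cf.as_completed(futures)):
            # One log line per completed job replaces the tqdm progress bar.
            logger.info(f"Job {idx + 1} of {len(futures)} completed: {future.result()}")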

train/train_model.py

Lines changed: 109 additions & 27 deletions

@@ -7,12 +7,12 @@
 from pathlib import Path
 from random import randint
 from statistics import mean, stdev
-from typing import Generator
+from typing import Generator, TextIO
 from uuid import uuid4

 import pycrfsuite
 from sklearn.model_selection import train_test_split
-from tqdm import tqdm
+from tabulate import tabulate

 from .test_results_to_detailed_results import test_results_to_detailed_results
 from .test_results_to_html import test_results_to_html
@@ -21,6 +21,7 @@
     DataVectors,
     Stats,
     confusion_matrix,
+    convert_num_ordinal,
     evaluate,
     load_datasets,
 )
@@ -29,7 +30,7 @@


 @contextmanager
-def change_log_level(level: int) -> Generator[None]:
+def change_log_level(level: int) -> Generator[None, None, None]:
     """Context manager to temporarily change logging level within the context.

     On exiting the context, the original level is restored.
@@ -50,6 +51,30 @@ def change_log_level(level: int) -> Generator[None]:
         logger.setLevel(original_level)


+@contextmanager
+def set_redirect_log_stream(io_stream: TextIO) -> Generator[None, None, None]:
+    """Context manager to accept an io_stream for logging, used in the web app.
+    Required by the web app, as it bypasses train.py where logging is configured.
+
+    Parameters
+    ----------
+    io_stream : TextIO
+        io.StringIO() stream
+
+    Yields
+    ------
+    Generator[None, None, None]
+        Generator, yielding None
+    """
+    logging.basicConfig(
+        stream=io_stream,
+        level=logging.INFO,
+        format="[%(levelname)s] (%(module)s) %(message)s",
+    )
+
+    yield
+
+
 def train_parser_model(
     vectors: DataVectors,
     split: float,
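A minimal usage sketch for the new context manager, assuming an io.StringIO buffer and an already-built argparse.Namespace called args (a placeholder here, not defined in this snippet). One caveat worth hedging: logging.basicConfig is a no-op when the root logger already has handlers, so the redirect only takes effect in contexts, like the web app, that skip train.py's logging setup:

import io

from train import set_redirect_log_stream, train_single

buffer = io.StringIO()
with set_redirect_log_stream(buffer):
    train_single(args)  # `args` is a placeholder argparse.Namespace

# Captured records can then be relayed elsewhere, e.g. over a websocket (#25):
for line in buffer.getvalue().splitlines():
    print(line)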
@@ -189,6 +214,15 @@ def train_parser_model(
     return stats


+def train_parser_model_bypass_logging(*args) -> Stats:
+    stats = None
+    with change_log_level(
+        logging.WARNING
+    ):  # Temporarily stop logging below WARNING for multi-processing
+        stats = train_parser_model(*args)
+    return stats
+
+
 def train_single(args: argparse.Namespace) -> None:
     """Train CRF model once.

@@ -222,15 +256,31 @@ def train_single(args: argparse.Namespace) -> None:
         combine_name_labels=args.combine_name_labels,
     )

-    print("Sentence-level results:")
-    print(f"\tAccuracy: {100 * stats.sentence.accuracy:.2f}%")
+    headers = ["Sentence-level results", "Word-level results"]
+    table = []
+
+    table.append(
+        [
+            f"Accuracy: {100 * stats.sentence.accuracy:.2f}%",
+            f"Accuracy: {100 * stats.token.accuracy:.2f}%\n"
+            f"Precision (micro) {100 * stats.token.weighted_avg.precision:.2f}%\n"
+            f"Recall (micro) {100 * stats.token.weighted_avg.recall:.2f}%\n"
+            f"F1 score (micro) {100 * stats.token.weighted_avg.f1_score:.2f}%",
+        ]
+    )

-    print()
-    print("Word-level results:")
-    print(f"\tAccuracy {100 * stats.token.accuracy:.2f}%")
-    print(f"\tPrecision (micro) {100 * stats.token.weighted_avg.precision:.2f}%")
-    print(f"\tRecall (micro) {100 * stats.token.weighted_avg.recall:.2f}%")
-    print(f"\tF1 score (micro) {100 * stats.token.weighted_avg.f1_score:.2f}%")
+    print(
+        "\n"
+        + tabulate(
+            table,
+            headers=headers,
+            tablefmt="fancy_grid",
+            maxcolwidths=[None, None],
+            stralign="left",
+            numalign="right",
+        )
+        + "\n"
+    )


 def train_multiple(args: argparse.Namespace) -> None:
@@ -272,13 +322,15 @@ def train_multiple(args: argparse.Namespace) -> None:
         for _ in range(args.runs)
     ]

-    with change_log_level(logging.WARNING):  # Temporarily stop logging below WARNING
-        with cf.ProcessPoolExecutor(max_workers=args.processes) as executor:
-            futures = [executor.submit(train_parser_model, *a) for a in arguments]
-            eval_results = [
-                future.result()
-                for future in tqdm(cf.as_completed(futures), total=len(futures))
-            ]
+    word_accuracies, sentence_accuracies, seeds, eval_results = [], [], [], []
+    with cf.ProcessPoolExecutor(max_workers=args.processes) as executor:
+        futures = [
+            executor.submit(train_parser_model_bypass_logging, *a) for a in arguments
+        ]
+        logger.info(f"Queued for {args.runs} separate runs")
+        for idx, future in enumerate(cf.as_completed(futures)):
+            logger.info(f"{convert_num_ordinal(idx + 1)} run completed")
+            eval_results.append(future.result())

     word_accuracies, sentence_accuracies, seeds = [], [], []
     for result in eval_results:
@@ -288,15 +340,9 @@ def train_multiple(args: argparse.Namespace) -> None:

     sentence_mean = 100 * mean(sentence_accuracies)
     sentence_uncertainty = 3 * 100 * stdev(sentence_accuracies)
-    print()
-    print("Average sentence-level accuracy:")
-    print(f"\t-> {sentence_mean:.2f}% ± {sentence_uncertainty:.2f}%")

     word_mean = 100 * mean(word_accuracies)
     word_uncertainty = 3 * 100 * stdev(word_accuracies)
-    print()
-    print("Average word-level accuracy:")
-    print(f"\t-> {word_mean:.2f}% ± {word_uncertainty:.2f}%")

     index_best = max(
         range(len(sentence_accuracies)), key=sentence_accuracies.__getitem__
@@ -310,6 +356,42 @@ def train_multiple(args: argparse.Namespace) -> None:
     min_sent = 100 * sentence_accuracies[index_worst]
     min_word = 100 * word_accuracies[index_worst]
     min_seed = seeds[index_worst]
-    print()
-    print(f"Best: Sentence {max_sent:.2f}% / Word {max_word:.2f}% (Seed: {max_seed})")
-    print(f"Worst: Sentence {min_sent:.2f}% / Word {min_word:.2f}% (Seed: {min_seed})")
+
+    headers = ["Run", "Word/Token accuracy", "Sentence accuracy", "Seed"]
+
+    table = []
+    for idx, result in enumerate(eval_results):
+        table.append(
+            [
+                convert_num_ordinal(idx + 1),
+                f"{100 * result.token.accuracy:.2f}%",
+                f"{100 * result.sentence.accuracy:.2f}%",
+                f"{result.seed}",
+            ]
+        )
+
+    table.append(["-"] * len(headers))
+    table.append(
+        [
+            "Average",
+            f"{word_mean:.2f}% ± {word_uncertainty:.2f}%",
+            f"{sentence_mean:.2f}% ± {sentence_uncertainty:.2f}%",
+            f"{max_seed}",
+        ]
+    )
+    table.append(["-"] * len(headers))
+    table.append(["Best", f"{max_word:.2f}%", f"{max_sent:.2f}%", f"{max_seed}"])
+    table.append(["Worst", f"{min_word:.2f}%", f"{min_sent:.2f}%", f"{min_seed}"])
+
+    print(
+        "\n"
+        + tabulate(
+            table,
+            headers=headers,
+            tablefmt="fancy_grid",
+            maxcolwidths=[None, None, None, None],
+            stralign="left",
+            numalign="right",
+        )
+        + "\n"
+    )
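Why a wrapper rather than wrapping the executor in change_log_level, as the old code did? The level change has to happen inside each worker process: under the default "spawn" start method the workers re-import the module, so a level set in the parent does not carry over. That rationale is inferred from the diff, not stated in the commit. Stripped of the training specifics, the wrapper reduces to this pattern (logger name "train" is an assumption for illustration):

import logging


def run_quietly(func, *args):
    # Same idea as train_parser_model_bypass_logging: raise the threshold
    # inside the worker, where the change actually applies, then restore it.
    logger = logging.getLogger("train")  # assumed logger name
    original = logger.level
    logger.setLevel(logging.WARNING)  # silence INFO/DEBUG during the run
    try:
        return func(*args)
    finally:
        logger.setLevel(original)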

train/training_utils.py

Lines changed: 20 additions & 1 deletion

@@ -7,7 +7,7 @@
 from dataclasses import dataclass
 from functools import partial
 from itertools import chain, islice
-from typing import Any, Callable, Iterable
+from typing import Any, Callable, Iterable, Union

 from matplotlib import pyplot as plt
 from sklearn.metrics import (
@@ -415,3 +415,22 @@ def confusion_matrix(
     fig.savefig(figure_path)
     logger.info(f"Confusion matrix saved to {figure_path}.")
     plt.close(fig)
+
+
+def convert_num_ordinal(num: Union[int, float, str]) -> str:
+    """Convert a number into its ordinal; fall back to the input if unsuccessful.
+
+    convert_num_ordinal(0) => '0th'
+    convert_num_ordinal(3) => '3rd'
+    convert_num_ordinal(122) => '122nd'
+    convert_num_ordinal(213) => '213th'
+    """
+    try:
+        n = int(num)
+        if 11 <= (n % 100) <= 13:
+            suffix = "th"
+        else:
+            suffix = ["th", "st", "nd", "rd", "th"][min(n % 10, 4)]
+        return str(n) + suffix
+    except (TypeError, ValueError):
+        return str(num)
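For quick reference, the helper's behavior in an interactive session, including the teens edge case and the string fallback (assuming the package is importable as train):

>>> from train.training_utils import convert_num_ordinal
>>> convert_num_ordinal(1)
'1st'
>>> convert_num_ordinal(11)   # 11-13 always take "th"
'11th'
>>> convert_num_ordinal(112)
'112th'
>>> convert_num_ordinal(2.7)  # floats are truncated by int()
'2nd'
>>> convert_num_ordinal("n/a")  # non-numeric input falls back to str(num)
'n/a'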
