Skip to content

Commit dd07051

Browse files
committed
Refactor: Replace print statements during training with logging
1 parent 63e6530 commit dd07051

File tree

7 files changed

+68
-15
lines changed

7 files changed

+68
-15
lines changed

ingredient_parser/en/preprocess.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1083,7 +1083,7 @@ def _token_features(self, token: Token) -> dict[str, str | bool | int | float]:
10831083

10841084
return features
10851085

1086-
def sentence_features(self) -> list[dict[str, str | bool | int | float]]:
1086+
def sentence_features(self) -> list[dict[str, str | bool]]:
10871087
"""Return features for all tokens in sentence.
10881088
10891089
Returns

train.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22

33
import argparse
44
import json
5+
import logging
56
import os
7+
import sys
68
from random import randint
79

810
from train import (
@@ -24,6 +26,12 @@ def __call__(self, parser, namespace, values, option_strings):
2426
setattr(namespace, self.dest, json.loads(values))
2527

2628

29+
LOGGING_LEVEL = {
30+
0: logging.INFO,
31+
1: logging.DEBUG,
32+
}
33+
34+
2735
if __name__ == "__main__":
2836
parser = argparse.ArgumentParser(
2937
description="Train a CRF model to parse label token from recipe \
@@ -85,6 +93,13 @@ def __call__(self, parser, namespace, values, option_strings):
8593
action="store_true",
8694
help="Plot confusion matrix of token labels.",
8795
)
96+
train_parser.add_argument(
97+
"-v",
98+
help="Enable verbose output.",
99+
action="count",
100+
default=0,
101+
dest="verbose",
102+
)
88103

89104
multiple_parser_help = "Average CRF performance across multiple training cycles."
90105
multiple_parser = subparsers.add_parser("multiple", help=multiple_parser_help)
@@ -149,6 +164,13 @@ def __call__(self, parser, namespace, values, option_strings):
149164
type=int,
150165
help="Number of processes to spawn. Default to number of cpu cores.",
151166
)
167+
multiple_parser.add_argument(
168+
"-v",
169+
help="Enable verbose output.",
170+
action="count",
171+
default=0,
172+
dest="verbose",
173+
)
152174

153175
gridsearch_parser_help = (
154176
"Grid search over all combinations of model hyperparameters."
@@ -255,6 +277,13 @@ def __call__(self, parser, namespace, values, option_strings):
255277
action=ParseJsonArg,
256278
default=dict(),
257279
)
280+
gridsearch_parser.add_argument(
281+
"-v",
282+
help="Enable verbose output.",
283+
action="count",
284+
default=0,
285+
dest="verbose",
286+
)
258287

259288
featuresearch_parser_help = "Grid search over all sets of model features."
260289
featuresearch_parser = subparsers.add_parser(
@@ -311,6 +340,13 @@ def __call__(self, parser, namespace, values, option_strings):
311340
type=int,
312341
help="Seed value used for train/test split.",
313342
)
343+
featuresearch_parser.add_argument(
344+
"-v",
345+
help="Enable verbose output.",
346+
action="count",
347+
default=0,
348+
dest="verbose",
349+
)
314350

315351
utility_help = "Utilities to aid cleaning training data."
316352
utility_parser = subparsers.add_parser("utility", help=utility_help)
@@ -343,6 +379,12 @@ def __call__(self, parser, namespace, values, option_strings):
343379

344380
args = parser.parse_args()
345381

382+
logging.basicConfig(
383+
stream=sys.stdout,
384+
level=LOGGING_LEVEL[args.verbose],
385+
format="[%(levelname)s] (%(module)s) %(message)s",
386+
)
387+
346388
if args.command == "train":
347389
train_single(args)
348390
elif args.command == "multiple":

train/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
"check_label_consistency",
88
"feature_search",
99
"grid_search",
10-
"train_embeddings",
1110
"train_multiple",
1211
"train_single",
1312
]

train/featuresearch.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import argparse
44
import concurrent.futures as cf
5+
import logging
56
import os
67
import time
78
from datetime import timedelta
@@ -20,6 +21,8 @@
2021
load_datasets,
2122
)
2223

24+
logger = logging.getLogger(__name__)
25+
2326
DISCARDED_FEATURES = {
2427
0: [],
2528
1: [
@@ -195,8 +198,8 @@ def feature_search(args: argparse.Namespace):
195198
]
196199
argument_sets.append(arguments)
197200

198-
print(f"[INFO] Grid search over {len(argument_sets)} feature sets.")
199-
print(f"[INFO] {args.seed} is the random seed used for the train/test split.")
201+
logger.info(f"Grid search over {len(argument_sets)} feature sets.")
202+
logger.info(f"{args.seed} is the random seed used for the train/test split.")
200203

201204
eval_results = []
202205
with cf.ProcessPoolExecutor(max_workers=args.processes) as executor:

train/gridsearch.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import argparse
44
import concurrent.futures as cf
5+
import logging
56
import os
67
import time
78
from datetime import timedelta
@@ -21,6 +22,8 @@
2122
load_datasets,
2223
)
2324

25+
logger = logging.getLogger(__name__)
26+
2427
# Valid parameter options for LBFGS training algorithm and expected types
2528
VALID_LBFGS_PARAMS = {
2629
"c1": (float, int),
@@ -483,8 +486,8 @@ def grid_search(args: argparse.Namespace):
483486

484487
arguments = generate_argument_sets(args)
485488

486-
print(f"[INFO] Grid search over {len(arguments)} hyperparameters combinations.")
487-
print(f"[INFO] {args.seed} is the random seed used for the train/test split.")
489+
logger.info(f"Grid search over {len(arguments)} hyperparameters combinations.")
490+
logger.info(f"{args.seed} is the random seed used for the train/test split.")
488491

489492
eval_results = []
490493
with cf.ProcessPoolExecutor(max_workers=args.processes) as executor:

train/train_model.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import argparse
44
import concurrent.futures as cf
55
import contextlib
6+
import logging
67
from pathlib import Path
78
from random import randint
89
from statistics import mean, stdev
@@ -23,6 +24,8 @@
2324
load_datasets,
2425
)
2526

27+
logger = logging.getLogger(__name__)
28+
2629

2730
def train_parser_model(
2831
vectors: DataVectors,
@@ -69,7 +72,7 @@ def train_parser_model(
6972
if seed is None:
7073
seed = randint(0, 1_000_000_000)
7174

72-
print(f"[INFO] {seed} is the random seed used for the train/test split.")
75+
logger.info(f"{seed} is the random seed used for the train/test split.")
7376

7477
# Split data into train and test sets
7578
# The stratify argument means that each dataset is represented proportionally
@@ -96,10 +99,10 @@ def train_parser_model(
9699
stratify=vectors.source,
97100
random_state=seed,
98101
)
99-
print(f"[INFO] {len(features_train):,} training vectors.")
100-
print(f"[INFO] {len(features_test):,} testing vectors.")
102+
logger.info(f"{len(features_train):,} training vectors.")
103+
logger.info(f"{len(features_test):,} testing vectors.")
101104

102-
print("[INFO] Training model with training data.")
105+
logger.info("Training model with training data.")
103106
trainer = pycrfsuite.Trainer(verbose=False) # type: ignore
104107
trainer.set_params(
105108
{
@@ -117,7 +120,7 @@ def train_parser_model(
117120
trainer.append(X, y)
118121
trainer.train(str(save_model))
119122

120-
print("[INFO] Evaluating model with test data.")
123+
logger.info("Evaluating model with test data.")
121124
tagger = pycrfsuite.Tagger() # type: ignore
122125
tagger.open(str(save_model))
123126

train/training_utils.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import concurrent.futures as cf
44
import json
5+
import logging
56
import sqlite3
67
from dataclasses import dataclass
78
from functools import partial
@@ -17,6 +18,8 @@
1718

1819
from ingredient_parser import SUPPORTED_LANGUAGES
1920

21+
logger = logging.getLogger(__name__)
22+
2023
sqlite3.register_converter("json", json.loads)
2124

2225
DEFAULT_MODEL_LOCATION = "ingredient_parser/en/model.en.crfsuite"
@@ -185,7 +188,7 @@ def load_datasets(
185188
"""
186189
PreProcessor = select_preprocessor(table)
187190

188-
print("[INFO] Loading and transforming training data.")
191+
logger.info("Loading and transforming training data.")
189192

190193
n = len(datasets)
191194
with sqlite3.connect(database, detect_types=sqlite3.PARSE_DECLTYPES) as conn:
@@ -225,8 +228,8 @@ def load_datasets(
225228
discarded=sum(v.discarded for v in vectors),
226229
)
227230

228-
print(f"[INFO] {len(all_vectors.sentences):,} usable vectors.")
229-
print(f"[INFO] {all_vectors.discarded:,} discarded due to OTHER labels.")
231+
logger.info(f"{len(all_vectors.sentences):,} usable vectors.")
232+
logger.info(f"{all_vectors.discarded:,} discarded due to OTHER labels.")
230233
return all_vectors
231234

232235

@@ -378,5 +381,5 @@ def confusion_matrix(
378381
ax.tick_params(axis="x", labelrotation=45)
379382
fig.tight_layout()
380383
fig.savefig(figure_path)
381-
print(f"[INFO] Confusion matrix saved to {figure_path}")
384+
logger.info(f"Confusion matrix saved to {figure_path}.")
382385
plt.close(fig)

0 commit comments

Comments
 (0)