
Commit 58ff348

Feature: Add training option to combine all NAME_* labels into a single NAME label
1 parent 16a7e94 commit 58ff348

5 files changed: +105 additions, −16 deletions
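The new --combine-name-labels option collapses every label whose name contains "NAME" into the single label NAME before the training vectors are built and before evaluation. A minimal standalone sketch of that rule (the example labels are illustrative, not taken from the training data):

def combine_name_labels(labels: list[str]) -> list[str]:
    # Mirror of the branch added to process_sentences() in this commit:
    # any label containing "NAME" is replaced with the single label "NAME".
    return ["NAME" if "NAME" in label else label for label in labels]

print(combine_name_labels(["NAME_VARIANT", "QTY", "UNIT", "COMMENT"]))
# -> ['NAME', 'QTY', 'UNIT', 'COMMENT']  (NAME_VARIANT is a made-up example label)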

train.py

Lines changed: 20 additions & 0 deletions

@@ -93,6 +93,11 @@ def __call__(self, parser, namespace, values, option_strings):
         action="store_true",
         help="Plot confusion matrix of token labels.",
     )
+    train_parser.add_argument(
+        "--combine-name-labels",
+        action="store_true",
+        help="Combine labels containing 'NAME' into a single NAME label.",
+    )
     train_parser.add_argument(
         "-v",
         help="Enable verbose output.",
@@ -150,6 +155,11 @@ def __call__(self, parser, namespace, values, option_strings):
         action="store_true",
         help="Plot confusion matrix of token labels.",
     )
+    multiple_parser.add_argument(
+        "--combine-name-labels",
+        action="store_true",
+        help="Combine labels containing 'NAME' into a single NAME label.",
+    )
     multiple_parser.add_argument(
         "-r",
         "--runs",
@@ -227,6 +237,11 @@ def __call__(self, parser, namespace, values, option_strings):
         type=int,
         help="Seed value used for train/test split.",
     )
+    gridsearch_parser.add_argument(
+        "--combine-name-labels",
+        action="store_true",
+        help="Combine labels containing 'NAME' into a single NAME label.",
+    )
     gridsearch_parser.add_argument(
         "--algos",
         default=["lbfgs"],
@@ -327,6 +342,11 @@ def __call__(self, parser, namespace, values, option_strings):
         default=False,
         help="Keep models after evaluation instead of deleting.",
     )
+    featuresearch_parser.add_argument(
+        "--combine-name-labels",
+        action="store_true",
+        help="Combine labels containing 'NAME' into a single NAME label.",
+    )
     featuresearch_parser.add_argument(
         "-p",
         "--processes",

train/featuresearch.py

Lines changed: 12 additions & 2 deletions

@@ -74,6 +74,7 @@ def train_model_feature_search(
     save_model: str,
     seed: int,
     keep_model: bool,
+    combine_name_labels: bool,
 ) -> dict:
     """Train model using selected features returning model performance statistics,
     model parameters and elapsed training time.
@@ -93,6 +94,8 @@ def train_model_feature_search(
         testing sets.
     keep_model : bool
         If True, keep model after evaluation, otherwise delete it.
+    combine_name_labels : bool, optional
+        If True, combine all NAME labels into a single NAME label.

     Returns
     -------
@@ -157,7 +160,7 @@ def train_model_feature_search(
     tagger = pycrfsuite.Tagger() # type: ignore
     tagger.open(str(save_model_path))
     labels_pred = [tagger.tag(X) for X in features_test]
-    stats = evaluate(labels_pred, truth_test, seed)
+    stats = evaluate(labels_pred, truth_test, seed, combine_name_labels)

     if not keep_model:
         save_model_path.unlink(missing_ok=True)
@@ -179,7 +182,13 @@ def feature_search(args: argparse.Namespace):
     args : argparse.Namespace
         Feature search configuration
     """
-    vectors = load_datasets(args.database, args.table, args.datasets)
+    vectors = load_datasets(
+        args.database,
+        args.table,
+        args.datasets,
+        discard_other=True,
+        combine_name_labels=args.combine_name_labels,
+    )

     if args.save_model is None:
         save_model = DEFAULT_MODEL_LOCATION
@@ -195,6 +204,7 @@ def feature_search(args: argparse.Namespace):
             save_model,
             args.seed,
             args.keep_models,
+            args.combine_name_labels,
         ]
         argument_sets.append(arguments)
train/gridsearch.py

Lines changed: 12 additions & 2 deletions

@@ -321,7 +321,13 @@ def generate_argument_sets(args: argparse.Namespace) -> list[list]:
         list of lists, where each sublist is the arguments for training a model with
         one of the combinations of algorithms and parameters
     """
-    vectors = load_datasets(args.database, args.table, args.datasets)
+    vectors = load_datasets(
+        args.database,
+        args.table,
+        args.datasets,
+        discard_other=True,
+        combine_name_labels=args.combine_name_labels,
+    )

     # Generate list of arguments for all combinations parameters for each algorithm
     argument_sets = []
@@ -360,6 +366,7 @@ def generate_argument_sets(args: argparse.Namespace) -> list[list]:
             save_model,
             args.seed,
             args.keep_models,
+            args.combine_name_labels,
         ]
         argument_sets.append(arguments)

@@ -374,6 +381,7 @@ def train_model_grid_search(
     save_model: str,
     seed: int,
     keep_model: bool,
+    combine_name_labels: bool,
 ) -> dict:
     """Train model using given training algorithm and parameters,
     returning model performance statistics, model parameters and elapsed training time.
@@ -395,6 +403,8 @@ def train_model_grid_search(
         testing sets.
     keep_model : bool
         If True, keep model after evaluation, otherwise delete it.
+    combine_name_labels : bool, optional
+        If True, combine all NAME labels into a single NAME label.

     Returns
     -------
@@ -443,7 +453,7 @@ def train_model_grid_search(
     tagger = pycrfsuite.Tagger() # type: ignore
     tagger.open(str(save_model_path))
     labels_pred = [tagger.tag(X) for X in features_test]
-    stats = evaluate(labels_pred, truth_test, seed)
+    stats = evaluate(labels_pred, truth_test, seed, combine_name_labels)

     if not keep_model:
         save_model_path.unlink(missing_ok=True)
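Both generate_argument_sets here and feature_search above build positional argument lists that are later unpacked into the training functions, so args.combine_name_labels has to be appended at the same position that combine_name_labels occupies in the function signature. A minimal sketch of that pattern, assuming the argument sets are consumed by something like multiprocessing.Pool.starmap (the worker and its other arguments are illustrative):

from multiprocessing import Pool

def train_worker(save_model, seed, keep_model, combine_name_labels):
    # Stand-in for train_model_grid_search: parameter order must match the
    # order in which values were appended to each argument list.
    return (save_model, seed, keep_model, combine_name_labels)

argument_sets = [
    ["model_a.crfsuite", 42, False, True],   # combine_name_labels appended last
    ["model_b.crfsuite", 43, False, False],
]

if __name__ == "__main__":
    with Pool(2) as pool:
        print(pool.starmap(train_worker, argument_sets))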

train/train_model.py

Lines changed: 22 additions & 4 deletions

@@ -36,6 +36,7 @@ def train_parser_model(
     detailed_results: bool,
     plot_confusion_matrix: bool,
     keep_model: bool = True,
+    combine_name_labels: bool = False,
 ) -> Stats:
     """Train model using vectors, splitting the vectors into a train and evaluation
     set based on <split>. The trained model is saved to <save_model>.
@@ -59,9 +60,12 @@ def train_parser_model(
         the test set.
     plot_confusion_matrix : bool
         If True, plot a confusion matrix of the token labels.
-    kee[_model : bool, optional
+    keep_model : bool, optional
         If False, delete model from disk after evaluating it's performance.
         Default is True.
+    combine_name_labels : bool, optional
+        If True, combine all NAME labels into a single NAME label.
+        Default is False

     Returns
     -------
@@ -154,7 +158,7 @@ def train_parser_model(
     if plot_confusion_matrix:
         confusion_matrix(labels_pred, truth_test)

-    stats = evaluate(labels_pred, truth_test, seed)
+    stats = evaluate(labels_pred, truth_test, seed, combine_name_labels)

     if not keep_model:
         save_model.unlink(missing_ok=True)
@@ -170,7 +174,13 @@ def train_single(args: argparse.Namespace) -> None:
     args : argparse.Namespace
         Model training configuration
     """
-    vectors = load_datasets(args.database, args.table, args.datasets)
+    vectors = load_datasets(
+        args.database,
+        args.table,
+        args.datasets,
+        discard_other=True,
+        combine_name_labels=args.combine_name_labels,
+    )

     if args.save_model is None:
         save_model = DEFAULT_MODEL_LOCATION
@@ -186,6 +196,7 @@ def train_single(args: argparse.Namespace) -> None:
         args.detailed,
         args.confusion,
         keep_model=True,
+        combine_name_labels=args.combine_name_labels,
     )

     print("Sentence-level results:")
@@ -208,7 +219,13 @@ def train_multiple(args: argparse.Namespace) -> None:
     args : argparse.Namespace
         Model training configuration
     """
-    vectors = load_datasets(args.database, args.table, args.datasets)
+    vectors = load_datasets(
+        args.database,
+        args.table,
+        args.datasets,
+        discard_other=True,
+        combine_name_labels=args.combine_name_labels,
+    )

     if args.save_model is None:
         save_model = DEFAULT_MODEL_LOCATION
@@ -227,6 +244,7 @@ def train_multiple(args: argparse.Namespace) -> None:
             args.detailed,
             args.confusion,
             False, # keep_model
+            args.combine_name_labels,
         )
         for _ in range(args.runs)
     ]

train/training_utils.py

Lines changed: 39 additions & 8 deletions

@@ -70,11 +70,17 @@ class TokenStats:


 @dataclass
-class FFTokenStats:
+class TokenStatsCombinedName:
     """Statistics for token classification performance."""

-    FF: Metrics
-    NF: Metrics
+    NAME: Metrics
+    QTY: Metrics
+    UNIT: Metrics
+    SIZE: Metrics
+    COMMENT: Metrics
+    PURPOSE: Metrics
+    PREP: Metrics
+    PUNC: Metrics
     macro_avg: Metrics
     weighted_avg: Metrics
     accuracy: float
@@ -91,7 +97,7 @@ class SentenceStats:
 class Stats:
     """Statistics for token and sentence classification performance."""

-    token: TokenStats | FFTokenStats
+    token: TokenStats | TokenStatsCombinedName
     sentence: SentenceStats
     seed: int

@@ -161,6 +167,7 @@ def load_datasets(
     table: str,
     datasets: list[str],
     discard_other: bool = True,
+    combine_name_labels: bool = False,
 ) -> DataVectors:
     """Load raw data from csv files and transform into format required for training.

@@ -176,6 +183,8 @@ def load_datasets(
         Default is PARSER.
     discard_other : bool, optional
         If True, discard sentences containing tokens with OTHER label
+    combine_name_labels : bool, optional
+        If True, combine all labels containing "NAME" into a single "NAME" label

     Returns
     -------
@@ -216,6 +225,7 @@ def load_datasets(
             chunks,
             [PreProcessor] * n_chunks,
             [discard_other] * n_chunks,
+            [combine_name_labels] * n_chunks,
         )
     ]

@@ -235,7 +245,10 @@ def load_datasets(


 def process_sentences(
-    data: list[dict], PreProcessor: Callable, discard_other: bool
+    data: list[dict],
+    PreProcessor: Callable,
+    discard_other: bool,
+    combine_name_labels: bool,
 ) -> DataVectors:
     """Process training sentences from database into format needed for training and
     evaluation.
@@ -247,7 +260,9 @@ def process_sentences(
     PreProcessor : Callable
         PreProcessor class to preprocess sentences.
     discard_other : bool
-        If True, discard sentences with OTHER label
+        If True, discard sentences with OTHER
+    combine_name_labels : bool
+        If True, combine all labels containing "NAME" into a single "NAME" label

     Returns
     -------
@@ -278,7 +293,17 @@ def process_sentences(
         uids.append(entry["id"])
         features.append(p.sentence_features())
         tokens.append([t.text for t in p.tokenized_sentence])
-        labels.append(entry["labels"])
+
+        if combine_name_labels:
+            new_labels = []
+            for label in entry["labels"]:
+                if "NAME" in label:
+                    new_labels.append("NAME")
+                else:
+                    new_labels.append(label)
+            labels.append(new_labels)
+        else:
+            labels.append(entry["labels"])

         # Ensure length of tokens and length of labels are the same
         if len(p.tokenized_sentence) != len(entry["labels"]):
@@ -297,6 +322,7 @@ def evaluate(
     predictions: list[list[str]],
     truths: list[list[str]],
     seed: int,
+    combine_name_labels: bool,
 ) -> Stats:
     """Calculate statistics on the predicted labels for the test data.

@@ -308,6 +334,8 @@ def evaluate(
         True labels for each test sentence
     seed : int
         Seed value that produced the results
+    combine_name_labels : bool
+        If True, all NAME labels are combined into a single NAME label

     Returns
     -------
@@ -338,7 +366,10 @@ def evaluate(
     )

     token_stats["accuracy"] = accuracy_score(flat_truths, flat_predictions)
-    token_stats = TokenStats(**token_stats)
+    if combine_name_labels:
+        token_stats = TokenStatsCombinedName(**token_stats)
+    else:
+        token_stats = TokenStats(**token_stats)

     # Generate sentence statistics
     # The only statistics that makes sense here is accuracy because there are only
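The separate TokenStatsCombinedName dataclass exists because its fields must match the per-label metric keys produced during evaluation, and merging the NAME_* labels changes that key set to a single NAME entry. A hedged sketch of the effect, assuming per-label metrics come from something like sklearn's classification_report (the project may compute them differently; the labels below are toy data):

from sklearn.metrics import classification_report

truth = ["NAME", "NAME", "QTY", "UNIT", "COMMENT"]
pred = ["NAME", "COMMENT", "QTY", "UNIT", "COMMENT"]

# With combined labels there is one NAME key, so the stats container exposes a
# single NAME field (alongside QTY, UNIT, SIZE, COMMENT, PURPOSE, PREP, PUNC)
# rather than one field per NAME_* variant.
report = classification_report(truth, pred, output_dict=True, zero_division=0)
print(sorted(key for key in report if key.isupper()))
# -> ['COMMENT', 'NAME', 'QTY', 'UNIT']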
