upload-notes: support sublabels in --label-by-* sources

mikix · mikix · commit 9d078357290b · 2025-09-08T13:57:45.000-04:00
The --label-by-* sources now look for two new optional fields in CSVs/Athena
tables: sublabel_name and sublabel_value. If both are provided, we send Label
Studio enough info to fill out that sublabel, where sublabel_name is the name
of a matching control in the project's labeling configuration.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -46,9 +46,10 @@ jobs:
         run: |
           sudo apt-get update
           sudo apt-get install dotnet8
+          sed -i 's/;net9.0</</' mstool/Directory.Build.props  # disable net9.0, it confuses SDK 8.0
           dotnet publish \
+            --framework=net8.0 \
             --runtime=linux-x64 \
-            --configuration=Release \
             -p:PublishSingleFile=true \
             --output=$HOME/.local/bin \
             mstool/FHIR/src/Microsoft.Health.Fhir.Anonymizer.R4.CommandLineTool
diff --git a/Dockerfile b/Dockerfile
@@ -7,11 +7,12 @@ FROM mcr.microsoft.com/dotnet/sdk:8.0 AS ms-tool
 COPY --from=ms-tool-src /app /app
 # This will force builds to fail if the environment piping breaks for some reason
 SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+RUN sed -i 's/;net9.0</</' /app/Directory.Build.props  # disable net9.0, it confuses SDK 8.0
 RUN arch=$(arch | sed s/aarch64/arm64/ | sed s/x86_64/x64/) && \
   dotnet publish \
+  --framework=net8.0 \
   --runtime=linux-${arch} \
   --self-contained=true \
-  --configuration=Release \
   -p:InvariantGlobalization=true \
   -p:PublishSingleFile=true \
   --output=/bin \
diff --git a/cumulus_etl/errors.py b/cumulus_etl/errors.py
@@ -47,6 +47,8 @@
 MULTIPLE_COHORT_ARGS = 47
 COHORT_NOT_FOUND = 48
 MULTIPLE_LABELING_ARGS = 49
+LABEL_UNKNOWN = 50
+LABEL_CONFIG_TYPE_UNKNOWN = 51
 
 
 class FatalError(Exception):
diff --git a/cumulus_etl/upload_notes/cli.py b/cumulus_etl/upload_notes/cli.py
@@ -2,6 +2,7 @@
 
 import argparse
 import asyncio
+import dataclasses
 import datetime
 import sys
 from collections.abc import Callable, Collection
@@ -247,7 +248,7 @@ def group_notes_by_unique_id(notes: Collection[LabelStudioNote]) -> list[LabelSt
     for unique_id, group_notes in by_unique_id.items():
         grouped_text = ""
         grouped_ctakes_matches = []
-        grouped_highlights = {}
+        grouped_highlights = []
         grouped_philter_map = {}
         grouped_doc_mappings = {}
         grouped_doc_spans = {}
@@ -283,14 +284,10 @@ def group_notes_by_unique_id(notes: Collection[LabelStudioNote]) -> list[LabelSt
                 match.end += offset
                 grouped_ctakes_matches.append(match)
 
-            for source, labels in note.highlights.items():
-                grouped_labels = grouped_highlights.setdefault(source, {})
-                for label, spans in labels.items():
-                    for span in spans:
-                        new_span = ctakesclient.typesystem.Span(
-                            span.begin + offset, span.end + offset
-                        )
-                        grouped_labels.setdefault(label, []).append(new_span)
+            for highlight in note.highlights:
+                span = highlight.span
+                new_span = (span[0] + offset, span[1] + offset)
+                grouped_highlights.append(dataclasses.replace(highlight, span=new_span))
 
             for start, stop in note.philter_map.items():
                 grouped_philter_map[start + offset] = stop + offset
diff --git a/cumulus_etl/upload_notes/labeling.py b/cumulus_etl/upload_notes/labeling.py
@@ -3,8 +3,6 @@
 import argparse
 from collections.abc import Collection
 
-import ctakesclient
-
 from cumulus_etl import cli_utils, common, deid, errors, nlp
 from cumulus_etl.upload_notes import labelstudio
 
@@ -56,7 +54,17 @@ def _label_by_csv(
     *,
     is_anon: bool,
 ) -> None:
-    matcher = nlp.CsvMatcher(csv_file, is_anon=is_anon, extra_fields=["label", "span", "origin"])
+    matcher = nlp.CsvMatcher(
+        csv_file,
+        is_anon=is_anon,
+        extra_fields=[
+            "label",
+            "span",
+            "sublabel_name",
+            "sublabel_value",
+            "origin",
+        ],
+    )
 
     for note in notes:
         for ref, doc_span in note.doc_spans.items():
@@ -65,15 +73,23 @@ def _label_by_csv(
                 for match in sorted(matches):
                     label = match[0]
                     span = match[1]
-                    origin = match[2] or DEFAULT_ORIGIN
+                    sublabel_name = match[2] or None
+                    sublabel_value = match[3] or None
+                    origin = match[4] or DEFAULT_ORIGIN
                     if "__" in origin:  # if it looks like a table name, chop it down
                         origin = origin.split("__", 1)[-1].removeprefix("nlp_")
                     if label and span and ":" in span:
                         begin, end = span.split(":", 1)
-                        span = ctakesclient.typesystem.Span(
-                            int(begin) + doc_span[0], int(end) + doc_span[0]
+                        span = (int(begin) + doc_span[0], int(end) + doc_span[0])
+                        note.highlights.append(
+                            labelstudio.Highlight(
+                                label,
+                                span,
+                                origin=origin,
+                                sublabel_name=sublabel_name,
+                                sublabel_value=sublabel_value,
+                            )
                         )
-                        note.highlights.setdefault(origin, {}).setdefault(label, []).append(span)
 
 
 def _highlight_words(
@@ -91,8 +107,12 @@ def _highlight_words(
     for note in notes:
         for pattern in patterns:
             for match in pattern.finditer(note.text):
-                # Look at group 2 (the middle term group, ignoring the edge groups)
-                span = ctakesclient.typesystem.Span(match.start(2), match.end(2))
-                labels = note.highlights.setdefault(DEFAULT_ORIGIN, {})
-                # We use a generic default label to cause Label Studio to highlight it
-                labels.setdefault(DEFAULT_LABEL, []).append(span)
+                note.highlights.append(
+                    labelstudio.Highlight(
+                        # We use a generic default label to cause Label Studio to highlight it
+                        label=DEFAULT_LABEL,
+                        # Look at group 2 (the middle term group, ignoring the edge groups)
+                        span=(match.start(2), match.end(2)),
+                        origin=DEFAULT_ORIGIN,
+                    )
+                )
diff --git a/cumulus_etl/upload_notes/labelstudio.py b/cumulus_etl/upload_notes/labelstudio.py
@@ -2,6 +2,7 @@
 
 import dataclasses
 import datetime
+import hashlib
 import math
 from collections.abc import AsyncIterator, Collection, Iterable
 
@@ -18,6 +19,17 @@
 ###############################################################################
 
 
+@dataclasses.dataclass
+class Highlight:
+    """Describes a label, a span, and some extra metadata"""
+
+    label: str
+    span: tuple[int, int]
+    origin: str
+    sublabel_name: str | None = None
+    sublabel_value: str | None = None
+
+
 @dataclasses.dataclass
 class LabelStudioNote:
     """Holds all the data that Label Studio will need for a single note (or a single grouped encounter note)"""
@@ -45,10 +57,8 @@ class LabelStudioNote:
         default_factory=list
     )
 
-    # Matches found by word search or csv, as a dict of origins -> labels -> found spans
-    highlights: dict[str, dict[str | None, list[ctakesclient.typesystem.Span]]] = dataclasses.field(
-        default_factory=dict
-    )
+    # Matches found by word search or csv
+    highlights: list[Highlight] = dataclasses.field(default_factory=list)
 
     # Matches found by Philter
     philter_map: dict[int, int] = dataclasses.field(default_factory=dict)
@@ -167,19 +177,49 @@ def _format_task_for_note(self, note: LabelStudioNote) -> dict:
 
         return task
 
-    def _format_match(self, begin: int, end: int, text: str, labels: Iterable[str]) -> dict:
-        return {
-            "from_name": self._labels_name,
-            "to_name": self._labels_config["to_name"][0],
-            "type": "labels",
+    def _format_match(
+        self,
+        begin: int,
+        end: int,
+        text: str,
+        labels: Iterable[str],
+        from_name: str | None = None,
+        label_id: str | None = None,
+    ) -> dict:
+        from_name = from_name or self._labels_name
+        config = self._project.parsed_label_config.get(from_name)
+        if not config:
+            errors.fatal(f"Unrecognized label name '{from_name}'.", errors.LABEL_UNKNOWN)
+
+        match = {
+            "from_name": from_name,
+            "to_name": config["to_name"][0],
+            "type": config["type"].casefold(),
             "value": {
                 "start": begin,
                 "end": end,
                 "score": 1.0,
                 "text": text,
-                "labels": list(labels),
             },
         }
+        if label_id:
+            match["id"] = label_id
+
+        match config["type"].casefold():
+            case "labels":
+                field = "labels"
+            case "choices":
+                field = "choices"
+            case "textarea":
+                field = "text"
+            case _:
+                errors.fatal(
+                    f"Unrecognized Label Studio config type '{config['type']}'.",
+                    errors.LABEL_CONFIG_TYPE_UNKNOWN,
+                )
+
+        match["value"][field] = list(labels)
+        return match
 
     def _format_ctakes_predictions(self, task: dict, note: LabelStudioNote) -> None:
         if not note.ctakes_matches:
@@ -208,20 +248,46 @@ def _format_ctakes_predictions(self, task: dict, note: LabelStudioNote) -> None:
         self._update_used_labels(task, used_labels)
 
     def _format_highlights_predictions(self, task: dict, note: LabelStudioNote) -> None:
-        for source, labels in note.highlights.items():
-            prediction = {"model_version": source}
-            results = []
-            for label, spans in labels.items():
-                for span in spans:
-                    results.append(
-                        self._format_match(
-                            span.begin, span.end, note.text[span.begin : span.end], [label]
-                        )
+        # Group up the highlights by parent label.
+        # Then we'll see how many sublabels it has.
+        grouped_highlights = {}  # key-tuple -> sublabel name -> sublabel value list
+        for highlight in note.highlights:
+            key = (highlight.label, highlight.span, highlight.origin)
+            sublabels = grouped_highlights.setdefault(key, {})
+            sublabels.setdefault(highlight.sublabel_name, []).append(highlight.sublabel_value)
+
+        predictions = {}  # dict of origin -> prediction dict
+        for key, sublabels in grouped_highlights.items():
+            label, span, origin = key
+            default_prediction = {"model_version": origin, "result": []}
+            prediction = predictions.setdefault(origin, default_prediction)
+
+            label_id = "__".join(str(k) for k in key)
+            label_id = hashlib.md5(label_id.encode(), usedforsecurity=False).hexdigest()
+            text = note.text[span[0] : span[1]]
+
+            # First, add the parent label
+            prediction["result"].append(
+                self._format_match(span[0], span[1], text, [label], label_id=label_id)
+            )
+
+            # Now add sublabels
+            for sublabel_name, sublabel_values in sublabels.items():
+                if not sublabel_name:
+                    continue
+                prediction["result"].append(
+                    self._format_match(
+                        span[0],
+                        span[1],
+                        text,
+                        sublabel_values,
+                        label_id=label_id,
+                        from_name=sublabel_name,
                     )
-            prediction["result"] = results
-            task["predictions"].append(prediction)
+                )
 
-            self._update_used_labels(task, labels.keys())
+        task["predictions"].extend(predictions.values())
+        self._update_used_labels(task, {x.label for x in note.highlights})
 
     def _format_philter_predictions(self, task: dict, note: LabelStudioNote) -> None:
         """
diff --git a/tests/upload_notes/test_upload_cli.py b/tests/upload_notes/test_upload_cli.py
diff --git a/tests/upload_notes/test_upload_labelstudio.py b/tests/upload_notes/test_upload_labelstudio.py