Commit 7a26b11

feat: have NLP tasks read in DxReports as well as DocRefs
This commit adds support for dual-resource tasks and then adds DiagnosticReport to the NLP base task class. This required some vocabulary alignment, as we used "docref" a lot in places that can now take a docref or a dxreport:

- "note" or "note resource": a DocRef or DxReport resource (dict)
- "note text" or "text": the clinical text inside the note
1 parent 3cd9947 commit 7a26b11
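
For orientation, here is a minimal, hand-written sketch of the two resource shapes that now both count as a "note resource". This is illustrative only: the field layout follows FHIR R4, but these dicts are invented samples, not fixtures from this repo.

# Illustrative only: minimal "note resource" dicts. In FHIR R4, a DocRef
# carries its text in content[].attachment, a DxReport in presentedForm[].
docref = {
    "resourceType": "DocumentReference",
    "id": "doc1",
    "subject": {"reference": "Patient/p1"},
    "content": [{"attachment": {"contentType": "text/plain", "data": "Tm8gY291Z2gu"}}],
}
dxreport = {
    "resourceType": "DiagnosticReport",
    "id": "dx1",
    "subject": {"reference": "Patient/p1"},
    "presentedForm": [{"contentType": "text/plain", "data": "Tm8gY291Z2gu"}],
}
# Either dict is a "note"; the decoded attachment ("No cough.") is the "note text".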

11 files changed: +145, -93 lines


cumulus_etl/etl/nlp/cli.py

Lines changed: 3 additions & 4 deletions
@@ -112,10 +112,9 @@ def get_cohort_filter(args: argparse.Namespace) -> Callable[[deid.Codebook, dict

     def res_filter(codebook: deid.Codebook, resource: dict) -> bool:
         match resource["resourceType"]:
-            # TODO: uncomment once we support DxReport NLP (coming soon)
-            # case "DiagnosticReport":
-            #     id_pool = dxreport_ids
-            #     patient_ref = resource.get("subject", {}).get("reference")
+            case "DiagnosticReport":
+                id_pool = dxreport_ids
+                patient_ref = resource.get("subject", {}).get("reference")
             case "DocumentReference":
                 id_pool = docref_ids
                 patient_ref = resource.get("subject", {}).get("reference")
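
For readers newer to structural pattern matching, here is a standalone sketch of the dispatch above. The id pools and the final membership check are hypothetical stand-ins for what get_cohort_filter() actually builds; the real filter also resolves patient references through the de-id codebook.

# Hypothetical standalone version of the dispatch (requires Python 3.10+).
dxreport_ids = {"dx1"}  # stand-in cohort pools
docref_ids = {"doc1"}

def res_filter(resource: dict) -> bool:
    match resource["resourceType"]:
        case "DiagnosticReport":
            id_pool = dxreport_ids
        case "DocumentReference":
            id_pool = docref_ids
        case _:
            return False
    return resource["id"] in id_pool

print(res_filter({"resourceType": "DiagnosticReport", "id": "dx1"}))  # True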

cumulus_etl/etl/pipeline.py

Lines changed: 15 additions & 10 deletions
@@ -120,6 +120,7 @@ async def check_available_resources(
     requested_resources: set[str],
     args: argparse.Namespace,
     is_default_tasks: bool,
+    nlp: bool,
 ) -> set[str]:
     # Here we try to reconcile which resources the user requested and which resources are actually
     # available in the input root.
@@ -138,25 +139,28 @@ async def check_available_resources(
     if detected is None:
         return requested_resources  # likely we haven't run bulk export yet

-    if missing_resources := requested_resources - detected:
+    missing_resources = requested_resources - detected
+    available_resources = requested_resources & detected
+
+    if nlp and available_resources:
+        # As long as there is any resource for NLP to read from, we'll take it
+        return available_resources
+
+    if missing_resources:
         for resource in sorted(missing_resources):
             # Log the same message we would print if in common.py if we ran tasks anyway
             logging.warning("No %s files found in %s", resource, loader.root.path)

         if is_default_tasks:
-            requested_resources -= missing_resources  # scope down to detected resources
-            if not requested_resources:
-                errors.fatal(
-                    "No supported resources found.",
-                    errors.MISSING_REQUESTED_RESOURCES,
-                )
+            if not available_resources:
+                errors.fatal("No supported resources found.", errors.MISSING_REQUESTED_RESOURCES)
         else:
             msg = "Required resources not found.\n"
             if has_allow_missing:
                 msg += "Add --allow-missing-resources to run related tasks anyway with no input."
             errors.fatal(msg, errors.MISSING_REQUESTED_RESOURCES)

-    return requested_resources
+    return available_resources


 async def run_pipeline(
@@ -192,7 +196,7 @@ async def run_pipeline(
         await task.init_check()

     # Grab a list of all required resource types for the tasks we are running
-    required_resources = set(t.resource for t in selected_tasks)
+    required_resources = set().union(*(t.get_resource_types() for t in selected_tasks))

     # Create a client to talk to a FHIR server.
     # This is useful even if we aren't doing a bulk export, because some resources like
@@ -214,9 +218,10 @@ async def run_pipeline(
         args=args,
         is_default_tasks=is_default_tasks,
         requested_resources=required_resources,
+        nlp=nlp,
     )
     # Drop any tasks that we didn't find resources for
-    selected_tasks = [t for t in selected_tasks if t.resource in required_resources]
+    selected_tasks = [t for t in selected_tasks if t.get_resource_types() & required_resources]

     # Load resources from a remote location (like s3), convert from i2b2, or do a bulk export
     loader_results = await config_loader.load_resources(required_resources)
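
A toy walkthrough of the new set arithmetic (the values here are invented for illustration):

# Toy values, not from a real run:
requested = {"DiagnosticReport", "DocumentReference"}
detected = {"DocumentReference", "Patient"}

missing = requested - detected    # {'DiagnosticReport'}
available = requested & detected  # {'DocumentReference'}
# With nlp=True, a non-empty `available` is returned immediately, so an NLP
# task can still run on DocRefs alone when DxReports were never exported.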

cumulus_etl/etl/studies/covid_symptom/covid_ctakes.py

Lines changed: 6 additions & 3 deletions
@@ -32,7 +32,7 @@ async def covid_symptoms_extract(
     :return: list of NLP results encoded as FHIR observations
     """
     try:
-        docref_id, encounter_id, subject_id = nlp.get_docref_info(docref)
+        note_ref, encounter_id, subject_id = nlp.get_note_info(docref)
     except KeyError as exc:
         logging.warning(exc)
         return None
@@ -62,7 +62,7 @@ async def covid_symptoms_extract(
         )
     except Exception as exc:
         logging.warning(
-            "Could not extract symptoms for docref %s (%s): %s", docref_id, type(exc).__name__, exc
+            "Could not extract symptoms for %s (%s): %s", note_ref, type(exc).__name__, exc
         )
         return None

@@ -95,10 +95,13 @@ def is_covid_match(m: ctakesclient.typesystem.MatchText):
         )
     except Exception as exc:
         logging.warning(
-            "Could not check polarity for docref %s (%s): %s", docref_id, type(exc).__name__, exc
+            "Could not check polarity for %s (%s): %s", note_ref, type(exc).__name__, exc
         )
         return None

+    # We only look at docrefs - get just the ID for use in the symptom fields
+    docref_id = note_ref.removeprefix("DocumentReference/")
+
     # Helper to make a single row (match_value is None if there were no found symptoms at all)
     def _make_covid_symptom_row(row_id: str, match: dict | None) -> dict:
         return {
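
Note that str.removeprefix() (Python 3.9+) only strips the prefix when it is present, so the derived docref_id stays well-formed even if a non-DocRef reference ever slipped through:

print("DocumentReference/abc".removeprefix("DocumentReference/"))  # abc
print("DiagnosticReport/xyz".removeprefix("DocumentReference/"))   # unchanged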

cumulus_etl/etl/studies/covid_symptom/covid_tasks.py

Lines changed: 4 additions & 1 deletion
@@ -65,8 +65,11 @@ def is_ed_coding(coding):
     return coding.get("code") in ED_CODES.get(coding.get("system"), {})


-def is_ed_docref(docref):
+def is_ed_docref(docref) -> bool:
     """Returns true if this is a coding for an emergency department note"""
+    if docref["resourceType"] != "DocumentReference":
+        return False
+
     # We check both type and category for safety -- we aren't sure yet how EHRs are using these fields.
     codings = list(
         itertools.chain.from_iterable([cat.get("coding", []) for cat in docref.get("category", [])])
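
With this guard, a DxReport flowing through the now dual-resource NLP plumbing is simply excluded from this DocRef-only covid cohort. A toy check (the import path is assumed to mirror the file layout):

# Toy resource; is_ed_docref() now short-circuits on resourceType.
from cumulus_etl.etl.studies.covid_symptom.covid_tasks import is_ed_docref

print(is_ed_docref({"resourceType": "DiagnosticReport", "id": "dx1"}))  # False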

cumulus_etl/etl/tasks/base.py

Lines changed: 11 additions & 3 deletions
@@ -89,7 +89,7 @@ class EtlTask:
     # Properties:
     name: ClassVar[str] = None  # task & table name
     # incoming resource that this task operates on (will be included in bulk exports etc)
-    resource: ClassVar[str] = None
+    resource: ClassVar[str | set[str]] = None
     tags: ClassVar[set[str]] = []
     # whether this task needs bulk MS tool de-id run on its inputs (NLP tasks usually don't)
     needs_bulk_deid: ClassVar[bool] = True
@@ -378,10 +378,11 @@ def read_ndjson(

         If `resources` is provided, those resources will be read (in the provided order).
         That is, ["Condition", "Encounter"] will first read all Conditions, then all Encounters.
-        If `resources` is not provided, the task's main resource (self.resource) will be used.
+        If `resources` is not provided, the task's main resources (via self.get_resource_types())
+        will be used.
         """
         input_root = store.Root(self.task_config.dir_input)
-        resources = resources or [self.resource]
+        resources = resources or sorted(self.get_resource_types())

         if progress:
             # Make new task to track processing of rows
@@ -472,3 +473,10 @@ def get_schema(cls, resource_type: str | None, rows: list[dict]) -> pyarrow.Sche
         if resource_type:
             return cfs.pyarrow_schema_from_rows(resource_type, rows)
         return None
+
+    @classmethod
+    def get_resource_types(cls) -> set[str]:
+        """Abstracts whether the class's resource field is a str or a set of strings."""
+        if isinstance(cls.resource, str):
+            return {cls.resource}
+        return set(cls.resource)
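
A quick sketch of how the helper normalizes both declaration styles. The two task classes here are hypothetical, and the import path is assumed from the file layout; only the `resource` ClassVar matters for this demo.

from cumulus_etl.etl.tasks.base import EtlTask  # assumed path, mirrors this file

class SingleTask(EtlTask):          # hypothetical single-resource task
    resource = "DocumentReference"

class DualTask(EtlTask):            # hypothetical dual-resource task
    resource = {"DiagnosticReport", "DocumentReference"}

print(sorted(SingleTask.get_resource_types()))  # ['DocumentReference']
print(sorted(DualTask.get_resource_types()))    # ['DiagnosticReport', 'DocumentReference']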

cumulus_etl/etl/tasks/nlp_task.py

Lines changed: 37 additions & 43 deletions
@@ -27,7 +27,7 @@
 class BaseNlpTask(tasks.EtlTask):
     """Base class for any clinical-notes-based NLP task."""

-    resource: ClassVar = "DocumentReference"
+    resource: ClassVar = {"DiagnosticReport", "DocumentReference"}
     needs_bulk_deid: ClassVar = False

     # You may want to override these in your subclass
@@ -80,44 +80,45 @@ async def read_notes(
         """
         Iterate through clinical notes.

-        :returns: a tuple of original-docref, scrubbed-docref, and clinical note
+        :returns: a tuple of original-resource, scrubbed-resource, and note text
         """
         warned_connection_error = False

-        note_filter = self.task_config.resource_filter or nlp.is_docref_valid
+        note_filter = self.task_config.resource_filter or nlp.is_note_valid

-        for docref in self.read_ndjson(progress=progress):
-            orig_docref = copy.deepcopy(docref)
+        for note in self.read_ndjson(progress=progress):
+            orig_note = copy.deepcopy(note)
             can_process = (
-                note_filter(self.scrubber.codebook, docref)
-                and (doc_check is None or doc_check(docref))
-                and self.scrubber.scrub_resource(docref, scrub_attachments=False, keep_stats=False)
+                note_filter(self.scrubber.codebook, note)
+                and (doc_check is None or doc_check(note))
+                and self.scrubber.scrub_resource(note, scrub_attachments=False, keep_stats=False)
             )
             if not can_process:
                 continue

             try:
-                clinical_note = await fhir.get_clinical_note(self.task_config.client, docref)
+                note_text = await fhir.get_clinical_note(self.task_config.client, note)
             except cfs.BadAuthArguments as exc:
                 if not warned_connection_error:
                     # Only warn user about a misconfiguration once per task.
                     # It's not fatal because it might be intentional (partially inlined DocRefs
                     # and the other DocRefs are known failures - BCH hits this with Cerner data).
                     print(exc, file=sys.stderr)
                     warned_connection_error = True
-                self.add_error(orig_docref)
+                self.add_error(orig_note)
                 continue
             except Exception as exc:
-                logging.warning("Error getting text for docref %s: %s", docref["id"], exc)
-                self.add_error(orig_docref)
+                orig_note_ref = f"{orig_note['resourceType']}/{orig_note['id']}"
+                logging.warning("Error getting text for note %s: %s", orig_note_ref, exc)
+                self.add_error(orig_note)
                 continue

-            yield orig_docref, docref, clinical_note
+            yield orig_note, note, note_text

     @staticmethod
-    def remove_trailing_whitespace(note: str) -> str:
+    def remove_trailing_whitespace(note_text: str) -> str:
         """Sometimes NLP can be mildly confused by trailing whitespace, so this removes it"""
-        return TRAILING_WHITESPACE.sub("", note)
+        return TRAILING_WHITESPACE.sub("", note_text)


 class BaseOpenAiTask(BaseNlpTask):
@@ -139,59 +140,52 @@ async def init_check(cls) -> None:
     async def read_entries(self, *, progress: rich.progress.Progress = None) -> tasks.EntryIterator:
         client = self.client_class()

-        async for orig_docref, docref, orig_clinical_note in self.read_notes(progress=progress):
+        async for orig_note, note, orig_note_text in self.read_notes(progress=progress):
             try:
-                docref_id, encounter_id, subject_id = nlp.get_docref_info(docref)
+                note_ref, encounter_id, subject_id = nlp.get_note_info(note)
             except KeyError as exc:
                 logging.warning(exc)
-                self.add_error(orig_docref)
+                self.add_error(orig_note)
                 continue

-            clinical_note = self.remove_trailing_whitespace(orig_clinical_note)
+            note_text = self.remove_trailing_whitespace(orig_note_text)
+            orig_note_ref = f"{orig_note['resourceType']}/{orig_note['id']}"

             try:
                 completion_class = chat.ParsedChatCompletion[self.response_format]
                 response = await nlp.cache_wrapper(
                     self.task_config.dir_phi,
                     f"{self.name}_v{self.task_version}",
-                    clinical_note,
+                    note_text,
                     lambda x: completion_class.model_validate_json(x),  # from file
                     lambda x: x.model_dump_json(  # to file
                         indent=None, round_trip=True, exclude_unset=True, by_alias=True
                     ),
                     client.prompt,
                     self.system_prompt,
-                    self.get_user_prompt(clinical_note),
+                    self.get_user_prompt(note_text),
                     self.response_format,
                 )
-            except openai.APIError as exc:
-                logging.warning(
-                    f"Could not connect to NLP server for DocRef {orig_docref['id']}: {exc}"
-                )
-                self.add_error(orig_docref)
-                continue
-            except pydantic.ValidationError as exc:
-                logging.warning(
-                    f"Could not process answer from NLP server for DocRef {orig_docref['id']}: {exc}"
-                )
-                self.add_error(orig_docref)
+            except Exception as exc:
+                logging.warning(f"NLP failed for {orig_note_ref}: {exc}")
+                self.add_error(orig_note)
                 continue

             choice = response.choices[0]

             if choice.finish_reason != "stop" or not choice.message.parsed:
                 logging.warning(
-                    f"NLP server response didn't complete for DocRef {orig_docref['id']}: "
+                    f"NLP server response didn't complete for {orig_note_ref}: "
                     f"{choice.finish_reason}"
                 )
-                self.add_error(orig_docref)
+                self.add_error(orig_note)
                 continue

             parsed = choice.message.parsed.model_dump(mode="json")
-            self.post_process(parsed, orig_clinical_note, orig_docref)
+            self.post_process(parsed, orig_note_text, orig_note)

             yield {
-                "note_ref": f"DocumentReference/{docref_id}",
+                "note_ref": note_ref,
                 "encounter_ref": f"Encounter/{encounter_id}",
                 "subject_ref": f"Patient/{subject_id}",
                 # Since this date is stored as a string, use UTC time for easy comparisons
@@ -202,11 +196,11 @@ async def read_entries(self, *, progress: rich.progress.Progress = None) -> task
             }

     @classmethod
-    def get_user_prompt(cls, clinical_note: str) -> str:
+    def get_user_prompt(cls, note_text: str) -> str:
         prompt = cls.user_prompt or "%CLINICAL-NOTE%"
-        return prompt.replace("%CLINICAL-NOTE%", clinical_note)
+        return prompt.replace("%CLINICAL-NOTE%", note_text)

-    def post_process(self, parsed: dict, orig_clinical_note: str, orig_docref: dict) -> None:
+    def post_process(self, parsed: dict, orig_note_text: str, orig_note: dict) -> None:
         """Subclasses can fill this out if they like"""

     @classmethod
@@ -261,7 +255,7 @@ class BaseOpenAiTaskWithSpans(BaseOpenAiTask):
     It assumes the field is named "spans" in the top level of the pydantic model.
     """

-    def post_process(self, parsed: dict, orig_clinical_note: str, orig_docref: dict) -> None:
+    def post_process(self, parsed: dict, orig_note_text: str, orig_note: dict) -> None:
         new_spans = []
         missed_some = False

@@ -278,18 +272,18 @@ def post_process(self, parsed: dict, orig_clinical_note: str, orig_docref: dict)
             span = ESCAPED_WHITESPACE.sub(r"\\s+", span)

             found = False
-            for match in re.finditer(span, orig_clinical_note, re.IGNORECASE):
+            for match in re.finditer(span, orig_note_text, re.IGNORECASE):
                 found = True
                 new_spans.append(match.span())
             if not found:
                 missed_some = True
                 logging.warning(
                     "Could not match span received from NLP server for "
-                    f"DocRef {orig_docref['id']}: {orig_span}"
+                    f"{orig_note['resourceType']}/{orig_note['id']}: {orig_span}"
                 )

         if missed_some:
-            self.add_error(orig_docref)
+            self.add_error(orig_note)

         parsed["spans"] = new_spans
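
The renamed nlp.get_note_info() itself isn't part of this diff (it lives in cumulus_etl/nlp/utils.py). From the call sites above we can infer its shape: it returns a full "Type/id" note_ref plus encounter and subject IDs, and raises KeyError when a field is missing. A speculative sketch, not the real implementation:

def get_note_info(note: dict) -> tuple[str, str, str]:
    # Speculative sketch based only on call sites; the real helper may differ.
    note_ref = f"{note['resourceType']}/{note['id']}"  # KeyError if id is missing
    subject_id = note["subject"]["reference"].removeprefix("Patient/")
    # Guesswork: FHIR R4 DocRefs hold encounters in context.encounter (a list),
    # while DxReports have a single `encounter` field.
    if note["resourceType"] == "DocumentReference":
        encounter_ref = note["context"]["encounter"][0]["reference"]
    else:
        encounter_ref = note["encounter"]["reference"]
    encounter_id = encounter_ref.removeprefix("Encounter/")
    return note_ref, encounter_id, subject_id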

cumulus_etl/export/cli.py

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,7 @@
 """Do a standalone bulk export from an EHR"""

 import argparse
+import itertools
 import sys

 from cumulus_etl import cli_utils, common, errors, fhir, loaders, store
@@ -32,7 +33,7 @@ async def export_main(args: argparse.Namespace) -> None:
     store.set_user_fs_options(vars(args))

     selected_tasks = task_factory.get_selected_tasks(args.task)
-    required_resources = {t.resource for t in selected_tasks}
+    required_resources = set().union(*(t.get_resource_types() for t in selected_tasks))
     using_default_tasks = not args.task

     # Fold in manually specified --type args (very similar to --task, but more familiar to folks
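
A small note on the set().union(*(...)) idiom used here and in pipeline.py: starting from an empty set makes the expression safe even when no tasks are selected, whereas calling set.union with zero arguments would raise.

print(set().union(*({"A"}, {"A", "B"})))  # {'A', 'B'} (set order may vary)
print(set().union(*()))                   # set() -- an empty task list is fine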

cumulus_etl/nlp/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@

 from .extract import TransformerModel, ctakes_extract, ctakes_httpx_client, list_polarity
 from .openai import Gpt4Model, Gpt4oModel, Gpt5Model, Gpt35Model, GptOss120bModel, Llama4ScoutModel
-from .utils import cache_wrapper, get_docref_info, is_docref_valid
+from .utils import cache_wrapper, get_note_info, is_note_valid
 from .watcher import (
     check_ctakes,
     check_negation_cnlpt,
