Merge pull request #467 from smart-on-fhir/mikix/note-ordering

mikix · web-flow · commit 593cf0fad5bc · 2025-10-23T10:05:16.000-04:00
irae: add an early-exit condition for NLP if patient is in an end state
diff --git a/cumulus_etl/etl/studies/covid_symptom/covid_ctakes.py b/cumulus_etl/etl/studies/covid_symptom/covid_ctakes.py
@@ -32,7 +32,7 @@ async def covid_symptoms_extract(
     :return: list of NLP results encoded as FHIR observations
     """
     try:
-        note_ref, encounter_id, subject_id = nlp.get_note_info(docref)
+        note_ref, encounter_id, subject_ref = nlp.get_note_info(docref)
     except KeyError as exc:
         logging.warning(exc)
         return None
@@ -108,7 +108,7 @@ def _make_covid_symptom_row(row_id: str, match: dict | None) -> dict:
             "id": row_id,
             "docref_id": docref_id,
             "encounter_id": encounter_id,
-            "subject_id": subject_id,
+            "subject_id": subject_ref.split("/")[-1],
             "generated_on": timestamp,
             "task_version": task_version,
             "match": match,
diff --git a/cumulus_etl/etl/studies/irae/irae_tasks.py b/cumulus_etl/etl/studies/irae/irae_tasks.py
@@ -1,10 +1,14 @@
 """Define tasks for the irae study"""
 
+import datetime
+import logging
+from collections.abc import Generator, Iterator
 from enum import StrEnum
 
+import cumulus_fhir_support as cfs
 from pydantic import BaseModel, Field
 
-from cumulus_etl import nlp
+from cumulus_etl import common, nlp, store
 from cumulus_etl.etl import tasks
 
 
@@ -453,61 +457,124 @@ class BaseIraeTask(tasks.BaseModelTaskWithSpans):
     )
 
 
-class IraeDonorGpt4oTask(BaseIraeTask):
+class BaseDonorIraeTask(BaseIraeTask):
+    response_format = KidneyTransplantDonorGroupAnnotation
+
+
+class BaseLongitudinalIraeTask(BaseIraeTask):
+    response_format = KidneyTransplantLongitudinalAnnotation
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.subject_refs_to_skip = set()
+
+    @staticmethod
+    def ndjson_in_order(input_root: store.Root, resource: str) -> Generator[dict]:
+        # To avoid loading all the notes into memory, we'll first go through each note, and keep
+        # track of their byte offset on disk and their date. Then we'll grab each from disk in
+        # order.
+
+        # Get a list of all files we're going to be working with here
+        filenames = common.ls_resources(input_root, {resource})
+
+        # Go through all files, recording each line's date and offset (undated notes sort last).
+        note_info = []
+        for file_index, path in enumerate(filenames):
+            for row in cfs.read_multiline_json_with_details(path, fsspec_fs=input_root.fs):
+                date = nlp.get_note_date(row["json"]) or datetime.datetime.max
+                note_info.append((date, file_index, row["byte_offset"]))
+
+        # Now yield each note again in order, reading each from disk
+        note_info.sort()
+        for _date, file_index, offset in note_info:
+            rows = cfs.read_multiline_json_with_details(
+                filenames[file_index],
+                offset=offset,
+                fsspec_fs=input_root.fs,
+            )
+            # A StopIteration shouldn't happen here, because we just went through these files
+            # above, but just to be safe, we'll gracefully intercept it.
+            try:
+                yield next(rows)["json"]
+            except StopIteration:  # pragma: no cover
+                logging.warning(
+                    f"File '{filenames[file_index]}' changed while reading, skipping some notes."
+                )
+                continue
+
+    # Override the read-from-disk portion, so we can yield notes in oldest-to-newest order
+    def read_ndjson_from_disk(self, input_root: store.Root, resource: str) -> Iterator[dict]:
+        yield from self.ndjson_in_order(input_root, resource)
+
+    def should_skip(self, orig_note: dict) -> bool:
+        subject_ref = nlp.get_note_subject_ref(orig_note)
+        return subject_ref in self.subject_refs_to_skip or super().should_skip(orig_note)
+
+    def post_process(self, parsed: dict, orig_note_text: str, orig_note: dict) -> None:
+        super().post_process(parsed, orig_note_text, orig_note)
+
+        # If we have an annotation that asserts a confirmed graft failure or a deceased patient,
+        # we can stop processing charts for that patient, to avoid pointless NLP requests.
+
+        graft_failure = parsed.get("graft_failure_mention", {})
+        is_failed = (
+            graft_failure.get("has_mention")
+            and graft_failure.get("graft_failure") == GraftFailurePresent.CONFIRMED
+        )
+
+        deceased = parsed.get("deceased_mention", {})
+        is_deceased = deceased.get("has_mention") and deceased.get("deceased")
+
+        if is_failed or is_deceased:
+            if subject_ref := nlp.get_note_subject_ref(orig_note):
+                self.subject_refs_to_skip.add(subject_ref)
+
+
+class IraeDonorGpt4oTask(BaseDonorIraeTask):
     name = "irae__nlp_donor_gpt4o"
     client_class = nlp.Gpt4oModel
-    response_format = KidneyTransplantDonorGroupAnnotation
 
 
-class IraeLongitudinalGpt4oTask(BaseIraeTask):
+class IraeLongitudinalGpt4oTask(BaseLongitudinalIraeTask):
     name = "irae__nlp_gpt4o"
     client_class = nlp.Gpt4oModel
-    response_format = KidneyTransplantLongitudinalAnnotation
 
 
-class IraeDonorGpt5Task(BaseIraeTask):
+class IraeDonorGpt5Task(BaseDonorIraeTask):
     name = "irae__nlp_donor_gpt5"
     client_class = nlp.Gpt5Model
-    response_format = KidneyTransplantDonorGroupAnnotation
 
 
-class IraeLongitudinalGpt5Task(BaseIraeTask):
+class IraeLongitudinalGpt5Task(BaseLongitudinalIraeTask):
     name = "irae__nlp_gpt5"
     client_class = nlp.Gpt5Model
-    response_format = KidneyTransplantLongitudinalAnnotation
 
 
-class IraeDonorGptOss120bTask(BaseIraeTask):
+class IraeDonorGptOss120bTask(BaseDonorIraeTask):
     name = "irae__nlp_donor_gpt_oss_120b"
     client_class = nlp.GptOss120bModel
-    response_format = KidneyTransplantDonorGroupAnnotation
 
 
-class IraeLongitudinalGptOss120bTask(BaseIraeTask):
+class IraeLongitudinalGptOss120bTask(BaseLongitudinalIraeTask):
     name = "irae__nlp_gpt_oss_120b"
     client_class = nlp.GptOss120bModel
-    response_format = KidneyTransplantLongitudinalAnnotation
 
 
-class IraeDonorLlama4ScoutTask(BaseIraeTask):
+class IraeDonorLlama4ScoutTask(BaseDonorIraeTask):
     name = "irae__nlp_donor_llama4_scout"
     client_class = nlp.Llama4ScoutModel
-    response_format = KidneyTransplantDonorGroupAnnotation
 
 
-class IraeLongitudinalLlama4ScoutTask(BaseIraeTask):
+class IraeLongitudinalLlama4ScoutTask(BaseLongitudinalIraeTask):
     name = "irae__nlp_llama4_scout"
     client_class = nlp.Llama4ScoutModel
-    response_format = KidneyTransplantLongitudinalAnnotation
 
 
-class IraeDonorClaudeSonnet45Task(BaseIraeTask):
+class IraeDonorClaudeSonnet45Task(BaseDonorIraeTask):
     name = "irae__nlp_donor_claude_sonnet45"
     client_class = nlp.ClaudeSonnet45Model
-    response_format = KidneyTransplantDonorGroupAnnotation
 
 
-class IraeLongitudinalClaudeSonnet45Task(BaseIraeTask):
+class IraeLongitudinalClaudeSonnet45Task(BaseLongitudinalIraeTask):
     name = "irae__nlp_claude_sonnet45"
     client_class = nlp.ClaudeSonnet45Model
-    response_format = KidneyTransplantLongitudinalAnnotation
diff --git a/cumulus_etl/etl/tasks/base.py b/cumulus_etl/etl/tasks/base.py
@@ -371,6 +371,9 @@ def _write_errors(self, batch: formats.Batch, batch_index: int) -> None:
     #
     ##########################################################################################
 
+    def read_ndjson_from_disk(self, input_root: store.Root, resource: str) -> Iterator[dict]:
+        yield from common.read_resource_ndjson(input_root, resource)
+
     def read_ndjson(
         self, *, progress: rich.progress.Progress | None = None, resources: list[str] | None = None
     ) -> Iterator[dict]:
@@ -399,7 +402,7 @@ def read_ndjson(
         # You may want to process all linked resources first, and only then the "real" resource
         # (like we do for Medications and MedicationRequests).
         for resource in resources:
-            for line in common.read_resource_ndjson(input_root, resource):
+            for line in self.read_ndjson_from_disk(input_root, resource):
                 yield line
                 if progress:
                     progress.advance(row_task)
diff --git a/cumulus_etl/etl/tasks/nlp_task.py b/cumulus_etl/etl/tasks/nlp_task.py
@@ -142,8 +142,11 @@ async def init_check(cls) -> None:
 
     async def read_entries(self, *, progress: rich.progress.Progress = None) -> tasks.EntryIterator:
         async for orig_note, note, orig_note_text in self.read_notes(progress=progress):
+            if self.should_skip(orig_note):
+                continue
+
             try:
-                note_ref, encounter_id, subject_id = nlp.get_note_info(note)
+                note_ref, encounter_id, subject_ref = nlp.get_note_info(note)
             except KeyError as exc:
                 logging.warning(exc)
                 self.add_error(orig_note)
@@ -172,7 +175,7 @@ async def read_entries(self, *, progress: rich.progress.Progress = None) -> task
             yield {
                 "note_ref": note_ref,
                 "encounter_ref": f"Encounter/{encounter_id}",
-                "subject_ref": f"Patient/{subject_id}",
+                "subject_ref": subject_ref,
                 # Since this date is stored as a string, use UTC time for easy comparisons
                 "generated_on": common.datetime_now().isoformat(),
                 "task_version": self.task_version,
@@ -218,6 +221,10 @@ def get_user_prompt(cls, note_text: str) -> str:
         prompt = cls.user_prompt or "%CLINICAL-NOTE%"
         return prompt.replace("%CLINICAL-NOTE%", note_text)
 
+    def should_skip(self, orig_note: dict) -> bool:
+        """Subclasses can override this and return True to skip a note"""
+        return False
+
     def post_process(self, parsed: dict, orig_note_text: str, orig_note: dict) -> None:
         """Subclasses can fill this out if they like"""
 
diff --git a/cumulus_etl/formats/factory.py b/cumulus_etl/formats/factory.py
@@ -16,4 +16,4 @@ def get_format_class(name: str) -> type[Format]:
     try:
         return classes[name]
     except KeyError as exc:
-        raise ValueError(f"Unknown output format name {name}.") from exc
+        raise ValueError(f"Unknown output format name '{name}'.") from exc
diff --git a/cumulus_etl/nlp/__init__.py b/cumulus_etl/nlp/__init__.py
@@ -14,7 +14,7 @@
     set_nlp_provider,
 )
 from .selection import CsvMatcher, add_note_selection, get_note_filter, query_athena_table
-from .utils import cache_wrapper, get_note_info, is_note_valid
+from .utils import cache_wrapper, get_note_date, get_note_info, get_note_subject_ref, is_note_valid
 from .watcher import (
     check_ctakes,
     check_negation_cnlpt,
diff --git a/cumulus_etl/nlp/utils.py b/cumulus_etl/nlp/utils.py
@@ -1,5 +1,6 @@
 """Misc NLP functions"""
 
+import datetime
 import hashlib
 import os
 from collections.abc import Callable
@@ -35,7 +36,7 @@ async def is_note_valid(codebook: deid.Codebook, note: dict) -> bool:
 
 def get_note_info(note: dict) -> tuple[str, str, str]:
     """
-    Returns note_ref, encounter_id, subject_id for the given DocRef/DxReport.
+    Returns note_ref, encounter_id, subject_ref for the given DocRef/DxReport.
 
     Raises KeyError if any of them aren't present.
     """
@@ -44,10 +45,43 @@ def get_note_info(note: dict) -> tuple[str, str, str]:
     if not encounters:  # check for dxreport encounter field
         encounters = [note["encounter"]] if "encounter" in note else []
     if not encounters:
-        raise KeyError(f"No encounters for note {note_ref}")
+        raise KeyError(f"No encounters for note '{note_ref}'")
     _, encounter_id = fhir.unref_resource(encounters[0])
-    _, subject_id = fhir.unref_resource(note["subject"])
-    return note_ref, encounter_id, subject_id
+    subject_ref = get_note_subject_ref(note)
+    if not subject_ref:
+        raise KeyError(f"No subject for note '{note_ref}'")
+    return note_ref, encounter_id, subject_ref
+
+
+def get_note_subject_ref(note: dict) -> str | None:
+    """Returns the subject ref of a note, suitable for cross-referencing across notes"""
+    try:
+        subject_type, subject_id = fhir.unref_resource(note.get("subject"))
+    except ValueError:
+        return None
+
+    if subject_type:
+        return f"{subject_type}/{subject_id}"
+    else:
+        # avoids dealing with contained refs or other oddities that won't match across notes
+        return None
+
+
+def get_note_date(note: dict) -> datetime.datetime | None:
+    """Returns the date of a note - preferring clinical dates, then administrative ones"""
+    if note.get("resourceType") == "DiagnosticReport":
+        if time := fhir.parse_datetime(note.get("effectiveDateTime")):
+            return time
+        if time := fhir.parse_datetime(note.get("effectivePeriod", {}).get("start")):
+            return time
+        if time := fhir.parse_datetime(note.get("issued")):
+            return time
+    elif note.get("resourceType") == "DocumentReference":
+        if time := fhir.parse_datetime(note.get("context", {}).get("period", {}).get("start")):
+            return time
+        if time := fhir.parse_datetime(note.get("date")):
+            return time
+    return None
 
 
 async def cache_wrapper(
diff --git a/cumulus_etl/upload_notes/cli.py b/cumulus_etl/upload_notes/cli.py
@@ -55,23 +55,6 @@ async def gather_resources(
     )
 
 
-def datetime_from_resource(resource: dict) -> datetime.datetime | None:
-    """Returns the date of a resource - preferring clinical dates, then administrative ones"""
-    if resource["resourceType"] == "DiagnosticReport":
-        if time := fhir.parse_datetime(resource.get("effectiveDateTime")):
-            return time
-        if time := fhir.parse_datetime(resource.get("effectivePeriod", {}).get("start")):
-            return time
-        if time := fhir.parse_datetime(resource.get("issued")):
-            return time
-    elif resource["resourceType"] == "DocumentReference":
-        if time := fhir.parse_datetime(resource.get("context", {}).get("period", {}).get("start")):
-            return time
-        if time := fhir.parse_datetime(resource.get("date")):
-            return time
-    return None
-
-
 def _get_encounter_id(resource: dict) -> str | None:
     encounter_ref = None
     if resource["resourceType"] == "DiagnosticReport":
@@ -158,7 +141,7 @@ async def read_notes_from_ndjson(
                 doc_spans=doc_spans,
                 title=title,
                 text=text,
-                date=datetime_from_resource(resource),
+                date=nlp.get_note_date(resource),
             )
         )
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -10,7 +10,7 @@ dependencies = [
     "aiobotocore[boto3] >= 2.14.0",
     "boto3 >= 1.34.131",
     "ctakesclient >= 5.1",
-    "cumulus-fhir-support >= 1.6",
+    "cumulus-fhir-support >= 1.8",
     "delta-spark >= 4, < 5",
     "fsspec[http,s3]",
     "httpx",
diff --git a/tests/etl/test_etl_context.py b/tests/etl/test_etl_context.py
@@ -14,6 +14,7 @@ class TestJobContext(utils.AsyncTestCase):
     def test_missing_file_context(self):
         context = JobContext("nope")
         self.assertEqual({}, context.as_json())
+        self.assertIsNone(context.last_successful_datetime)
 
     def test_save_and_load(self):
         with tempfile.NamedTemporaryFile(mode="w+") as f:
diff --git a/tests/formats/test_formats.py b/tests/formats/test_formats.py
@@ -0,0 +1,8 @@
+from cumulus_etl import formats
+from tests import utils
+
+
+class TestFormats(utils.AsyncTestCase):
+    def test_invalid_format(self):
+        with self.assertRaisesRegex(ValueError, "Unknown output format name 'blarg'."):
+            formats.get_format_class("blarg")
diff --git a/tests/nlp/test_irae.py b/tests/nlp/test_irae.py
diff --git a/tests/nlp/test_utils.py b/tests/nlp/test_utils.py

Original file line number	Diff line number	Diff line change
`@@ -14,7 +14,7 @@`
`14`	`14`	`set_nlp_provider,`
`15`	`15`	`)`
`16`	`16`	`from .selection import CsvMatcher, add_note_selection, get_note_filter, query_athena_table`
`17`		`-from .utils import cache_wrapper, get_note_info, is_note_valid`
	`17`	`+from .utils import cache_wrapper, get_note_date, get_note_info, get_note_subject_ref, is_note_valid`
`18`	`18`	`from .watcher import (`
`19`	`19`	`check_ctakes,`
`20`	`20`	`check_negation_cnlpt,`