Merge pull request #465 from smart-on-fhir/mikix/irae-split

mikix · web-flow · commit bd62e8a63c78 · 2025-10-22T10:45:01.000-04:00
irae: split one task into two
diff --git a/cumulus_etl/etl/studies/irae/__init__.py b/cumulus_etl/etl/studies/irae/__init__.py
@@ -1,7 +1,12 @@
 """The irae study"""
 
-from .irae_tasks import IraeClaudeSonnet45Task as IraeClaudeSonnet45Task
-from .irae_tasks import IraeGpt4oTask as IraeGpt4oTask
-from .irae_tasks import IraeGpt5Task as IraeGpt5Task
-from .irae_tasks import IraeGptOss120bTask as IraeGptOss120bTask
-from .irae_tasks import IraeLlama4ScoutTask as IraeLlama4ScoutTask
+from .irae_tasks import IraeDonorClaudeSonnet45Task as IraeDonorClaudeSonnet45Task
+from .irae_tasks import IraeDonorGpt4oTask as IraeDonorGpt4oTask
+from .irae_tasks import IraeDonorGpt5Task as IraeDonorGpt5Task
+from .irae_tasks import IraeDonorGptOss120bTask as IraeDonorGptOss120bTask
+from .irae_tasks import IraeDonorLlama4ScoutTask as IraeDonorLlama4ScoutTask
+from .irae_tasks import IraeLongitudinalClaudeSonnet45Task as IraeLongitudinalClaudeSonnet45Task
+from .irae_tasks import IraeLongitudinalGpt4oTask as IraeLongitudinalGpt4oTask
+from .irae_tasks import IraeLongitudinalGpt5Task as IraeLongitudinalGpt5Task
+from .irae_tasks import IraeLongitudinalGptOss120bTask as IraeLongitudinalGptOss120bTask
+from .irae_tasks import IraeLongitudinalLlama4ScoutTask as IraeLongitudinalLlama4ScoutTask
diff --git a/cumulus_etl/etl/studies/irae/irae_tasks.py b/cumulus_etl/etl/studies/irae/irae_tasks.py
@@ -1,6 +1,5 @@
 """Define tasks for the irae study"""
 
-import json
 from enum import StrEnum
 
 from pydantic import BaseModel, Field
@@ -10,7 +9,7 @@
 
 
 class SpanAugmentedMention(BaseModel):
-    has_mention: bool | None  # True, False, or None
+    has_mention: bool  # True, False
     spans: list[str]
 
 
@@ -23,7 +22,10 @@ class SpanAugmentedMention(BaseModel):
 
 # Dates are treated as strings - no enum needed
 class DonorTransplantDateMention(SpanAugmentedMention):
-    donor_transplant_date: str | None = Field(None, description="Date of renal transplant")
+    donor_transplant_date: str | None = Field(
+        None,
+        description="Exact date of renal transplant; use YYYY-MM-DD format in your response. Only highlight date mentions with an explicit day, month, and year (e.g. 2020-01-15). All other date mentions, or an absence of a date mention, should be indicated with None.",
+    )
 
 
 class DonorType(StrEnum):
@@ -51,9 +53,11 @@ class DonorRelationshipMention(SpanAugmentedMention):
 
 
 class DonorHlaMatchQuality(StrEnum):
-    WELL = "Well matched (0-1 mismatches)"
-    MODERATE = "Moderately matched (2-4 mismatches)"
-    POOR = "Poorly matched (5-6 mismatches)"
+    WELL = "Well matched (0-1 mismatches) OR recipient explicitly documented as not sensitized"
+    MODERATE = (
+        "Moderately matched (2-4 mismatches) OR recipient explicitly documented as sensitized"
+    )
+    POOR = "Poorly matched (5-6 mismatches) OR recipient explicitly documented as highly sensitized"
     NOT_MENTIONED = "HLA match quality not mentioned"
 
 
@@ -354,7 +358,7 @@ class DeceasedMention(SpanAugmentedMention):
     deceased_date: str | None = Field(
         None,
         description=(
-            "If the patient is deceased, include the date the patient became deceased. "
+            "If the patient is deceased, include the date the patient became deceased. Use YYYY-MM-DD format if possible. "
             "Use None if there is no date recorded or if the patient is not observed as deceased."
         ),
     )
@@ -365,7 +369,7 @@ class DeceasedMention(SpanAugmentedMention):
 ###############################################################################
 
 
-class KidneyTransplantAnnotation(BaseModel):
+class KidneyTransplantDonorGroupAnnotation(BaseModel):
     """
     An object-model for annotations of immune related adverse event (IRAE)
     observations found in a patient's chart, relating specifically to kidney
@@ -381,6 +385,24 @@ class KidneyTransplantAnnotation(BaseModel):
     donor_relationship_mention: DonorRelationshipMention
     donor_hla_match_quality_mention: DonorHlaMatchQualityMention
     donor_hla_mismatch_count_mention: DonorHlaMismatchCountMention
+
+
+class KidneyTransplantLongitudinalAnnotation(BaseModel):
+    """
+    An object-model for annotations of immune related adverse event (IRAE)
+    observations found in a patient's chart, relating specifically to kidney
+    transplants.
+
+    This class only includes longitudinally variable mentions, i.e. those
+    that can change over time, such as therapeutic status, compliance, infections,
+    graft rejection/failure, DSA, PTLD, cancer, and deceased status.
+
+    Take care to avoid false positives, like confusing information that only
+    appears in family history for patient history. Annotations should indicate
+    the relevant details of the finding, as well as some additional evidence
+    metadata to validate findings post-hoc.
+    """
+
     rx_therapeutic_status_mention: RxTherapeuticStatusMention
     rx_compliance_mention: RxComplianceMention
     dsa_mention: DSAMention
@@ -396,8 +418,9 @@ class KidneyTransplantAnnotation(BaseModel):
 
 
 class BaseIraeTask(tasks.BaseModelTaskWithSpans):
-    task_version = 3
+    task_version = 4
     # Task Version History:
+    # ** 4 (2025-10): Split into donor & longitudinal models **
     # ** 3 (2025-10): New serialized format **
     # ** 2 (2025-09): Updated prompt and pydantic models **
     # ** 1 (2025-08): Updated prompt **
@@ -419,37 +442,72 @@ class BaseIraeTask(tasks.BaseModelTaskWithSpans):
         "    BIOPSY_PROVEN > CONFIRMED > SUSPECTED > NONE_OF_THE_ABOVE.\n"
         "5. Always produce structured JSON that conforms to the Pydantic schema provided below.\n"
         "\n"
-        "Pydantic Schema:\n" + json.dumps(KidneyTransplantAnnotation.model_json_schema())
+        "Pydantic Schema:\n"
+        "%JSON-SCHEMA%"
     )
     user_prompt = (
         "Evaluate the following clinical document for kidney transplant variables and outcomes.\n"
         "Here is the clinical document for you to analyze:\n"
         "\n"
         "%CLINICAL-NOTE%"
     )
-    response_format = KidneyTransplantAnnotation
 
 
-class IraeGpt4oTask(BaseIraeTask):
+class IraeDonorGpt4oTask(BaseIraeTask):
+    name = "irae__nlp_donor_gpt4o"
+    client_class = nlp.Gpt4oModel
+    response_format = KidneyTransplantDonorGroupAnnotation
+
+
+class IraeLongitudinalGpt4oTask(BaseIraeTask):
     name = "irae__nlp_gpt4o"
     client_class = nlp.Gpt4oModel
+    response_format = KidneyTransplantLongitudinalAnnotation
 
 
-class IraeGpt5Task(BaseIraeTask):
+class IraeDonorGpt5Task(BaseIraeTask):
+    name = "irae__nlp_donor_gpt5"
+    client_class = nlp.Gpt5Model
+    response_format = KidneyTransplantDonorGroupAnnotation
+
+
+class IraeLongitudinalGpt5Task(BaseIraeTask):
     name = "irae__nlp_gpt5"
     client_class = nlp.Gpt5Model
+    response_format = KidneyTransplantLongitudinalAnnotation
+
+
+class IraeDonorGptOss120bTask(BaseIraeTask):
+    name = "irae__nlp_donor_gpt_oss_120b"
+    client_class = nlp.GptOss120bModel
+    response_format = KidneyTransplantDonorGroupAnnotation
 
 
-class IraeGptOss120bTask(BaseIraeTask):
+class IraeLongitudinalGptOss120bTask(BaseIraeTask):
     name = "irae__nlp_gpt_oss_120b"
     client_class = nlp.GptOss120bModel
+    response_format = KidneyTransplantLongitudinalAnnotation
+
+
+class IraeDonorLlama4ScoutTask(BaseIraeTask):
+    name = "irae__nlp_donor_llama4_scout"
+    client_class = nlp.Llama4ScoutModel
+    response_format = KidneyTransplantDonorGroupAnnotation
 
 
-class IraeLlama4ScoutTask(BaseIraeTask):
+class IraeLongitudinalLlama4ScoutTask(BaseIraeTask):
     name = "irae__nlp_llama4_scout"
     client_class = nlp.Llama4ScoutModel
+    response_format = KidneyTransplantLongitudinalAnnotation
+
+
+class IraeDonorClaudeSonnet45Task(BaseIraeTask):
+    name = "irae__nlp_donor_claude_sonnet45"
+    client_class = nlp.ClaudeSonnet45Model
+    response_format = KidneyTransplantDonorGroupAnnotation
 
 
-class IraeClaudeSonnet45Task(BaseIraeTask):
+class IraeLongitudinalClaudeSonnet45Task(BaseIraeTask):
     name = "irae__nlp_claude_sonnet45"
     client_class = nlp.ClaudeSonnet45Model
+    response_format = KidneyTransplantLongitudinalAnnotation
diff --git a/cumulus_etl/etl/tasks/nlp_task.py b/cumulus_etl/etl/tasks/nlp_task.py
@@ -1,6 +1,7 @@
 """Base NLP task support"""
 
 import copy
+import json
 import logging
 import os
 import re
@@ -126,10 +127,10 @@ class BaseModelTask(BaseNlpTask):
     outputs: ClassVar = [tasks.OutputTable(resource_type=None, uniqueness_fields={"note_ref"})]
 
     # If you change these prompts, consider updating task_version.
-    system_prompt: ClassVar = None
-    user_prompt: ClassVar = None
-    client_class: ClassVar = None
-    response_format: ClassVar = None
+    system_prompt: str = None
+    user_prompt: str = None
+    client_class: type[nlp.Model] = None
+    response_format: type[pydantic.BaseModel] = None
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -153,7 +154,7 @@ async def read_entries(self, *, progress: rich.progress.Progress = None) -> task
 
             try:
                 response = await self.model.prompt(
-                    self.system_prompt,
+                    self.get_system_prompt(),
                     self.get_user_prompt(note_text),
                     schema=self.response_format,
                     cache_dir=self.task_config.dir_phi,
@@ -206,6 +207,12 @@ def finish_task(self) -> None:
 
         rich.get_console().print(table)
 
+    @classmethod
+    def get_system_prompt(cls) -> str | None:
+        """
+        Returns the system prompt with the %JSON-SCHEMA% placeholder filled in.
+
+        The placeholder is replaced by the JSON-serialized schema of `response_format`.
+        A prompt without the placeholder (or a task with no system prompt at all) is
+        returned as-is, so the schema is only generated when it is actually needed.
+        """
+        prompt = cls.system_prompt
+        # system_prompt defaults to None on this class; hand that back untouched (which is
+        # what read_entries passed to the model before this helper existed) rather than
+        # failing on None.replace().
+        if not prompt or "%JSON-SCHEMA%" not in prompt:
+            return prompt
+        schema = json.dumps(cls.response_format.model_json_schema())
+        return prompt.replace("%JSON-SCHEMA%", schema)
+
     @classmethod
     def get_user_prompt(cls, note_text: str) -> str:
         prompt = cls.user_prompt or "%CLINICAL-NOTE%"
diff --git a/cumulus_etl/etl/tasks/task_factory.py b/cumulus_etl/etl/tasks/task_factory.py
@@ -1,5 +1,6 @@
 """Finds and creates ETL tasks"""
 
+import inspect
 import sys
 from collections.abc import Iterable
 from typing import TypeVar
@@ -25,22 +26,15 @@ def get_all_tasks() -> list[type[AnyTask]]:
     ]
 
 
+def get_classes_from_module(module) -> list[type[AnyTask]]:
+    """
+    Collects every class found in a module's namespace.
+
+    Classes come back in the order inspect.getmembers() reports them (sorted by attribute
+    name). No filtering beyond "is a class" is done, so anything class-like that the module
+    exposes - including classes it merely imports - will be part of the result.
+    """
+    members = inspect.getmembers(module, inspect.isclass)
+    return [klass for _name, klass in members]
+
+
 def get_nlp_tasks() -> list[type[AnyTask]]:
     return [
-        covid_symptom.CovidSymptomNlpResultsGpt35Task,
-        covid_symptom.CovidSymptomNlpResultsGpt4Task,
-        covid_symptom.CovidSymptomNlpResultsTask,
-        covid_symptom.CovidSymptomNlpResultsTermExistsTask,
-        example.ExampleGpt4Task,
-        example.ExampleGpt4oTask,
-        example.ExampleGpt5Task,
-        example.ExampleGptOss120bTask,
-        example.ExampleLlama4ScoutTask,
-        irae.IraeClaudeSonnet45Task,
-        irae.IraeGptOss120bTask,
-        irae.IraeGpt4oTask,
-        irae.IraeGpt5Task,
-        irae.IraeLlama4ScoutTask,
+        *get_classes_from_module(covid_symptom),
+        *get_classes_from_module(example),
+        *get_classes_from_module(irae),
     ]
 
 
diff --git a/cumulus_etl/nlp/__init__.py b/cumulus_etl/nlp/__init__.py
@@ -9,6 +9,7 @@
     Gpt35Model,
     GptOss120bModel,
     Llama4ScoutModel,
+    Model,
     TokenStats,
     set_nlp_provider,
 )
diff --git a/docs/setup/cumulus-aws-template.yaml b/docs/setup/cumulus-aws-template.yaml
@@ -209,6 +209,15 @@ Resources:
               - !Sub "s3://${S3Bucket}/${EtlSubdir}/covid_symptom__nlp_results_gpt35"
               - !Sub "s3://${S3Bucket}/${EtlSubdir}/covid_symptom__nlp_results_gpt4"
               - !Sub "s3://${S3Bucket}/${EtlSubdir}/covid_symptom__nlp_results_term_exists"
+            CreateNativeDeltaTable: True
+            WriteManifest: False
+          - DeltaTables:
+              - !Sub "s3://${S3Bucket}/${EtlSubdir}/irae__nlp_claude_sonnet45"
+              - !Sub "s3://${S3Bucket}/${EtlSubdir}/irae__nlp_donor_claude_sonnet45"
+              - !Sub "s3://${S3Bucket}/${EtlSubdir}/irae__nlp_donor_gpt4o"
+              - !Sub "s3://${S3Bucket}/${EtlSubdir}/irae__nlp_donor_gpt5"
+              - !Sub "s3://${S3Bucket}/${EtlSubdir}/irae__nlp_donor_gpt_oss_120b"
+              - !Sub "s3://${S3Bucket}/${EtlSubdir}/irae__nlp_donor_llama4_scout"
               - !Sub "s3://${S3Bucket}/${EtlSubdir}/irae__nlp_gpt4o"
               - !Sub "s3://${S3Bucket}/${EtlSubdir}/irae__nlp_gpt5"
               - !Sub "s3://${S3Bucket}/${EtlSubdir}/irae__nlp_gpt_oss_120b"
diff --git a/tests/data/irae/donor-output.ndjson b/tests/data/irae/donor-output.ndjson
@@ -0,0 +1 @@
+{"note_ref": "DocumentReference/c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d", "encounter_ref": "Encounter/b3d0707624491d8b71a808bd20b63625981af48f526b95214146de2a15f7dd43", "subject_ref": "Patient/00680c7c0e2e1712e9c4a01eb5c6dfb8949871faef6337c5db204d19e1d9ca58", "generated_on": "2021-09-14T21:23:45+00:00", "task_version": 4, "system_fingerprint": "test-fp", "result": {"donor_transplant_date_mention": {"has_mention": false, "spans": []}, "donor_type_mention": {"has_mention": false, "spans": [], "donor_type": "Donor was not mentioned as living or deceased"}, "donor_relationship_mention": {"has_mention": false, "spans": [], "donor_relationship": "Donor relationship status was not mentioned"}, "donor_hla_match_quality_mention": {"has_mention": false, "spans": [], "donor_hla_match_quality": "HLA match quality not mentioned"}, "donor_hla_mismatch_count_mention": {"has_mention": false, "spans": [], "donor_hla_mismatch_count": "HLA mismatch count not mentioned"}}}
diff --git a/tests/data/irae/longitudinal-output.ndjson b/tests/data/irae/longitudinal-output.ndjson
@@ -0,0 +1 @@
+{"note_ref": "DocumentReference/c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d", "encounter_ref": "Encounter/b3d0707624491d8b71a808bd20b63625981af48f526b95214146de2a15f7dd43", "subject_ref": "Patient/00680c7c0e2e1712e9c4a01eb5c6dfb8949871faef6337c5db204d19e1d9ca58", "generated_on": "2021-09-14T21:23:45+00:00", "task_version": 4, "system_fingerprint": "test-fp", "result": {"rx_therapeutic_status_mention": {"has_mention": false, "spans": [], "rx_therapeutic_status": "None of the above"}, "rx_compliance_mention": {"has_mention": false, "spans": [], "rx_compliance": "None of the above"}, "dsa_mention": {"has_mention": false, "spans": [], "dsa_history": false, "dsa": "None of the above"}, "infection_mention": {"has_mention": false, "spans": [], "infection_history": false, "infection": "None of the above"}, "viral_infection_mention": {"has_mention": false, "spans": [], "viral_infection_history": false, "viral_infection": "None of the above"}, "bacterial_infection_mention": {"has_mention": false, "spans": [], "bacterial_infection_history": false, "bacterial_infection": "None of the above"}, "fungal_infection_mention": {"has_mention": false, "spans": [], "fungal_infection_history": false, "fungal_infection": "None of the above"}, "graft_rejection_mention": {"has_mention": false, "spans": [], "graft_rejection_history": false, "graft_rejection": "None of the above"}, "graft_failure_mention": {"has_mention": false, "spans": [], "graft_failure_history": false, "graft_failure": "None of the above"}, "ptld_mention": {"has_mention": false, "spans": [], "ptld_history": false, "ptld": "None of the above"}, "cancer_mention": {"has_mention": false, "spans": [], "cancer_history": false, "cancer": "None of the above"}, "deceased_mention": {"has_mention": true, "spans": [[5, 9]], "deceased": true, "deceased_date": "2025-10-10"}}}
diff --git a/tests/data/irae/output.ndjson b/tests/data/irae/output.ndjson
diff --git a/tests/nlp/test_irae.py b/tests/nlp/test_irae.py
diff --git a/tests/nlp/test_models.py b/tests/nlp/test_models.py

Original file line number	Diff line number	Diff line change
`@@ -9,6 +9,7 @@`
`9`	`9`	`Gpt35Model,`
`10`	`10`	`GptOss120bModel,`
`11`	`11`	`Llama4ScoutModel,`
	`12`	`+ Model,`
`12`	`13`	`TokenStats,`
`13`	`14`	`set_nlp_provider,`
`14`	`15`	`)`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+{"note_ref": "DocumentReference/c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d", "encounter_ref": "Encounter/b3d0707624491d8b71a808bd20b63625981af48f526b95214146de2a15f7dd43", "subject_ref": "Patient/00680c7c0e2e1712e9c4a01eb5c6dfb8949871faef6337c5db204d19e1d9ca58", "generated_on": "2021-09-14T21:23:45+00:00", "task_version": 4, "system_fingerprint": "test-fp", "result": {"donor_transplant_date_mention": {"has_mention": false, "spans": []}, "donor_type_mention": {"has_mention": false, "spans": [], "donor_type": "Donor was not mentioned as living or deceased"}, "donor_relationship_mention": {"has_mention": false, "spans": [], "donor_relationship": "Donor relationship status was not mentioned"}, "donor_hla_match_quality_mention": {"has_mention": false, "spans": [], "donor_hla_match_quality": "HLA match quality not mentioned"}, "donor_hla_mismatch_count_mention": {"has_mention": false, "spans": [], "donor_hla_mismatch_count": "HLA mismatch count not mentioned"}}}