Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions cumulus_etl/etl/studies/irae/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
"""The irae study"""

from .irae_tasks import IraeClaudeSonnet45Task as IraeClaudeSonnet45Task
from .irae_tasks import IraeGpt4oTask as IraeGpt4oTask
from .irae_tasks import IraeGpt5Task as IraeGpt5Task
from .irae_tasks import IraeGptOss120bTask as IraeGptOss120bTask
from .irae_tasks import IraeLlama4ScoutTask as IraeLlama4ScoutTask
from .irae_tasks import IraeDonorClaudeSonnet45Task as IraeDonorClaudeSonnet45Task
from .irae_tasks import IraeDonorGpt4oTask as IraeDonorGpt4oTask
from .irae_tasks import IraeDonorGpt5Task as IraeDonorGpt5Task
from .irae_tasks import IraeDonorGptOss120bTask as IraeDonorGptOss120bTask
from .irae_tasks import IraeDonorLlama4ScoutTask as IraeDonorLlama4ScoutTask
from .irae_tasks import IraeLongitudinalClaudeSonnet45Task as IraeLongitudinalClaudeSonnet45Task
from .irae_tasks import IraeLongitudinalGpt4oTask as IraeLongitudinalGpt4oTask
from .irae_tasks import IraeLongitudinalGpt5Task as IraeLongitudinalGpt5Task
from .irae_tasks import IraeLongitudinalGptOss120bTask as IraeLongitudinalGptOss120bTask
from .irae_tasks import IraeLongitudinalLlama4ScoutTask as IraeLongitudinalLlama4ScoutTask
90 changes: 74 additions & 16 deletions cumulus_etl/etl/studies/irae/irae_tasks.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""Define tasks for the irae study"""

import json
from enum import StrEnum

from pydantic import BaseModel, Field
Expand All @@ -10,7 +9,7 @@


class SpanAugmentedMention(BaseModel):
has_mention: bool | None # True, False, or None
has_mention: bool # True, False
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are also some light model changes here, grabbed from Dylan's latest.

spans: list[str]


Expand All @@ -23,7 +22,10 @@ class SpanAugmentedMention(BaseModel):

# Dates are treated as strings - no enum needed
class DonorTransplantDateMention(SpanAugmentedMention):
donor_transplant_date: str | None = Field(None, description="Date of renal transplant")
donor_transplant_date: str | None = Field(
None,
description="Exact date of renal transplant; use YYYY-MM-DD format in your response. Only highlight date mentions with an explicit day, month, and year (e.g. 2020-01-15). All other date mentions, or an absence of a date mention, should be indicated with None.",
)


class DonorType(StrEnum):
Expand Down Expand Up @@ -51,9 +53,11 @@ class DonorRelationshipMention(SpanAugmentedMention):


class DonorHlaMatchQuality(StrEnum):
WELL = "Well matched (0-1 mismatches)"
MODERATE = "Moderately matched (2-4 mismatches)"
POOR = "Poorly matched (5-6 mismatches)"
WELL = "Well matched (0-1 mismatches) OR recipient explicitly documented as not sensitized"
MODERATE = (
"Moderately matched (2-4 mismatches) OR recipient explicitly documented as sensitized"
)
POOR = "Poorly matched (5-6 mismatches) OR recipient explicitly documented as highly sensitized"
NOT_MENTIONED = "HLA match quality not mentioned"


Expand Down Expand Up @@ -354,7 +358,7 @@ class DeceasedMention(SpanAugmentedMention):
deceased_date: str | None = Field(
None,
description=(
"If the patient is deceased, include the date the patient became deceased. "
"If the patient is deceased, include the date the patient became deceased. Use YYYY-MM-DD format if possible. "
"Use None if there is no date recorded or if the patient is not observed as deceased."
),
)
Expand All @@ -365,7 +369,7 @@ class DeceasedMention(SpanAugmentedMention):
###############################################################################


class KidneyTransplantAnnotation(BaseModel):
class KidneyTransplantDonorGroupAnnotation(BaseModel):
"""
An object-model for annotations of immune related adverse event (IRAE)
observations found in a patient's chart, relating specifically to kidney
Expand All @@ -381,6 +385,24 @@ class KidneyTransplantAnnotation(BaseModel):
donor_relationship_mention: DonorRelationshipMention
donor_hla_match_quality_mention: DonorHlaMatchQualityMention
donor_hla_mismatch_count_mention: DonorHlaMismatchCountMention


class KidneyTransplantLongitudinalAnnotation(BaseModel):
"""
An object-model for annotations of immune related adverse event (IRAE)
observations found in a patient's chart, relating specifically to kidney
transplants.

This class only includes longitudinally variable mentions, i.e. those
that can change over time, such as therapeutic status, compliance, infections,
graft rejection/failure, DSA, PTLD, cancer, and deceased status.

Take care to avoid false positives, like confusing information that only
appears in family history for patient history. Annotations should indicate
the relevant details of the finding, as well as some additional evidence
metadata to validate findings post-hoc.
"""

rx_therapeutic_status_mention: RxTherapeuticStatusMention
rx_compliance_mention: RxComplianceMention
dsa_mention: DSAMention
Expand All @@ -396,8 +418,9 @@ class KidneyTransplantAnnotation(BaseModel):


class BaseIraeTask(tasks.BaseModelTaskWithSpans):
task_version = 3
task_version = 4
# Task Version History:
# ** 4 (2025-10): Split into donor & longitudinal models **
# ** 3 (2025-10): New serialized format **
# ** 2 (2025-09): Updated prompt and pydantic models **
# ** 1 (2025-08): Updated prompt **
Expand All @@ -419,37 +442,72 @@ class BaseIraeTask(tasks.BaseModelTaskWithSpans):
" BIOPSY_PROVEN > CONFIRMED > SUSPECTED > NONE_OF_THE_ABOVE.\n"
"5. Always produce structured JSON that conforms to the Pydantic schema provided below.\n"
"\n"
"Pydantic Schema:\n" + json.dumps(KidneyTransplantAnnotation.model_json_schema())
"Pydantic Schema:\n"
"%JSON-SCHEMA%"
)
user_prompt = (
"Evaluate the following clinical document for kidney transplant variables and outcomes.\n"
"Here is the clinical document for you to analyze:\n"
"\n"
"%CLINICAL-NOTE%"
)
response_format = KidneyTransplantAnnotation


class IraeGpt4oTask(BaseIraeTask):
class IraeDonorGpt4oTask(BaseIraeTask):
name = "irae__nlp_donor_gpt4o"
client_class = nlp.Gpt4oModel
response_format = KidneyTransplantDonorGroupAnnotation


class IraeLongitudinalGpt4oTask(BaseIraeTask):
name = "irae__nlp_gpt4o"
client_class = nlp.Gpt4oModel
response_format = KidneyTransplantLongitudinalAnnotation


class IraeGpt5Task(BaseIraeTask):
class IraeDonorGpt5Task(BaseIraeTask):
name = "irae__nlp_donor_gpt5"
client_class = nlp.Gpt5Model
response_format = KidneyTransplantDonorGroupAnnotation


class IraeLongitudinalGpt5Task(BaseIraeTask):
name = "irae__nlp_gpt5"
client_class = nlp.Gpt5Model
response_format = KidneyTransplantLongitudinalAnnotation


class IraeDonorGptOss120bTask(BaseIraeTask):
name = "irae__nlp_donor_gpt_oss_120b"
client_class = nlp.GptOss120bModel
response_format = KidneyTransplantDonorGroupAnnotation


class IraeGptOss120bTask(BaseIraeTask):
class IraeLongitudinalGptOss120bTask(BaseIraeTask):
name = "irae__nlp_gpt_oss_120b"
client_class = nlp.GptOss120bModel
response_format = KidneyTransplantLongitudinalAnnotation


class IraeDonorLlama4ScoutTask(BaseIraeTask):
name = "irae__nlp_donor_llama4_scout"
client_class = nlp.Llama4ScoutModel
response_format = KidneyTransplantDonorGroupAnnotation


class IraeLlama4ScoutTask(BaseIraeTask):
class IraeLongitudinalLlama4ScoutTask(BaseIraeTask):
name = "irae__nlp_llama4_scout"
client_class = nlp.Llama4ScoutModel
response_format = KidneyTransplantLongitudinalAnnotation


class IraeDonorClaudeSonnet45Task(BaseIraeTask):
name = "irae__nlp_donor_claude_sonnet45"
client_class = nlp.ClaudeSonnet45Model
response_format = KidneyTransplantDonorGroupAnnotation


class IraeClaudeSonnet45Task(BaseIraeTask):
class IraeLongitudinalClaudeSonnet45Task(BaseIraeTask):
name = "irae__nlp_claude_sonnet45"
client_class = nlp.ClaudeSonnet45Model
response_format = KidneyTransplantLongitudinalAnnotation
17 changes: 12 additions & 5 deletions cumulus_etl/etl/tasks/nlp_task.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Base NLP task support"""

import copy
import json
import logging
import os
import re
Expand Down Expand Up @@ -126,10 +127,10 @@ class BaseModelTask(BaseNlpTask):
outputs: ClassVar = [tasks.OutputTable(resource_type=None, uniqueness_fields={"note_ref"})]

# If you change these prompts, consider updating task_version.
system_prompt: ClassVar = None
user_prompt: ClassVar = None
client_class: ClassVar = None
response_format: ClassVar = None
system_prompt: str = None
user_prompt: str = None
client_class: type[nlp.Model] = None
response_format: type[pydantic.BaseModel] = None

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand All @@ -153,7 +154,7 @@ async def read_entries(self, *, progress: rich.progress.Progress = None) -> task

try:
response = await self.model.prompt(
self.system_prompt,
self.get_system_prompt(),
self.get_user_prompt(note_text),
schema=self.response_format,
cache_dir=self.task_config.dir_phi,
Expand Down Expand Up @@ -206,6 +207,12 @@ def finish_task(self) -> None:

rich.get_console().print(table)

@classmethod
def get_system_prompt(cls) -> str:
    """Return the system prompt with the JSON schema placeholder filled in.

    Any ``%JSON-SCHEMA%`` marker in ``cls.system_prompt`` is replaced with the
    JSON-serialized schema of ``cls.response_format``.

    A task that leaves ``system_prompt`` unset (the class default is ``None``)
    gets an empty string back instead of an ``AttributeError``, mirroring the
    ``or`` fallback used by ``get_user_prompt``.
    """
    prompt = cls.system_prompt or ""
    # Only build the schema when the prompt actually asks for it, so tasks
    # without a response_format (or without the placeholder) still work.
    if "%JSON-SCHEMA%" not in prompt:
        return prompt
    schema_text = json.dumps(cls.response_format.model_json_schema())
    return prompt.replace("%JSON-SCHEMA%", schema_text)

@classmethod
def get_user_prompt(cls, note_text: str) -> str:
prompt = cls.user_prompt or "%CLINICAL-NOTE%"
Expand Down
22 changes: 8 additions & 14 deletions cumulus_etl/etl/tasks/task_factory.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Finds and creates ETL tasks"""

import inspect
import sys
from collections.abc import Iterable
from typing import TypeVar
Expand All @@ -25,22 +26,15 @@ def get_all_tasks() -> list[type[AnyTask]]:
]


def get_classes_from_module(module) -> list[type[AnyTask]]:
    """Return every class object found in the namespace of *module*.

    Members are discovered with ``inspect.getmembers``, so the result is
    ordered by attribute name. No filtering beyond ``inspect.isclass`` is
    applied: any class exposed by the module is returned.
    """
    members = inspect.getmembers(module, inspect.isclass)
    return [klass for _name, klass in members]


def get_nlp_tasks() -> list[type[AnyTask]]:
return [
covid_symptom.CovidSymptomNlpResultsGpt35Task,
covid_symptom.CovidSymptomNlpResultsGpt4Task,
covid_symptom.CovidSymptomNlpResultsTask,
covid_symptom.CovidSymptomNlpResultsTermExistsTask,
example.ExampleGpt4Task,
example.ExampleGpt4oTask,
example.ExampleGpt5Task,
example.ExampleGptOss120bTask,
example.ExampleLlama4ScoutTask,
irae.IraeClaudeSonnet45Task,
irae.IraeGptOss120bTask,
irae.IraeGpt4oTask,
irae.IraeGpt5Task,
irae.IraeLlama4ScoutTask,
*get_classes_from_module(covid_symptom),
*get_classes_from_module(example),
*get_classes_from_module(irae),
]


Expand Down
1 change: 1 addition & 0 deletions cumulus_etl/nlp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
Gpt35Model,
GptOss120bModel,
Llama4ScoutModel,
Model,
TokenStats,
set_nlp_provider,
)
Expand Down
9 changes: 9 additions & 0 deletions docs/setup/cumulus-aws-template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,15 @@ Resources:
- !Sub "s3://${S3Bucket}/${EtlSubdir}/covid_symptom__nlp_results_gpt35"
- !Sub "s3://${S3Bucket}/${EtlSubdir}/covid_symptom__nlp_results_gpt4"
- !Sub "s3://${S3Bucket}/${EtlSubdir}/covid_symptom__nlp_results_term_exists"
CreateNativeDeltaTable: True
WriteManifest: False
- DeltaTables:
- !Sub "s3://${S3Bucket}/${EtlSubdir}/irae__nlp_claude_sonnet45"
- !Sub "s3://${S3Bucket}/${EtlSubdir}/irae__nlp_donor_claude_sonnet45"
- !Sub "s3://${S3Bucket}/${EtlSubdir}/irae__nlp_donor_gpt4o"
- !Sub "s3://${S3Bucket}/${EtlSubdir}/irae__nlp_donor_gpt5"
- !Sub "s3://${S3Bucket}/${EtlSubdir}/irae__nlp_donor_gpt_oss_120b"
- !Sub "s3://${S3Bucket}/${EtlSubdir}/irae__nlp_donor_llama4_scout"
- !Sub "s3://${S3Bucket}/${EtlSubdir}/irae__nlp_gpt4o"
- !Sub "s3://${S3Bucket}/${EtlSubdir}/irae__nlp_gpt5"
- !Sub "s3://${S3Bucket}/${EtlSubdir}/irae__nlp_gpt_oss_120b"
Expand Down
1 change: 1 addition & 0 deletions tests/data/irae/donor-output.ndjson
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"note_ref": "DocumentReference/c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d", "encounter_ref": "Encounter/b3d0707624491d8b71a808bd20b63625981af48f526b95214146de2a15f7dd43", "subject_ref": "Patient/00680c7c0e2e1712e9c4a01eb5c6dfb8949871faef6337c5db204d19e1d9ca58", "generated_on": "2021-09-14T21:23:45+00:00", "task_version": 4, "system_fingerprint": "test-fp", "result": {"donor_transplant_date_mention": {"has_mention": false, "spans": []}, "donor_type_mention": {"has_mention": false, "spans": [], "donor_type": "Donor was not mentioned as living or deceased"}, "donor_relationship_mention": {"has_mention": false, "spans": [], "donor_relationship": "Donor relationship status was not mentioned"}, "donor_hla_match_quality_mention": {"has_mention": false, "spans": [], "donor_hla_match_quality": "HLA match quality not mentioned"}, "donor_hla_mismatch_count_mention": {"has_mention": false, "spans": [], "donor_hla_mismatch_count": "HLA mismatch count not mentioned"}}}
1 change: 1 addition & 0 deletions tests/data/irae/longitudinal-output.ndjson
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"note_ref": "DocumentReference/c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d", "encounter_ref": "Encounter/b3d0707624491d8b71a808bd20b63625981af48f526b95214146de2a15f7dd43", "subject_ref": "Patient/00680c7c0e2e1712e9c4a01eb5c6dfb8949871faef6337c5db204d19e1d9ca58", "generated_on": "2021-09-14T21:23:45+00:00", "task_version": 4, "system_fingerprint": "test-fp", "result": {"rx_therapeutic_status_mention": {"has_mention": false, "spans": [], "rx_therapeutic_status": "None of the above"}, "rx_compliance_mention": {"has_mention": false, "spans": [], "rx_compliance": "None of the above"}, "dsa_mention": {"has_mention": false, "spans": [], "dsa_history": false, "dsa": "None of the above"}, "infection_mention": {"has_mention": false, "spans": [], "infection_history": false, "infection": "None of the above"}, "viral_infection_mention": {"has_mention": false, "spans": [], "viral_infection_history": false, "viral_infection": "None of the above"}, "bacterial_infection_mention": {"has_mention": false, "spans": [], "bacterial_infection_history": false, "bacterial_infection": "None of the above"}, "fungal_infection_mention": {"has_mention": false, "spans": [], "fungal_infection_history": false, "fungal_infection": "None of the above"}, "graft_rejection_mention": {"has_mention": false, "spans": [], "graft_rejection_history": false, "graft_rejection": "None of the above"}, "graft_failure_mention": {"has_mention": false, "spans": [], "graft_failure_history": false, "graft_failure": "None of the above"}, "ptld_mention": {"has_mention": false, "spans": [], "ptld_history": false, "ptld": "None of the above"}, "cancer_mention": {"has_mention": false, "spans": [], "cancer_history": false, "cancer": "None of the above"}, "deceased_mention": {"has_mention": true, "spans": [[5, 9]], "deceased": true, "deceased_date": "2025-10-10"}}}
1 change: 0 additions & 1 deletion tests/data/irae/output.ndjson

This file was deleted.

Loading