Skip to content

Commit bd62e8a

Browse files
authored
Merge pull request #465 from smart-on-fhir/mikix/irae-split
irae: split one task into two
2 parents 61c9fbd + 0e613c8 commit bd62e8a

File tree

11 files changed

+204
-79
lines changed

11 files changed

+204
-79
lines changed
Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
11
"""The irae study"""
22

3-
from .irae_tasks import IraeClaudeSonnet45Task as IraeClaudeSonnet45Task
4-
from .irae_tasks import IraeGpt4oTask as IraeGpt4oTask
5-
from .irae_tasks import IraeGpt5Task as IraeGpt5Task
6-
from .irae_tasks import IraeGptOss120bTask as IraeGptOss120bTask
7-
from .irae_tasks import IraeLlama4ScoutTask as IraeLlama4ScoutTask
3+
from .irae_tasks import IraeDonorClaudeSonnet45Task as IraeDonorClaudeSonnet45Task
4+
from .irae_tasks import IraeDonorGpt4oTask as IraeDonorGpt4oTask
5+
from .irae_tasks import IraeDonorGpt5Task as IraeDonorGpt5Task
6+
from .irae_tasks import IraeDonorGptOss120bTask as IraeDonorGptOss120bTask
7+
from .irae_tasks import IraeDonorLlama4ScoutTask as IraeDonorLlama4ScoutTask
8+
from .irae_tasks import IraeLongitudinalClaudeSonnet45Task as IraeLongitudinalClaudeSonnet45Task
9+
from .irae_tasks import IraeLongitudinalGpt4oTask as IraeLongitudinalGpt4oTask
10+
from .irae_tasks import IraeLongitudinalGpt5Task as IraeLongitudinalGpt5Task
11+
from .irae_tasks import IraeLongitudinalGptOss120bTask as IraeLongitudinalGptOss120bTask
12+
from .irae_tasks import IraeLongitudinalLlama4ScoutTask as IraeLongitudinalLlama4ScoutTask

cumulus_etl/etl/studies/irae/irae_tasks.py

Lines changed: 74 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
"""Define tasks for the irae study"""
22

3-
import json
43
from enum import StrEnum
54

65
from pydantic import BaseModel, Field
@@ -10,7 +9,7 @@
109

1110

1211
class SpanAugmentedMention(BaseModel):
13-
has_mention: bool | None # True, False, or None
12+
has_mention: bool # True, False
1413
spans: list[str]
1514

1615

@@ -23,7 +22,10 @@ class SpanAugmentedMention(BaseModel):
2322

2423
# Dates are treated as strings - no enum needed
2524
class DonorTransplantDateMention(SpanAugmentedMention):
26-
donor_transplant_date: str | None = Field(None, description="Date of renal transplant")
25+
donor_transplant_date: str | None = Field(
26+
None,
27+
description="Exact date of renal transplant; use YYYY-MM-DD format in your response. Only highlight date mentions with an explicit day, month, and year (e.g. 2020-01-15). All other date mentions, or an absence of a date mention, should be indicated with None.",
28+
)
2729

2830

2931
class DonorType(StrEnum):
@@ -51,9 +53,11 @@ class DonorRelationshipMention(SpanAugmentedMention):
5153

5254

5355
class DonorHlaMatchQuality(StrEnum):
54-
WELL = "Well matched (0-1 mismatches)"
55-
MODERATE = "Moderately matched (2-4 mismatches)"
56-
POOR = "Poorly matched (5-6 mismatches)"
56+
WELL = "Well matched (0-1 mismatches) OR recipient explicitly documented as not sensitized"
57+
MODERATE = (
58+
"Moderately matched (2-4 mismatches) OR recipient explicitly documented as sensitized"
59+
)
60+
POOR = "Poorly matched (5-6 mismatches) OR recipient explicitly documented as highly sensitized"
5761
NOT_MENTIONED = "HLA match quality not mentioned"
5862

5963

@@ -354,7 +358,7 @@ class DeceasedMention(SpanAugmentedMention):
354358
deceased_date: str | None = Field(
355359
None,
356360
description=(
357-
"If the patient is deceased, include the date the patient became deceased. "
361+
"If the patient is deceased, include the date the patient became deceased. Use YYYY-MM-DD format if possible. "
358362
"Use None if there is no date recorded or if the patient is not observed as deceased."
359363
),
360364
)
@@ -365,7 +369,7 @@ class DeceasedMention(SpanAugmentedMention):
365369
###############################################################################
366370

367371

368-
class KidneyTransplantAnnotation(BaseModel):
372+
class KidneyTransplantDonorGroupAnnotation(BaseModel):
369373
"""
370374
An object-model for annotations of immune related adverse event (IRAE)
371375
observations found in a patient's chart, relating specifically to kidney
@@ -381,6 +385,24 @@ class KidneyTransplantAnnotation(BaseModel):
381385
donor_relationship_mention: DonorRelationshipMention
382386
donor_hla_match_quality_mention: DonorHlaMatchQualityMention
383387
donor_hla_mismatch_count_mention: DonorHlaMismatchCountMention
388+
389+
390+
class KidneyTransplantLongitudinalAnnotation(BaseModel):
391+
"""
392+
An object-model for annotations of immune related adverse event (IRAE)
393+
observations found in a patient's chart, relating specifically to kidney
394+
transplants.
395+
396+
This class only includes longitudinally variable mentions, i.e. those
397+
that can change over time, such as therapeutic status, compliance, infections,
398+
graft rejection/failure, DSA, PTLD, cancer, and deceased status.
399+
400+
Take care to avoid false positives, like confusing information that only
401+
appears in family history for patient history. Annotations should indicate
402+
the relevant details of the finding, as well as some additional evidence
403+
metadata to validate findings post-hoc.
404+
"""
405+
384406
rx_therapeutic_status_mention: RxTherapeuticStatusMention
385407
rx_compliance_mention: RxComplianceMention
386408
dsa_mention: DSAMention
@@ -396,8 +418,9 @@ class KidneyTransplantAnnotation(BaseModel):
396418

397419

398420
class BaseIraeTask(tasks.BaseModelTaskWithSpans):
399-
task_version = 3
421+
task_version = 4
400422
# Task Version History:
423+
# ** 4 (2025-10): Split into donor & longitudinal models **
401424
# ** 3 (2025-10): New serialized format **
402425
# ** 2 (2025-09): Updated prompt and pydantic models **
403426
# ** 1 (2025-08): Updated prompt **
@@ -419,37 +442,72 @@ class BaseIraeTask(tasks.BaseModelTaskWithSpans):
419442
" BIOPSY_PROVEN > CONFIRMED > SUSPECTED > NONE_OF_THE_ABOVE.\n"
420443
"5. Always produce structured JSON that conforms to the Pydantic schema provided below.\n"
421444
"\n"
422-
"Pydantic Schema:\n" + json.dumps(KidneyTransplantAnnotation.model_json_schema())
445+
"Pydantic Schema:\n"
446+
"%JSON-SCHEMA%"
423447
)
424448
user_prompt = (
425449
"Evaluate the following clinical document for kidney transplant variables and outcomes.\n"
426450
"Here is the clinical document for you to analyze:\n"
427451
"\n"
428452
"%CLINICAL-NOTE%"
429453
)
430-
response_format = KidneyTransplantAnnotation
431454

432455

433-
class IraeGpt4oTask(BaseIraeTask):
456+
class IraeDonorGpt4oTask(BaseIraeTask):
457+
name = "irae__nlp_donor_gpt4o"
458+
client_class = nlp.Gpt4oModel
459+
response_format = KidneyTransplantDonorGroupAnnotation
460+
461+
462+
class IraeLongitudinalGpt4oTask(BaseIraeTask):
434463
name = "irae__nlp_gpt4o"
435464
client_class = nlp.Gpt4oModel
465+
response_format = KidneyTransplantLongitudinalAnnotation
436466

437467

438-
class IraeGpt5Task(BaseIraeTask):
468+
class IraeDonorGpt5Task(BaseIraeTask):
469+
name = "irae__nlp_donor_gpt5"
470+
client_class = nlp.Gpt5Model
471+
response_format = KidneyTransplantDonorGroupAnnotation
472+
473+
474+
class IraeLongitudinalGpt5Task(BaseIraeTask):
439475
name = "irae__nlp_gpt5"
440476
client_class = nlp.Gpt5Model
477+
response_format = KidneyTransplantLongitudinalAnnotation
478+
479+
480+
class IraeDonorGptOss120bTask(BaseIraeTask):
481+
name = "irae__nlp_donor_gpt_oss_120b"
482+
client_class = nlp.GptOss120bModel
483+
response_format = KidneyTransplantDonorGroupAnnotation
441484

442485

443-
class IraeGptOss120bTask(BaseIraeTask):
486+
class IraeLongitudinalGptOss120bTask(BaseIraeTask):
444487
name = "irae__nlp_gpt_oss_120b"
445488
client_class = nlp.GptOss120bModel
489+
response_format = KidneyTransplantLongitudinalAnnotation
490+
491+
492+
class IraeDonorLlama4ScoutTask(BaseIraeTask):
493+
name = "irae__nlp_donor_llama4_scout"
494+
client_class = nlp.Llama4ScoutModel
495+
response_format = KidneyTransplantDonorGroupAnnotation
446496

447497

448-
class IraeLlama4ScoutTask(BaseIraeTask):
498+
class IraeLongitudinalLlama4ScoutTask(BaseIraeTask):
449499
name = "irae__nlp_llama4_scout"
450500
client_class = nlp.Llama4ScoutModel
501+
response_format = KidneyTransplantLongitudinalAnnotation
502+
503+
504+
class IraeDonorClaudeSonnet45Task(BaseIraeTask):
505+
name = "irae__nlp_donor_claude_sonnet45"
506+
client_class = nlp.ClaudeSonnet45Model
507+
response_format = KidneyTransplantDonorGroupAnnotation
451508

452509

453-
class IraeClaudeSonnet45Task(BaseIraeTask):
510+
class IraeLongitudinalClaudeSonnet45Task(BaseIraeTask):
454511
name = "irae__nlp_claude_sonnet45"
455512
client_class = nlp.ClaudeSonnet45Model
513+
response_format = KidneyTransplantLongitudinalAnnotation

cumulus_etl/etl/tasks/nlp_task.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Base NLP task support"""
22

33
import copy
4+
import json
45
import logging
56
import os
67
import re
@@ -126,10 +127,10 @@ class BaseModelTask(BaseNlpTask):
126127
outputs: ClassVar = [tasks.OutputTable(resource_type=None, uniqueness_fields={"note_ref"})]
127128

128129
# If you change these prompts, consider updating task_version.
129-
system_prompt: ClassVar = None
130-
user_prompt: ClassVar = None
131-
client_class: ClassVar = None
132-
response_format: ClassVar = None
130+
system_prompt: str = None
131+
user_prompt: str = None
132+
client_class: type[nlp.Model] = None
133+
response_format: type[pydantic.BaseModel] = None
133134

134135
def __init__(self, *args, **kwargs):
135136
super().__init__(*args, **kwargs)
@@ -153,7 +154,7 @@ async def read_entries(self, *, progress: rich.progress.Progress = None) -> task
153154

154155
try:
155156
response = await self.model.prompt(
156-
self.system_prompt,
157+
self.get_system_prompt(),
157158
self.get_user_prompt(note_text),
158159
schema=self.response_format,
159160
cache_dir=self.task_config.dir_phi,
@@ -206,6 +207,12 @@ def finish_task(self) -> None:
206207

207208
rich.get_console().print(table)
208209

210+
@classmethod
def get_system_prompt(cls) -> str:
    """Return the system prompt with the ``%JSON-SCHEMA%`` placeholder filled in.

    The placeholder is replaced by the JSON schema of ``cls.response_format``
    (as produced by ``model_json_schema()``), serialized with ``json.dumps``.

    Two cases are passed through untouched instead of raising:

    * ``cls.system_prompt`` is unset/empty -- the value is returned as-is, which
      mirrors how ``get_user_prompt`` tolerates a missing ``user_prompt``.
    * the prompt does not contain the placeholder -- the schema is never
      generated, so ``response_format`` is not required for such prompts.
    """
    prompt = cls.system_prompt
    if not prompt or "%JSON-SCHEMA%" not in prompt:
        return prompt

    # Only build the (potentially large) schema when it is actually needed.
    schema = json.dumps(cls.response_format.model_json_schema())
    return prompt.replace("%JSON-SCHEMA%", schema)
215+
209216
@classmethod
210217
def get_user_prompt(cls, note_text: str) -> str:
211218
prompt = cls.user_prompt or "%CLINICAL-NOTE%"

cumulus_etl/etl/tasks/task_factory.py

Lines changed: 8 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Finds and creates ETL tasks"""
22

3+
import inspect
34
import sys
45
from collections.abc import Iterable
56
from typing import TypeVar
@@ -25,22 +26,15 @@ def get_all_tasks() -> list[type[AnyTask]]:
2526
]
2627

2728

29+
def get_classes_from_module(module) -> list[type[AnyTask]]:
    """Return the classes exposed by *module* that were defined inside it.

    ``inspect.getmembers(module, inspect.isclass)`` yields every class bound in
    the module's namespace, which includes classes that were merely imported
    from unrelated packages. Those are filtered out by comparing each class's
    ``__module__`` against the module's own dotted name, so only classes defined
    in the module itself or in one of its submodules are returned.

    The result keeps ``inspect.getmembers`` ordering (sorted by attribute name).
    """
    package = module.__name__
    return [
        cls
        for _, cls in inspect.getmembers(module, inspect.isclass)
        if cls.__module__ == package or cls.__module__.startswith(f"{package}.")
    ]
31+
32+
2833
def get_nlp_tasks() -> list[type[AnyTask]]:
2934
return [
30-
covid_symptom.CovidSymptomNlpResultsGpt35Task,
31-
covid_symptom.CovidSymptomNlpResultsGpt4Task,
32-
covid_symptom.CovidSymptomNlpResultsTask,
33-
covid_symptom.CovidSymptomNlpResultsTermExistsTask,
34-
example.ExampleGpt4Task,
35-
example.ExampleGpt4oTask,
36-
example.ExampleGpt5Task,
37-
example.ExampleGptOss120bTask,
38-
example.ExampleLlama4ScoutTask,
39-
irae.IraeClaudeSonnet45Task,
40-
irae.IraeGptOss120bTask,
41-
irae.IraeGpt4oTask,
42-
irae.IraeGpt5Task,
43-
irae.IraeLlama4ScoutTask,
35+
*get_classes_from_module(covid_symptom),
36+
*get_classes_from_module(example),
37+
*get_classes_from_module(irae),
4438
]
4539

4640

cumulus_etl/nlp/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
Gpt35Model,
1010
GptOss120bModel,
1111
Llama4ScoutModel,
12+
Model,
1213
TokenStats,
1314
set_nlp_provider,
1415
)

docs/setup/cumulus-aws-template.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,15 @@ Resources:
209209
- !Sub "s3://${S3Bucket}/${EtlSubdir}/covid_symptom__nlp_results_gpt35"
210210
- !Sub "s3://${S3Bucket}/${EtlSubdir}/covid_symptom__nlp_results_gpt4"
211211
- !Sub "s3://${S3Bucket}/${EtlSubdir}/covid_symptom__nlp_results_term_exists"
212+
CreateNativeDeltaTable: True
213+
WriteManifest: False
214+
- DeltaTables:
215+
- !Sub "s3://${S3Bucket}/${EtlSubdir}/irae__nlp_claude_sonnet45"
216+
- !Sub "s3://${S3Bucket}/${EtlSubdir}/irae__nlp_donor_claude_sonnet45"
217+
- !Sub "s3://${S3Bucket}/${EtlSubdir}/irae__nlp_donor_gpt4o"
218+
- !Sub "s3://${S3Bucket}/${EtlSubdir}/irae__nlp_donor_gpt5"
219+
- !Sub "s3://${S3Bucket}/${EtlSubdir}/irae__nlp_donor_gpt_oss_120b"
220+
- !Sub "s3://${S3Bucket}/${EtlSubdir}/irae__nlp_donor_llama4_scout"
212221
- !Sub "s3://${S3Bucket}/${EtlSubdir}/irae__nlp_gpt4o"
213222
- !Sub "s3://${S3Bucket}/${EtlSubdir}/irae__nlp_gpt5"
214223
- !Sub "s3://${S3Bucket}/${EtlSubdir}/irae__nlp_gpt_oss_120b"
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"note_ref": "DocumentReference/c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d", "encounter_ref": "Encounter/b3d0707624491d8b71a808bd20b63625981af48f526b95214146de2a15f7dd43", "subject_ref": "Patient/00680c7c0e2e1712e9c4a01eb5c6dfb8949871faef6337c5db204d19e1d9ca58", "generated_on": "2021-09-14T21:23:45+00:00", "task_version": 4, "system_fingerprint": "test-fp", "result": {"donor_transplant_date_mention": {"has_mention": false, "spans": []}, "donor_type_mention": {"has_mention": false, "spans": [], "donor_type": "Donor was not mentioned as living or deceased"}, "donor_relationship_mention": {"has_mention": false, "spans": [], "donor_relationship": "Donor relationship status was not mentioned"}, "donor_hla_match_quality_mention": {"has_mention": false, "spans": [], "donor_hla_match_quality": "HLA match quality not mentioned"}, "donor_hla_mismatch_count_mention": {"has_mention": false, "spans": [], "donor_hla_mismatch_count": "HLA mismatch count not mentioned"}}}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"note_ref": "DocumentReference/c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d", "encounter_ref": "Encounter/b3d0707624491d8b71a808bd20b63625981af48f526b95214146de2a15f7dd43", "subject_ref": "Patient/00680c7c0e2e1712e9c4a01eb5c6dfb8949871faef6337c5db204d19e1d9ca58", "generated_on": "2021-09-14T21:23:45+00:00", "task_version": 4, "system_fingerprint": "test-fp", "result": {"rx_therapeutic_status_mention": {"has_mention": false, "spans": [], "rx_therapeutic_status": "None of the above"}, "rx_compliance_mention": {"has_mention": false, "spans": [], "rx_compliance": "None of the above"}, "dsa_mention": {"has_mention": false, "spans": [], "dsa_history": false, "dsa": "None of the above"}, "infection_mention": {"has_mention": false, "spans": [], "infection_history": false, "infection": "None of the above"}, "viral_infection_mention": {"has_mention": false, "spans": [], "viral_infection_history": false, "viral_infection": "None of the above"}, "bacterial_infection_mention": {"has_mention": false, "spans": [], "bacterial_infection_history": false, "bacterial_infection": "None of the above"}, "fungal_infection_mention": {"has_mention": false, "spans": [], "fungal_infection_history": false, "fungal_infection": "None of the above"}, "graft_rejection_mention": {"has_mention": false, "spans": [], "graft_rejection_history": false, "graft_rejection": "None of the above"}, "graft_failure_mention": {"has_mention": false, "spans": [], "graft_failure_history": false, "graft_failure": "None of the above"}, "ptld_mention": {"has_mention": false, "spans": [], "ptld_history": false, "ptld": "None of the above"}, "cancer_mention": {"has_mention": false, "spans": [], "cancer_history": false, "cancer": "None of the above"}, "deceased_mention": {"has_mention": true, "spans": [[5, 9]], "deceased": true, "deceased_date": "2025-10-10"}}}

tests/data/irae/output.ndjson

Lines changed: 0 additions & 1 deletion
This file was deleted.

0 commit comments

Comments
 (0)