Commit 54edb86

feat: add example__* tasks
These are bare-bones prompts that can run on the sample data shipped with the ETL. They are meant to be used when following the docs that step through the NLP workflow.
1 parent 947e08d commit 54edb86

File tree

15 files changed: +279 −24 lines changed
cumulus_etl/etl/pipeline.py

Lines changed: 3 additions & 0 deletions
@@ -174,6 +174,9 @@ async def run_pipeline(
     # record filesystem options like --s3-region before creating Roots
     store.set_user_fs_options(vars(args))

+    if args.dir_input == "%EXAMPLE%" and not os.path.exists(args.dir_input):
+        args.dir_input = os.path.join(os.path.dirname(__file__), "studies/example/ndjson")
+
     root_input = store.Root(args.dir_input)
     root_output = store.Root(args.dir_output)
     root_phi = store.Root(args.dir_phi, create=True)
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
"""The example study"""

from .example_tasks import ExampleGpt4oTask as ExampleGpt4oTask
from .example_tasks import ExampleGpt4Task as ExampleGpt4Task
from .example_tasks import ExampleGpt5Task as ExampleGpt5Task
from .example_tasks import ExampleGptOss120bTask as ExampleGptOss120bTask
from .example_tasks import ExampleLlama4ScoutTask as ExampleLlama4ScoutTask
Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
"""Define tasks for the example/sample study"""

import json

import pydantic

from cumulus_etl import nlp
from cumulus_etl.etl import tasks


class AgeMention(pydantic.BaseModel):
    has_mention: bool | None = pydantic.Field(None)
    spans: list[str] = pydantic.Field(default_factory=list, description="Supporting text spans")
    age: int | None = pydantic.Field(None, description="The age of the patient")


class BaseExampleTask(tasks.BaseOpenAiTaskWithSpans):
    task_version = 0
    # Task Version History:
    # ** 0 (2025-08): Initial work, still in flux **

    system_prompt = (
        "You are a clinical chart reviewer.\n"
        "Your task is to extract patient-specific information from an unstructured clinical "
        "document and map it into a predefined Pydantic schema.\n"
        "\n"
        "Core Rules:\n"
        "1. Base all assertions ONLY on patient-specific information in the clinical document.\n"
        " - Never negate or exclude information just because it is not mentioned.\n"
        " - Never conflate family history or population-level risk with patient findings.\n"
        "2. Do not invent or infer facts beyond what is documented.\n"
        "3. Maintain high fidelity to the clinical document language when citing spans.\n"
        "4. Always produce structured JSON that conforms to the Pydantic schema provided below.\n"
        "\n"
        "Pydantic Schema:\n" + json.dumps(AgeMention.model_json_schema())
    )
    response_format = AgeMention


# Have a task for every ETL-supported model, to allow sites to choose whatever model works for them.


class ExampleGpt4Task(BaseExampleTask):
    name = "example__nlp_gpt4"
    client_class = nlp.Gpt4Model


class ExampleGpt4oTask(BaseExampleTask):
    name = "example__nlp_gpt4o"
    client_class = nlp.Gpt4oModel


class ExampleGpt5Task(BaseExampleTask):
    name = "example__nlp_gpt5"
    client_class = nlp.Gpt5Model


class ExampleGptOss120bTask(BaseExampleTask):
    name = "example__nlp_gpt_oss_120b"
    client_class = nlp.GptOss120bModel


class ExampleLlama4ScoutTask(BaseExampleTask):
    name = "example__nlp_llama4_scout"
    client_class = nlp.Llama4ScoutModel
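
To make the schema concrete, here is a small self-contained sketch (illustrative only, not part of the commit; it mirrors the `AgeMention` model above with made-up values) of the JSON schema text that gets appended to the system prompt, and of a response that conforms to it:

```python
import json

import pydantic


# Mirror of the AgeMention model above, for illustration only.
class AgeMention(pydantic.BaseModel):
    has_mention: bool | None = pydantic.Field(None)
    spans: list[str] = pydantic.Field(default_factory=list, description="Supporting text spans")
    age: int | None = pydantic.Field(None, description="The age of the patient")


# This is the schema text the system prompt ends with:
print(json.dumps(AgeMention.model_json_schema()))

# And a response the model might return for a note mentioning "a 42-year-old male"
# (the values here are invented, purely to show the shape of the output):
mention = AgeMention(has_mention=True, spans=["a 42-year-old male"], age=42)
print(mention.model_dump_json())
# {"has_mention":true,"spans":["a 42-year-old male"],"age":42}
```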
Two binary files changed (3.63 KB and 2.71 KB); binary content not shown.

cumulus_etl/etl/tasks/nlp_task.py

Lines changed: 16 additions & 8 deletions
@@ -6,6 +6,7 @@
 import re
 import string
 import sys
+import types
 import typing
 from collections.abc import AsyncIterator, Callable
 from typing import ClassVar
@@ -223,7 +224,7 @@ def convert_pydantic_fields_to_pyarrow(
     ) -> pyarrow.DataType:
         return pyarrow.struct(
             [
-                pyarrow.field(name, cls._convert_type_to_pyarrow(info.annotation))
+                pyarrow.field(name, cls._convert_type_to_pyarrow(info.annotation), nullable=True)
                 for name, info in fields.items()
             ]
         )
@@ -232,16 +233,23 @@ def convert_pydantic_fields_to_pyarrow(
     def _convert_type_to_pyarrow(cls, annotation) -> pyarrow.DataType:
         # Since we only need to handle a small amount of possible types, we just do this ourselves
         # rather than relying on an external library.
-        if issubclass(annotation, str):
+        if origin := typing.get_origin(annotation):  # e.g. "UnionType" or "list"
+            sub_type = typing.get_args(annotation)[0]
+            if issubclass(origin, types.UnionType):
+                # This is gonna be something like "str | None" so just grab first arg.
+                # We mark all our fields as nullable at the pyarrow layer.
+                return cls._convert_type_to_pyarrow(sub_type)
+            elif issubclass(origin, list):
+                # Note: does not handle struct types underneath yet
+                return pyarrow.list_(cls._convert_type_to_pyarrow(sub_type))
+        elif issubclass(annotation, str):
             return pyarrow.string()
         elif issubclass(annotation, bool):
             return pyarrow.bool_()
-        elif issubclass(typing.get_origin(annotation), list):
-            sub_type = typing.get_args(annotation)[0]
-            # Note: does not handle struct types underneath yet
-            return pyarrow.list_(cls._convert_type_to_pyarrow(sub_type))
-        else:
-            raise ValueError(f"Unsupported type {annotation}")  # pragma: no cover
+        elif issubclass(annotation, int):
+            return pyarrow.int32()
+
+        raise ValueError(f"Unsupported type {annotation}")  # pragma: no cover


 class BaseOpenAiTaskWithSpans(BaseOpenAiTask):
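
For a sense of what the new conversion handles, here is a standalone sketch (a simplified copy of the logic above, not the actual classmethod) run against the annotations used by the `AgeMention` fields:

```python
import types
import typing

import pyarrow


def convert(annotation) -> pyarrow.DataType:
    """Simplified copy of _convert_type_to_pyarrow, for illustration only."""
    if origin := typing.get_origin(annotation):  # e.g. UnionType or list
        sub_type = typing.get_args(annotation)[0]
        if issubclass(origin, types.UnionType):
            return convert(sub_type)  # "str | None" becomes just "str"
        elif issubclass(origin, list):
            return pyarrow.list_(convert(sub_type))
    elif issubclass(annotation, str):
        return pyarrow.string()
    elif issubclass(annotation, bool):
        return pyarrow.bool_()
    elif issubclass(annotation, int):
        return pyarrow.int32()
    raise ValueError(f"Unsupported type {annotation}")


print(convert(bool | None))  # bool
print(convert(list[str]))    # list<item: string>
print(convert(int | None))   # int32
```

Note that the `bool` branch stays ahead of the `int` branch, since `bool` is a subclass of `int` in Python.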

cumulus_etl/etl/tasks/task_factory.py

Lines changed: 6 additions & 1 deletion
@@ -5,7 +5,7 @@
 from typing import TypeVar

 from cumulus_etl import cli_utils, errors
-from cumulus_etl.etl.studies import covid_symptom, irae
+from cumulus_etl.etl.studies import covid_symptom, example, irae
 from cumulus_etl.etl.tasks import basic_tasks

 AnyTask = TypeVar("AnyTask", bound="EtlTask")  # noqa: F821
@@ -31,6 +31,11 @@ def get_nlp_tasks() -> list[type[AnyTask]]:
         covid_symptom.CovidSymptomNlpResultsGpt4Task,
         covid_symptom.CovidSymptomNlpResultsTask,
         covid_symptom.CovidSymptomNlpResultsTermExistsTask,
+        example.ExampleGpt4Task,
+        example.ExampleGpt4oTask,
+        example.ExampleGpt5Task,
+        example.ExampleGptOss120bTask,
+        example.ExampleLlama4ScoutTask,
         irae.IraeGptOss120bTask,
         irae.IraeGpt4oTask,
         irae.IraeGpt5Task,

cumulus_etl/nlp/openai.py

Lines changed: 15 additions & 4 deletions
@@ -31,9 +31,9 @@ async def post_init_check(self) -> None:
         try:
             models = self.client.models.list()
             names = {model.id async for model in models}
-        except openai.APIError:
+        except openai.APIError as exc:
             errors.fatal(
-                f"NLP server '{self.USER_ID}' is unreachable.\n"
+                f"NLP server '{self.USER_ID}' is unreachable: {exc}.\n"
                 f"If it's a local server, try running 'docker compose up {self.USER_ID} --wait'.",
                 errors.SERVICE_MISSING,
             )
@@ -45,6 +45,9 @@ async def post_init_check(self) -> None:
             )

     async def prompt(self, system: str, user: str, schema: BaseModel) -> chat.ParsedChatCompletion:
+        return await self._parse_prompt(system, user, schema)
+
+    async def _parse_prompt(self, system: str, user: str, schema) -> chat.ParsedChatCompletion:
         return await self.client.chat.completions.parse(
             model=self.MODEL_NAME,
             messages=[
@@ -75,12 +78,20 @@ async def pre_init_check(cls) -> None:
         errors.fatal("\n".join(messages), errors.ARGS_INVALID)

     def make_client(self) -> openai.AsyncOpenAI:
-        return openai.AsyncAzureOpenAI(api_version="2024-06-01")
+        return openai.AsyncAzureOpenAI(api_version="2024-10-21")


-class Gpt35Model(AzureModel):
+class Gpt35Model(AzureModel):  # deprecated, do not use in new code (doesn't support JSON schemas)
     MODEL_NAME = "gpt-35-turbo-0125"

+    # 3.5 doesn't support a pydantic JSON schema, so we do some work to keep it using the same API
+    # as the rest of our code.
+    async def prompt(self, system: str, user: str, schema: BaseModel) -> chat.ParsedChatCompletion:
+        response = await self._parse_prompt(system, user, {"type": "json_object"})
+        parsed = schema.model_validate_json(response.choices[0].message.content)
+        response.choices[0].message.parsed = parsed
+        return response
+

 class Gpt4Model(AzureModel):
     MODEL_NAME = "gpt-4"
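
The GPT-3.5 fallback leans on pydantic to turn the raw JSON-mode reply back into a parsed object; here is a tiny sketch of just that step (the schema and reply text are invented for illustration):

```python
import pydantic


class AgeMention(pydantic.BaseModel):  # stand-in schema, purely for illustration
    has_mention: bool | None = None
    age: int | None = None


# What Gpt35Model.prompt() does with response.choices[0].message.content:
raw_reply = '{"has_mention": true, "age": 42}'
parsed = AgeMention.model_validate_json(raw_reply)
print(parsed.age)  # 42
```

The real method then attaches that parsed object to `response.choices[0].message.parsed`, so callers see the same interface as the schema-aware models.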

docs/nlp/example.md

Lines changed: 116 additions & 0 deletions
@@ -0,0 +1,116 @@
---
title: Example Workflow
parent: NLP
grand_parent: ETL
nav_order: 1
# audience: engineer familiar with the project
# type: tutorial
---

# An Example NLP Workflow

Let's work through an end-to-end NLP workflow, as if you were doing a real study.
But we'll use an example study shipped with the ETL for testing purposes instead.

This will take us from running the actual NLP, then to chart review,
and finally to analyzing accuracy.

You don't need to prepare your own clinical notes for this run-through.
We'll use synthetic notes shipped with Cumulus ETL for this very purpose.

The example study we'll use is just a very simple age range study.
The NLP will only be tasked with extracting an age from a clinical note.

## The NLP Itself

Before we start, you'll need to have Cumulus ETL and your AWS infrastructure ready.
Follow the [setup instructions](../setup) if you haven't done so already, then come back here.

### Model Setup

You have a choice of model for this.
Real studies might require one specific model or another.
But this example task is fairly liberal.

Here are the options, along with the task name to use.

#### Azure Cloud Options
For these, you'll want to set a couple of variables first:
```sh
export AZURE_OPENAI_API_KEY=xxx
export AZURE_OPENAI_ENDPOINT=https://xxx.openai.azure.com/
```

Task names:
- GPT4: `example__nlp_gpt4`
- GPT4o: `example__nlp_gpt4o`
- GPT5: `example__nlp_gpt5`

#### Local (On-Prem) Options
For these, you'll need to start up the appropriate model on your machine:
```sh
docker compose up --wait gpt-oss-120b
```

Task names:
- GPT-OSS 120B (needs 80GB of GPU memory): `example__nlp_gpt_oss_120b`

### Running the ETL

Now that your model is ready, let's run the ETL on some notes!

Below is the command line to use.
You'll need to change the bucket names and paths to wherever you set up your AWS infrastructure.
And you'll want to change the task name as appropriate for your model.
Leave the odd-looking `%EXAMPLE%` bit in place;
that just tells Cumulus ETL to use its built-in example documents as the input.

The output and PHI bucket locations should be the same as in your normal ETL runs on raw FHIR data.
There's no actual PHI in this example run because the data is synthetic,
but normally there is, and that PHI bucket is where Cumulus ETL keeps caches of NLP results.

```sh
docker compose run --rm \
  cumulus-etl nlp \
  %EXAMPLE% \
  s3://my-output-bucket/ \
  s3://my-phi-bucket/ \
  --task example__nlp_gpt4
```

(If this were a real study, you'd probably do this a bit differently.
You'd point at your real DocumentReference resources, for example.
And you'd probably restrict the set of documents you run NLP on with an argument
like `--cohort-athena-table study__my_cohort`.
But for this run-through, we're going to hand-wave all the document selection pieces.)

### Running the Crawler

Whenever you write a new table to S3, you'll want to run your AWS Glue crawler again,
so that the table's schema gets set correctly in Athena.

First, confirm that your AWS CloudFormation templates have the `example__nlp_*` tables
configured in them. If not, try copying the Glue crawler definition from
[the sample template we provide](../setup/aws.md).

Then go to the AWS console, open the AWS Glue service, and in the sidebar under Data Catalog,
choose Crawlers.
You should see your crawler listed there. Select it, click Run, and wait for it to finish.

### Confirm the Data in Athena

While you're in the AWS console, switch to the Athena service and select the appropriate
Cumulus workgroup and database.

Then, if you run a query like the one below (assuming you used the GPT4 model),
you should see eight results with extracted ages.
```sql
select * from example__nlp_gpt4
```

**Congratulations!**
You've now run NLP on some synthetic clinical notes and uploaded the results to Athena.
Those extracted ages could now be post-processed by the `example` study to calculate age ranges,
and then confirmed with chart review by humans.

At least that's the flow you'd use for a real study.
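
For instance, here is a rough sketch of the kind of age-range bucketing that downstream step might do (illustrative only, not the actual `example` study code):

```python
def age_range(age: int | None) -> str | None:
    """Bucket an extracted age into a coarse ten-year range."""
    if age is None:
        return None
    low = (age // 10) * 10
    return f"{low}-{low + 9}"


print(age_range(42))    # 40-49
print(age_range(None))  # None
```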

docs/nlp.md renamed to docs/nlp/index.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
title: NLP
33
parent: ETL
44
nav_order: 5
5+
has_children: true
56
# audience: non-programmers, conversational tone, selling a bit
67
# type: explanation
78
---
