2 changes: 1 addition & 1 deletion cumulus_etl/etl/pipeline.py
@@ -177,7 +177,7 @@ async def run_pipeline(
     # record filesystem options like --s3-region before creating Roots
     store.set_user_fs_options(vars(args))
 
-    if args.dir_input == "%EXAMPLE%" and not os.path.exists(args.dir_input):
+    if args.dir_input == "%EXAMPLE-NLP%" and not os.path.exists(args.dir_input):
         args.dir_input = os.path.join(os.path.dirname(__file__), "studies/example/ndjson")
 
     root_input = store.Root(args.dir_input)
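Editor's note: a minimal standalone sketch of the sentinel behavior this hunk renames, assuming the same layout as `cumulus_etl/etl/pipeline.py`; `resolve_input_dir` is a hypothetical helper name, not part of the real module.

```python
import os

def resolve_input_dir(dir_input: str, module_file: str) -> str:
    # Only treat the sentinel specially when no real directory shadows it,
    # mirroring the os.path.exists guard in the hunk above.
    if dir_input == "%EXAMPLE-NLP%" and not os.path.exists(dir_input):
        return os.path.join(os.path.dirname(module_file), "studies/example/ndjson")
    return dir_input

# e.g. resolve_input_dir("%EXAMPLE-NLP%", "/app/cumulus_etl/etl/pipeline.py")
# -> "/app/cumulus_etl/etl/studies/example/ndjson"
```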
10 changes: 5 additions & 5 deletions cumulus_etl/etl/studies/example/example_tasks.py
@@ -41,25 +41,25 @@ class BaseExampleTask(tasks.BaseOpenAiTaskWithSpans):
 
 
 class ExampleGpt4Task(BaseExampleTask):
-    name = "example__nlp_gpt4"
+    name = "example_nlp__nlp_gpt4"
     client_class = nlp.Gpt4Model
 
 
 class ExampleGpt4oTask(BaseExampleTask):
-    name = "example__nlp_gpt4o"
+    name = "example_nlp__nlp_gpt4o"
     client_class = nlp.Gpt4oModel
 
 
 class ExampleGpt5Task(BaseExampleTask):
-    name = "example__nlp_gpt5"
+    name = "example_nlp__nlp_gpt5"
     client_class = nlp.Gpt5Model
 
 
 class ExampleGptOss120bTask(BaseExampleTask):
-    name = "example__nlp_gpt_oss_120b"
+    name = "example_nlp__nlp_gpt_oss_120b"
     client_class = nlp.GptOss120bModel
 
 
 class ExampleLlama4ScoutTask(BaseExampleTask):
-    name = "example__nlp_llama4_scout"
+    name = "example_nlp__nlp_llama4_scout"
    client_class = nlp.Llama4ScoutModel
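Editor's note: every task here follows a `<study>__nlp_<model>` convention, so renaming the study from `example` to `example_nlp` yields the doubled-looking `example_nlp__nlp_*` names. A hypothetical sketch of adding one more model under the new scheme; `nlp.SomeFutureModel` is a placeholder, not a real cumulus_etl class.

```python
# Illustrative only: shows the pattern the five classes above share.
class ExampleSomeFutureModelTask(BaseExampleTask):
    name = "example_nlp__nlp_some_future_model"  # <study>__nlp_<model>
    client_class = nlp.SomeFutureModel  # hypothetical model class
```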
18 changes: 9 additions & 9 deletions docs/nlp/example.md
@@ -42,9 +42,9 @@ export AZURE_OPENAI_ENDPOINT=https://xxx.openai.azure.com/
 ```
 
 Task names:
-- GPT4: `example__nlp_gpt4`
-- GPT4o: `example__nlp_gpt4o`
-- GPT5: `example__nlp_gpt5`
+- GPT4: `example_nlp__nlp_gpt4`
+- GPT4o: `example_nlp__nlp_gpt4o`
+- GPT5: `example_nlp__nlp_gpt5`
 
 This should cost you less than 15 cents to run and could be much less depending on the model.
 We'll use less than five thousand tokens.
@@ -56,7 +56,7 @@ docker compose up --wait gpt-oss-120b
 ```
 
 Task names:
-- GPT-OSS 120B (needs 80GB of GPU memory): `example__nlp_gpt_oss_120b`
+- GPT-OSS 120B (needs 80GB of GPU memory): `example_nlp__nlp_gpt_oss_120b`
 
 ### Running the ETL
 
@@ -65,7 +65,7 @@ Now that your model is ready, let's run the ETL on some notes!
 Below is the command line to use.
 You'll need to change the bucket names and paths to wherever you set up your AWS infrastructure.
 And you'll want to change the task name as appropriate for your model.
-Leave the odd looking `%EXAMPLE%` bit in place;
+Leave the odd looking `%EXAMPLE-NLP%` bit in place;
 that just tells Cumulus ETL to use its built-in example documents as the input.
 
 The output and PHI bucket locations should be the same as your normal ETL runs on raw FHIR data.
@@ -75,10 +75,10 @@ but normally there is, and that PHI bucket is where Cumulus ETL keeps caches of
 ```sh
 docker compose run --rm \
   cumulus-etl nlp \
-  %EXAMPLE% \
+  %EXAMPLE-NLP% \
   s3://my-output-bucket/ \
   s3://my-phi-bucket/ \
-  --task example__nlp_gpt4
+  --task example_nlp__nlp_gpt4
 ```
 
 (If this were a real study, you'd probably do this a bit differently.
@@ -92,7 +92,7 @@ But for this run-through, we're going to hand-wave all the document selection pi
 Whenever you write a new table to S3, you'll want to run your AWS Glue crawler again,
 so that the table's schema gets set correctly in Athena.
 
-First, confirm that your AWS Cloud Formation templates have the `example__nlp_*` tables
+First, confirm that your AWS Cloud Formation templates have the `example_nlp__nlp_*` tables
 configured in them. If not, try copying the Glue crawler definition from
 [the sample template we provide](../setup/aws.md).
 
@@ -108,7 +108,7 @@ Cumulus workgroup and database.
 Then if you make a query like below (assuming you used the GPT4 model),
 you should see eight results with extracted ages.
 ```sql
-select * from example__nlp_gpt4
+select * from example_nlp__nlp_gpt4
 ```
 
 **Congratulations!**
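Editor's note: since the docs above tell readers to re-run the Glue crawler whenever a new table lands, here is a hedged sketch of kicking that off from Python. The crawler name is an assumption (nothing in this PR names it); substitute whatever your CloudFormation template created, and run with credentials allowed to start crawlers.

```python
import boto3

glue = boto3.client("glue")
# "cumulus-nlp-crawler" is a placeholder crawler name.
glue.start_crawler(Name="cumulus-nlp-crawler")
# Once the crawl finishes, Athena should see the example_nlp__nlp_* schemas.
```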
10 changes: 5 additions & 5 deletions docs/setup/cumulus-aws-template.yaml
@@ -212,11 +212,11 @@ Resources:
       CreateNativeDeltaTable: True
       WriteManifest: False
     - DeltaTables:
-        - !Sub "s3://${S3Bucket}/${EtlSubdir}/example__nlp_gpt4"
-        - !Sub "s3://${S3Bucket}/${EtlSubdir}/example__nlp_gpt4o"
-        - !Sub "s3://${S3Bucket}/${EtlSubdir}/example__nlp_gpt5"
-        - !Sub "s3://${S3Bucket}/${EtlSubdir}/example__nlp_gpt_oss_120b"
-        - !Sub "s3://${S3Bucket}/${EtlSubdir}/example__nlp_llama4_scout"
+        - !Sub "s3://${S3Bucket}/${EtlSubdir}/example_nlp__nlp_gpt4"
+        - !Sub "s3://${S3Bucket}/${EtlSubdir}/example_nlp__nlp_gpt4o"
+        - !Sub "s3://${S3Bucket}/${EtlSubdir}/example_nlp__nlp_gpt5"
+        - !Sub "s3://${S3Bucket}/${EtlSubdir}/example_nlp__nlp_gpt_oss_120b"
+        - !Sub "s3://${S3Bucket}/${EtlSubdir}/example_nlp__nlp_llama4_scout"
       CreateNativeDeltaTable: True
       WriteManifest: False
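Editor's note: the Delta table locations in this template are derived mechanically from the task names, which is why the rename touches exactly these five lines. A small illustrative sketch of that mapping (the helper is not part of the template tooling):

```python
def delta_table_uri(bucket: str, etl_subdir: str, task_name: str) -> str:
    # Each ETL task writes its Delta table to s3://<bucket>/<subdir>/<task-name>,
    # matching the !Sub expressions above.
    return f"s3://{bucket}/{etl_subdir}/{task_name}"

print(delta_table_uri("my-output-bucket", "etl", "example_nlp__nlp_gpt4"))
# -> s3://my-output-bucket/etl/example_nlp__nlp_gpt4
```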
12 changes: 6 additions & 6 deletions tests/nlp/test_example.py
@@ -16,14 +16,14 @@ def default_content(self) -> pydantic.BaseModel:
         return AgeMention(has_mention=True, spans=["year-old"], age=20)
 
     @ddt.data(
-        "example__nlp_gpt_oss_120b",
-        "example__nlp_gpt4",
-        "example__nlp_gpt4o",
-        "example__nlp_gpt5",
-        "example__nlp_llama4_scout",
+        "example_nlp__nlp_gpt_oss_120b",
+        "example_nlp__nlp_gpt4",
+        "example_nlp__nlp_gpt4o",
+        "example_nlp__nlp_gpt5",
+        "example_nlp__nlp_llama4_scout",
     )
     async def test_basic_etl(self, task_name):
         for _ in range(8):
             self.mock_response()
-        await self.run_etl(tasks=[task_name], input_path="%EXAMPLE%")
+        await self.run_etl(tasks=[task_name], input_path="%EXAMPLE-NLP%")
         self.assertEqual(self.mock_create.call_count, 8)
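Editor's note: to exercise just one of the renamed tasks locally, something like the sketch below should work, assuming the suite runs under pytest (an assumption; the repo may invoke unittest directly). The `-k` filter substring-matches the test names ddt generates from each datum.

```python
import pytest

# Runs only the test variants whose generated names contain the substring.
pytest.main(["tests/nlp/test_example.py", "-k", "example_nlp__nlp_gpt5"])
```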