Skip to content

Commit ef7e6f4

Browse files
committed
feat!: require completion tracking info, write completion by default
- Removes the --write-completion opt-in flag; it is now always enabled. - Requires export group name and timestamp information to be available, either from an export log or from the CLI. - Updates some user docs, explaining how completion tracking expects to be fed data.
1 parent 3c97d2f commit ef7e6f4

33 files changed

+145
-97
lines changed

.github/workflows/ci.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,8 @@ jobs:
105105
/in/input \
106106
/in/run-output \
107107
/in/phi \
108+
--export-group nlp-test \
109+
--export-timestamp 2024-08-29 \
108110
--output-format=ndjson \
109111
--task covid_symptom__nlp_results
110112

cumulus_etl/errors.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,10 @@ def __init__(self):
6969
)
7070

7171

72-
def fatal(message: str, status: int) -> NoReturn:
72+
def fatal(message: str, status: int, extra: str = "") -> NoReturn:
7373
"""Convenience method to exit the program with a user-friendly error message a test-friendly status code"""
74-
rich.console.Console(stderr=True).print(message, style="bold red", highlight=False)
74+
stderr = rich.console.Console(stderr=True)
75+
stderr.print(message, style="bold red", highlight=False)
76+
if extra:
77+
stderr.print(rich.padding.Padding.indent(extra, 2), highlight=False)
7578
sys.exit(status) # raises a SystemExit exception

cumulus_etl/etl/cli.py

Lines changed: 30 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -138,11 +138,18 @@ def define_etl_parser(parser: argparse.ArgumentParser) -> None:
138138
)
139139

140140
group = parser.add_argument_group("external export identification")
141-
group.add_argument("--export-group", help=argparse.SUPPRESS)
142-
group.add_argument("--export-timestamp", help=argparse.SUPPRESS)
143-
# Temporary explicit opt-in flag during the development of the completion-tracking feature
144141
group.add_argument(
145-
"--write-completion", action="store_true", default=False, help=argparse.SUPPRESS
142+
"--export-group",
143+
metavar="NAME",
144+
help="name of the FHIR Group that was exported (default is to grab this from an "
145+
"export log file in the input folder, but you can also use this to assign a "
146+
"nickname as long as you consistently set the same nickname)",
147+
)
148+
group.add_argument(
149+
"--export-timestamp",
150+
metavar="TIMESTAMP",
151+
help="when the data was exported from the FHIR Group (default is to grab this from an "
152+
"export log file in the input folder)",
146153
)
147154

148155
cli_utils.add_nlp(parser)
@@ -206,17 +213,26 @@ def handle_completion_args(
206213
else loader_results.export_datetime
207214
)
208215

209-
# Disable entirely if asked to
210-
if not args.write_completion:
211-
export_group_name = None
212-
export_datetime = None
213-
214-
# Error out if we have mismatched args
215-
has_group_name = export_group_name is not None
216-
has_datetime = bool(export_datetime)
217-
if has_group_name and not has_datetime:
216+
# Error out if we have missing args
217+
missing_group_name = export_group_name is None
218+
missing_datetime = not export_datetime
219+
if missing_group_name and missing_datetime:
220+
errors.fatal(
221+
"Missing Group name and timestamp export information for the input data.",
222+
errors.COMPLETION_ARG_MISSING,
223+
extra="This is likely because you don’t have an export log in your input folder.\n"
224+
"This log file (log.ndjson) is generated by some bulk export tools.\n"
225+
"Instead, please manually specify the Group name and timestamp of the export "
226+
"with the --export-group and --export-timestamp options.\n"
227+
"These options are necessary to track whether all the required data from "
228+
"a Group has been imported and is ready to be used.\n"
229+
"See https://docs.smarthealthit.org/cumulus/etl/bulk-exports.html for more "
230+
"information.\n",
231+
)
232+
# These next two errors can be briefer because the user clearly knows about the args.
233+
elif missing_datetime:
218234
errors.fatal("Missing --export-timestamp argument.", errors.COMPLETION_ARG_MISSING)
219-
elif not has_group_name and has_datetime:
235+
elif missing_group_name:
220236
errors.fatal("Missing --export-group argument.", errors.COMPLETION_ARG_MISSING)
221237

222238
return export_group_name, export_datetime

cumulus_etl/etl/tasks/base.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -113,9 +113,6 @@ def __init__(self, task_config: config.JobConfig, scrubber: deid.Scrubber):
113113
self.summaries: list[config.JobSummary] = [
114114
config.JobSummary(output.get_name(self)) for output in self.outputs
115115
]
116-
self.completion_tracking_enabled = (
117-
self.task_config.export_group_name is not None and self.task_config.export_datetime
118-
)
119116

120117
async def run(self) -> list[config.JobSummary]:
121118
"""
@@ -260,9 +257,6 @@ def _delete_requested_ids(self):
260257
self.formatters[index].delete_records(deleted_ids)
261258

262259
def _update_completion_table(self) -> None:
263-
if not self.completion_tracking_enabled:
264-
return
265-
266260
# Create completion rows
267261
batch = formats.Batch(
268262
rows=[

cumulus_etl/etl/tasks/basic_tasks.py

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -62,15 +62,11 @@ class EncounterTask(tasks.EtlTask):
6262

6363
async def read_entries(self, *, progress: rich.progress.Progress = None) -> tasks.EntryIterator:
6464
async for encounter in super().read_entries(progress=progress):
65-
if self.completion_tracking_enabled:
66-
completion_info = {
67-
"encounter_id": encounter["id"],
68-
"group_name": self.task_config.export_group_name,
69-
"export_time": self.task_config.export_datetime.isoformat(),
70-
}
71-
else:
72-
completion_info = None
73-
65+
completion_info = {
66+
"encounter_id": encounter["id"],
67+
"group_name": self.task_config.export_group_name,
68+
"export_time": self.task_config.export_datetime.isoformat(),
69+
}
7470
yield completion_info, encounter
7571

7672
@classmethod

docs/bulk-exports.md

Lines changed: 64 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,27 +13,33 @@ Cumulus ETL wants data, and lots of it.
1313
It's happy to ingest data that you've gathered elsewhere (as a separate export),
1414
but it's also happy to download the data itself as needed during the ETL (as an on-the-fly export).
1515

16-
## Separate Exports
16+
## Export Options
17+
18+
### External Exports
1719

1820
1. If you have an existing process to export health data, you can do that bulk export externally,
1921
and then just feed the resulting files to Cumulus ETL.
22+
(Though note that you will need to provide some export information manually,
23+
with the `--export-group` and `--export-timestamp` options. See `--help` for more info.)
2024

2125
2. Cumulus ETL has an `export` command to perform just a bulk export without an ETL step.
2226
Run it like so: `cumulus-etl export FHIR_URL ./output` (see `--help` for more options).
23-
You can use all sorts of
27+
- You can use all sorts of
2428
[interesting FHIR options](https://hl7.org/fhir/uv/bulkdata/export.html#query-parameters)
2529
like `_typeFilter` or `_since` in the URL.
30+
- This workflow will generate an export log file, from which Cumulus ETL can pull
31+
some export metadata like the Group name and export timestamp.
2632

2733
3. Or you may need more advanced options than our internal exporter supports.
2834
The [SMART Bulk Data Client](https://github.com/smart-on-fhir/bulk-data-client)
29-
is a great tool with lots of features.
35+
is a great tool with lots of features (and also generates an export log file).
3036

3137
In any case, it's simple to feed that data to the ETL:
3238
1. Pass Cumulus ETL the folder that holds the downloaded data as the input path.
3339
1. Pass `--fhir-url=` pointing at your FHIR server so that externally referenced document notes
3440
and medications can still be downloaded as needed.
3541

36-
## On-The-Fly Exports
42+
### On-The-Fly Exports
3743

3844
If it's easier to just do it all in one step,
3945
you can also start an ETL run with your FHIR URL as the input path.
@@ -44,6 +50,60 @@ You can save the exported files for archiving after the fact with `--export-to=P
4450
However, bulk exports tend to be brittle and slow for many EHRs at the time of this writing.
4551
It might be wiser to separately export, make sure the data is all there and good, and then ETL it.
4652

53+
## Cumulus Assumptions
54+
55+
Cumulus ETL makes some specific assumptions about the data you feed it and the order you feed it in.
56+
57+
This is because Cumulus tracks which resources were exported from which FHIR Groups and when.
58+
It only allows Encounters that have had all their data fully imported to be queried by SQL,
59+
to prevent an in-progress ETL workflow from affecting queries against the database.
60+
(i.e. to prevent an Encounter that hasn't yet had Conditions loaded in from looking like an
61+
Encounter that doesn't _have_ any Conditions)
62+
63+
Of course, even in the normal course of events, resources may show up weeks after an Encounter
64+
(like lab results).
65+
So an Encounter can never knowingly be truly _complete_,
66+
but Cumulus ETL makes an effort to keep a consistent view of the world at least for a given
67+
point in time.
68+
69+
### Encounters First
70+
71+
**Please export Encounters along with or before you export other Encounter-linked resources.**
72+
(Patients can be exported beforehand, since they don't depend on Encounters.)
73+
74+
To prevent incomplete Encounters, Cumulus only looks at Encounters that have an export
75+
timestamp at the same time or before linked resources like Condition.
76+
(As a result, there may be extra Conditions that point to not-yet-loaded Encounters.
77+
But that's fine, they will also be ignored until their Encounters do get loaded.)
78+
79+
If you do export Encounters last, you may not see any of those Encounters in the `core` study
80+
tables once you run Cumulus Library on the data.
81+
(Your Encounter data is safe and sound,
82+
just temporarily ignored by the Library until later exports come through.)
83+
84+
### No Partial Group Exports
85+
86+
**Please don't slice and dice your Group resources when exporting.**
87+
Cumulus ETL assumes that when you feed it an input folder of export files,
88+
that everything in the Group is available (at least, for the exported resources).
89+
You can export one resource from the Group at a time, just don't slice that resource further.
90+
91+
This is because when you run ETL on say, Conditions exported from Group `Group1234`,
92+
it will mark Conditions in `Group1234` as completely loaded (up to the export timestamp).
93+
94+
Using `_since` or a date-oriented `_typeFilter` is still fine, to grab new data for an export.
95+
The concern is more about an incomplete view of the data at a given point in time.
96+
97+
For example, if you sliced Conditions according to category when exporting
98+
(e.g. `_typeFilter=Condition?category=problem-list-item`),
99+
Cumulus will have an incorrect view of the world
100+
(thinking it got all Conditions when it only got problem list items).
101+
102+
You can still do this if you are careful!
103+
For example, maybe exporting Observations is too slow unless you slice by category.
104+
Just make sure that after you export all the Observations separately,
105+
you then combine them again into one big Observation folder before running Cumulus ETL.
106+
47107
## Archiving Exports
48108

49109
Exports can take a long time, and it's often convenient to archive the results.

docs/setup/sample-runs.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,10 @@ docker compose -f $CUMULUS_REPO_PATH/compose.yaml \
199199
s3://my-cumulus-prefix-phi-99999999999-us-east-2/subdir1/
200200
```
201201

202+
(Read [more about bulk exporting](../bulk-exports.md)
203+
to learn how to get some real data from your EHR,
204+
and how to properly feed it into the ETL.)
205+
202206
Now let's talk about customizing this command for your own environment.
203207
(And remember that you can always run `docker compose run cumulus-etl --help` for more guidance.)
204208

@@ -225,7 +229,8 @@ defaults are subject to change or might not match your situation.
225229
* `--output-format`: There are two reasonable values (`ndjson` and `deltalake`).
226230
For production use, you can use the default value of `deltalake` as it supports incremental,
227231
batched updates.
228-
But `ndjson` is useful when debugging as it is human-readable.
232+
But since the `ndjson` output is human-readable, it's useful for debugging
233+
or reviewing the output before pushing to the cloud.
229234

230235
* `--batch-size`: How many resources to save in a single output file. If there are more resources
231236
(e.g. more patients) than this limit, multiple output files will be created for that resource
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"table_name": "encounter", "group_name": "test-group", "export_time": "2020-10-13T12:00:20-05:00", "etl_version": "1.0.0+test", "etl_time": "2021-09-14T21:23:45+00:00"}
1+
{"table_name": "encounter", "group_name": "test-group", "export_time": "2020-10-13T12:00:20-05:00", "export_url": "https://example.org/fhir/$export", "etl_version": "1.0.0+test", "etl_time": "2021-09-14T21:23:45+00:00"}
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"table_name": "patient", "group_name": "test-group", "export_time": "2020-10-13T12:00:20-05:00", "etl_version": "1.0.0+test", "etl_time": "2021-09-14T21:23:45+00:00"}
1+
{"table_name": "patient", "group_name": "test-group", "export_time": "2020-10-13T12:00:20-05:00", "export_url": "https://example.org/fhir/$export", "etl_version": "1.0.0+test", "etl_time": "2021-09-14T21:23:45+00:00"}
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"table_name": "condition", "group_name": "test-group", "export_time": "2020-10-13T12:00:20-05:00", "etl_version": "1.0.0+test", "etl_time": "2021-09-14T21:23:45+00:00"}
1+
{"table_name": "condition", "group_name": "test-group", "export_time": "2020-10-13T12:00:20-05:00", "export_url": "https://example.org/fhir/$export", "etl_version": "1.0.0+test", "etl_time": "2021-09-14T21:23:45+00:00"}

0 commit comments

Comments
 (0)