
Commit eb1153f

feat: add new init subcommand
This will create an empty table for each basic resource that we support. It's a convenience command to get up and running faster, or to refresh your database schemas if we change them in the future.
1 parent 3256616
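
As a quick orientation, the new subcommand ends up being invoked like this (a hypothetical local run; the output path is illustrative, and `--output-format` defaults to `deltalake`):

```sh
# Create empty, schema-bearing tables for every core resource
cumulus-etl init --output-format=ndjson /path/to/output
```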

File tree

9 files changed: +150 −12 lines

cumulus_etl/__init__.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,3 +1,3 @@
 """Turns FHIR data into de-identified & aggregated records"""
 
-__version__ = "1.4.0"
+__version__ = "1.5.0"
```

cumulus_etl/cli.py

Lines changed: 5 additions & 2 deletions
```diff
@@ -10,7 +10,7 @@
 import rich.logging
 
 from cumulus_etl import common, etl, export, upload_notes
-from cumulus_etl.etl import convert
+from cumulus_etl.etl import convert, init
 
 
 class Command(enum.Enum):
@@ -23,6 +23,7 @@ class Command(enum.Enum):
     CONVERT = "convert"
     ETL = "etl"
     EXPORT = "export"
+    INIT = "init"
     UPLOAD_NOTES = "upload-notes"
 
     # Why isn't this part of Enum directly...?
@@ -70,13 +71,15 @@ async def main(argv: list[str]) -> None:
         run_method = convert.run_convert
     elif subcommand == Command.EXPORT.value:
         run_method = export.run_export
+    elif subcommand == Command.INIT.value:
+        run_method = init.run_init
     else:
         parser.description = "Extract, transform, and load FHIR data."
         if not subcommand:
             # Add a note about other subcommands we offer, and tell argparse not to wrap our formatting
             parser.formatter_class = argparse.RawDescriptionHelpFormatter
             parser.description += "\n\nother commands available:\n"
-            parser.description += " convert\n export\n upload-notes"
+            parser.description += " convert\n export\n init\n upload-notes"
         run_method = etl.run_etl
 
     with tempfile.TemporaryDirectory() as tempdir:
```
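
With this dispatch in place, running `cumulus-etl --help` with no subcommand should now advertise `init`; here is a sketch of the relevant part of the help text, pieced together from the description strings above:

```
$ cumulus-etl --help
...
Extract, transform, and load FHIR data.

other commands available:
 convert
 export
 init
 upload-notes
```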

cumulus_etl/cli_utils.py

Lines changed: 9 additions & 0 deletions
```diff
@@ -69,6 +69,15 @@ def add_nlp(parser: argparse.ArgumentParser):
     return group
 
 
+def add_output_format(parser: argparse.ArgumentParser) -> None:
+    parser.add_argument(
+        "--output-format",
+        default="deltalake",
+        choices=["deltalake", "ndjson"],
+        help="output format (default is deltalake)",
+    )
+
+
 def add_task_selection(parser: argparse.ArgumentParser):
     task = parser.add_argument_group("task selection")
     task.add_argument(
```

cumulus_etl/etl/cli.py

Lines changed: 1 addition & 6 deletions
```diff
@@ -102,12 +102,7 @@ def define_etl_parser(parser: argparse.ArgumentParser) -> None:
         choices=["i2b2", "ndjson"],
         help="input format (default is ndjson)",
     )
-    parser.add_argument(
-        "--output-format",
-        default="deltalake",
-        choices=["deltalake", "ndjson"],
-        help="output format (default is deltalake)",
-    )
+    cli_utils.add_output_format(parser)
     parser.add_argument(
         "--batch-size",
         type=int,
```

cumulus_etl/etl/init/__init__.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -0,0 +1,3 @@
+"""Subcommand to initialize basic tables"""
+
+from .cli import run_init
```

cumulus_etl/etl/init/cli.py

Lines changed: 70 additions & 0 deletions
```diff
@@ -0,0 +1,70 @@
+"""
+Initializes basic resource tables.
+
+Creates the tables if they don't exist and pushes up a basic schema.
+"""
+
+import argparse
+from collections.abc import Iterable
+
+from cumulus_etl import cli_utils, formats, store
+from cumulus_etl.etl import tasks
+from cumulus_etl.etl.tasks import task_factory
+
+
+def define_init_parser(parser: argparse.ArgumentParser) -> None:
+    parser.usage = "%(prog)s [OPTION]... OUTPUT"
+    parser.description = (
+        "Initialize all basic output tables. "
+        "After this command is run, you will be ready to set up Cumulus Library. "
+        "This command is safe to run multiple times on the same folder, "
+        "or even on an existing folder with data already in it."
+    )
+
+    parser.add_argument("dir_output", metavar="/path/to/output")
+    cli_utils.add_output_format(parser)
+
+    cli_utils.add_aws(parser)
+
+
+def get_task_tables() -> Iterable[tuple[type[tasks.EtlTask], tasks.OutputTable]]:
+    for task_class in task_factory.get_default_tasks():
+        for output in task_class.outputs:
+            if not output.get_name(task_class).startswith("etl__"):
+                yield task_class, output
+
+
+async def init_main(args: argparse.Namespace) -> None:
+    """Main logic for initialization"""
+    # record filesystem options like --s3-region before creating Roots
+    store.set_user_fs_options(vars(args))
+
+    output_root = store.Root(args.dir_output)
+
+    with cli_utils.make_progress_bar() as progress:
+        # Set up progress bar
+        total_steps = len(list(get_task_tables())) + 1  # extra 1 is initializing the formatter
+        task = progress.add_task("Initializing tables", total=total_steps)
+
+        # Initialize formatter (which can take a moment with deltalake)
+        format_class = formats.get_format_class(args.output_format)
+        format_class.initialize_class(output_root)
+        progress.update(task, advance=1)
+
+        # Create an empty JobConfig/ folder, so that the 'convert' command will recognize this
+        # folder as an ETL folder.
+        output_root.makedirs(output_root.joinpath("JobConfig"))
+
+        # Now iterate through, pushing to each output table
+        for task_class, output in get_task_tables():
+            batch = task_class.make_batch_from_rows(output.get_resource_type(task_class), [])
+            formatter = format_class(output_root, output.get_name(task_class))
+            formatter.write_records(batch)
+            progress.update(task, advance=1)
+
+
+async def run_init(parser: argparse.ArgumentParser, argv: list[str]) -> None:
+    """Parse arguments and do the work"""
+    define_init_parser(parser)
+    args = parser.parse_args(argv)
+    await init_main(args)
```
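
For a sense of the control flow, here is a minimal sketch of driving this entry point directly (the parser setup and `./init-test` path are illustrative, not part of the commit):

```python
import argparse
import asyncio

from cumulus_etl.etl.init import run_init

# Roughly what `cumulus-etl init --output-format=ndjson ./init-test`
# does under the hood; "./init-test" is an illustrative path.
parser = argparse.ArgumentParser(prog="cumulus-etl init")
asyncio.run(run_init(parser, ["./init-test", "--output-format=ndjson"]))
```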

docs/setup/sample-runs.md

Lines changed: 20 additions & 3 deletions
````diff
@@ -135,17 +135,33 @@ Congratulations! You've run your first Cumulus ETL process. The first of many!
 
 ### AWS Test Run
 
-Let's do the same thing, but now pointing at S3 buckets.
+Let's do that again, but now pointing at S3 buckets.
 This assumes you've followed the [S3 setup guide](aws.md).
 
+We didn't do this above, but now that we're getting more serious,
+let's run `cumulus-etl init` first, which will create all the basic tables for us.
+
 When using S3 buckets, you'll need to set the `--s3-region` argument to the correct region.
 
-Run this command, but replace:
+Run the command below, but replace:
 * `us-east-2` with the region your buckets are in
 * `99999999999` with your account ID
 * `my-cumulus-prefix` with the bucket prefix you used when setting up AWS
 * and `subdir1` with the ETL subdirectory you used when setting up AWS
 
+```sh
+docker compose -f $CUMULUS_REPO_PATH/compose.yaml \
+  run --rm \
+  cumulus-etl init \
+  --s3-region=us-east-2 \
+  s3://my-cumulus-prefix-99999999999-us-east-2/subdir1/
+```
+
+This will create empty tables for all the core resources that Cumulus works with.
+You should now even be able to see some (very small) output files in your S3 buckets!
+
+Let's go one step further and put some actual (fake) test data in there too.
+
 ```sh
 docker compose -f $CUMULUS_REPO_PATH/compose.yaml \
   run --volume $CUMULUS_REPO_PATH:/cumulus-etl --rm \
@@ -156,7 +172,8 @@ docker compose -f $CUMULUS_REPO_PATH/compose.yaml \
   s3://my-cumulus-prefix-phi-99999999999-us-east-2/subdir1/
 ```
 
-You should now be able to see some (very small) output files in your S3 buckets!
+(Though, note now that your S3 bucket has test data in it.
+Before you put any real data in there, you should delete the S3 folder and start fresh.)
 
 Obviously, this was just example data.
 But if you'd prefer to keep PHI off of AWS when you deploy for real,
````

tests/init/__init__.py

Whitespace-only changes.

tests/init/test_init_cli.py

Lines changed: 41 additions & 0 deletions
```diff
@@ -0,0 +1,41 @@
+"""Tests for etl/init/cli.py"""
+
+import os
+
+import ddt
+
+from cumulus_etl import cli, common
+from tests import utils
+
+
+@ddt.ddt
+class TestInit(utils.AsyncTestCase):
+    """Tests for high-level init support."""
+
+    def setUp(self):
+        super().setUp()
+        self.output_path = self.make_tempdir()
+
+    async def run_init(self, output_path: str | None = None) -> None:
+        args = [
+            "init",
+            output_path or self.output_path,
+            "--output-format=ndjson",
+        ]
+        await cli.main(args)
+
+    async def test_happy_path(self):
+        """Verify that we can do a simple init"""
+        await self.run_init()
+
+        # Do some spot checks
+        dirs = set(os.listdir(self.output_path))
+        self.assertIn("device", dirs)
+        self.assertIn("patient", dirs)
+        self.assertIn("medicationrequest", dirs)
+        self.assertIn("medication", dirs)  # secondary table
+        self.assertIn("JobConfig", dirs)  # so that the dir is flagged as an ETL dir by 'convert'
+
+        # Are folder contents what we expect?
+        self.assertEqual(["patient.000.ndjson"], os.listdir(f"{self.output_path}/patient"))
+        self.assertEqual("", common.read_text(f"{self.output_path}/patient/patient.000.ndjson"))
```
