Skip to content

Commit 9f18305

Browse files
committed
etl: search recursively for input ndjson files
1 parent 7e487d2 commit 9f18305

File tree

5 files changed

+23
-4
lines changed

5 files changed

+23
-4
lines changed

cumulus_etl/common.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,9 @@ def get_temp_dir(subdir: str) -> str:
7878

7979

8080
def ls_resources(root: store.Root, resources: set[str], warn_if_empty: bool = False) -> list[str]:
81-
found_files = cfs.list_multiline_json_in_dir(root.path, resources, fsspec_fs=root.fs)
81+
found_files = cfs.list_multiline_json_in_dir(
82+
root.path, resources, fsspec_fs=root.fs, recursive=True
83+
)
8284

8385
if warn_if_empty:
8486
# Invert the {path: type} found_files dictionary into {type: [paths...]}

cumulus_etl/inliner/inliner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ async def inliner(
5555
mimetypes = set(mimetypes)
5656

5757
# Grab files to read for the given resources
58-
found_files = cfs.list_multiline_json_in_dir(in_root.path, resources, fsspec_fs=in_root.fs)
58+
found_files = common.ls_resources(in_root, resources)
5959

6060
# Predict how much work we'll have to do by getting counts of lines and files
6161
if in_root.protocol == "file":

cumulus_etl/loaders/fhir/ndjson_loader.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,9 @@ async def detect_resources(self) -> set[str] | None:
5656
# Returning None means "dunno" (i.e. "just accept whatever you eventually get").
5757
return None
5858

59-
found_files = cfs.list_multiline_json_in_dir(self.root.path, fsspec_fs=self.root.fs)
59+
found_files = cfs.list_multiline_json_in_dir(
60+
self.root.path, fsspec_fs=self.root.fs, recursive=True
61+
)
6062
return {resource for resource in found_files.values() if resource}
6163

6264
async def load_resources(self, resources: set[str]) -> base.LoaderResults:

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ requires-python = ">= 3.11"
44
dependencies = [
55
"aiobotocore < 2.22.0", # FIXME: temp hotfix for dependency version madness - remove later
66
"ctakesclient >= 5.1",
7-
"cumulus-fhir-support >= 1.5",
7+
"cumulus-fhir-support >= 1.6",
88
"delta-spark >= 4, < 5",
99
"fsspec[http,s3]",
1010
"httpx",

tests/etl/test_etl_cli.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,21 @@ async def test_rejects_new_phi_dir(self):
381381
await self.run_etl(tasks=["patient"], phi_path=self.make_tempdir())
382382
self.assertEqual(cm.exception.code, errors.WRONG_PHI_FOLDER)
383383

384+
async def test_looks_recursively_for_input(self):
385+
with tempfile.TemporaryDirectory() as tmpdir:
386+
os.makedirs(f"{tmpdir}/subdir1/subdir2")
387+
with open(f"{tmpdir}/subdir1/subdir2/patients.ndjson", "w", encoding="utf8") as f:
388+
json.dump({"resourceType": "Patient", "id": "pat1"}, f)
389+
await self.run_etl(tasks=["patient"], input_path=tmpdir)
390+
391+
self.assertEqual(
392+
common.read_json(f"{self.output_path}/patient/patient.000.ndjson"),
393+
{
394+
"resourceType": "Patient",
395+
"id": "50ffe70a1bdf3b6e73adac15e4ab7f9d7e247466d7a6c395c2ae9098741a62bd",
396+
},
397+
)
398+
384399

385400
class TestEtlJobConfig(BaseEtlSimple):
386401
"""Test case for the job config logging data"""

0 commit comments

Comments
 (0)