
Commit a157398

feat!: add --select-by-* options to upload-notes, remove downloading

This adds the --select-by-word, --select-by-regex, --select-by-athena-table, --select-by-csv, and --select-by-anon-csv options to upload-notes, deprecating the previous --docrefs and --anon-docrefs options (which now map to --select-by-csv and --select-by-anon-csv).

This also removes the rarely used "download notes from your EHR on the fly" feature, which let you run without local copies of the notes (it could even do a bulk export if you didn't provide a --docrefs flag). It is too hard to square with the new set of options, and it adds a lot of complexity for a feature we don't really use and don't recommend to sites (you should be archiving your notes for easy access anyway).

Downloading the wrapped clinical notes referred to by your NDJSON is still supported! This change only removes the ability to go direct to your EHR for the DocRefs and DxReports themselves.
1 parent b89f098 commit a157398

File tree

10 files changed: +75 −482 lines changed

cumulus_etl/deid/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 """De-identification support"""

-from .codebook import Codebook
+from .codebook import Codebook, FilterFunc
 from .mstool import MSTOOL_CMD
 from .philter import Philter
 from .scrubber import Scrubber

cumulus_etl/deid/codebook.py

Lines changed: 5 additions & 32 deletions

@@ -6,7 +6,7 @@
 import os
 import secrets
 import uuid
-from collections.abc import Iterable, Iterator
+from collections.abc import Awaitable, Callable, Iterable, Iterator

 from cumulus_etl import common

@@ -80,22 +80,6 @@ def fake_id(self, resource_type: str | None, real_id: str, caching_allowed: bool
         else:
             return self.db.resource_hash(real_id)

-    def real_ids(self, resource_type: str, fake_ids: Iterable[str]) -> Iterator[str]:
-        """
-        Reverse-maps a list of fake IDs into real IDs.
-
-        This is an expensive operation, so only a bulk API is provided.
-        """
-        mapping = self.db.get_reverse_mapping(resource_type)
-        for fake_id in fake_ids:
-            real_id = mapping.get(fake_id)
-            if real_id:
-                yield real_id
-            else:
-                logging.warning(
-                    "Real ID not found for anonymous %s ID %s. Ignoring.", resource_type, fake_id
-                )
-

 ###############################################################################
 #

@@ -225,21 +209,6 @@ def _preserved_resource_hash(

         return fake_id

-    def get_reverse_mapping(self, resource_type: str) -> dict[str, str]:
-        """
-        Returns reversed cached mappings for a given resource.
-
-        This is used for reverse-engineering anonymous IDs to the original real IDs, for the resources we cache.
-        """
-        mapping = self.cached_mapping.get(resource_type, {})
-        reverse_mapping = {v: k for k, v in mapping.items()}
-
-        # Add any legacy mappings from settings (iteratively, to avoid a spare version in memory)
-        for k, v in self.settings.get(resource_type, {}).items():
-            reverse_mapping[v] = k
-
-        return reverse_mapping
-
     def resource_hash(self, real_id: str) -> str:
         """
         Get a fake ID for an arbitrary FHIR resource ID

@@ -305,3 +274,7 @@ def save(self) -> bool:
             saved = True

         return saved
+
+
+# Used for filtering note resource types (like DocRefs or DxReports)
+FilterFunc = Callable[[Codebook, dict], Awaitable[bool]] | None
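
Read the new FilterFunc alias as: an async predicate over a (Codebook, FHIR resource dict) pair, or None when no filtering applies. Any coroutine function with that shape qualifies. A minimal sketch (the filter itself is illustrative, not part of this commit):

    from cumulus_etl import deid  # FilterFunc is re-exported by the package

    async def keep_docrefs(codebook: deid.Codebook, resource: dict) -> bool:
        # Illustrative: select DocumentReferences only
        return resource.get("resourceType") == "DocumentReference"

    note_filter: deid.FilterFunc = keep_docrefs  # or None to select everything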

cumulus_etl/etl/config.py

Lines changed: 1 addition & 4 deletions

@@ -2,15 +2,12 @@

 import datetime
 import os
-from collections.abc import Awaitable, Callable
 from socket import gethostname

 import cumulus_fhir_support as cfs

 from cumulus_etl import common, deid, errors, formats, store

-FilterFunc = Callable[[deid.Codebook, dict], Awaitable[bool]] | None
-

 class JobConfig:
     """

@@ -42,7 +39,7 @@ def __init__(
         export_datetime: datetime.datetime | None = None,
         export_url: str | None = None,
         deleted_ids: dict[str, set[str]] | None = None,
-        resource_filter: FilterFunc = None,
+        resource_filter: deid.FilterFunc = None,
     ):
         self._dir_input_orig = dir_input_orig
         self.dir_input = dir_input_deid
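
Call sites only change spelling (FilterFunc becomes deid.FilterFunc); how a job consults the filter is unchanged. A sketch of that consumption pattern (a hypothetical loop, since the task code isn't part of this diff):

    from cumulus_etl import deid

    async def iter_kept(resources, codebook: deid.Codebook, resource_filter: deid.FilterFunc):
        # A None filter means "keep everything" -- hence the `| None` in the alias
        for resource in resources:
            if resource_filter is None or await resource_filter(codebook, resource):
                yield resource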

cumulus_etl/nlp/selection.py

Lines changed: 5 additions & 5 deletions

@@ -8,10 +8,9 @@
 import pyathena

 from cumulus_etl import cli_utils, deid, errors, fhir, id_handling
-from cumulus_etl.etl import config


-def add_note_selection(parser: argparse.ArgumentParser) -> None:
+def add_note_selection(parser: argparse.ArgumentParser):
     group = parser.add_argument_group("note selection")
     group.add_argument(
         "--select-by-word",

@@ -47,6 +46,7 @@ def add_note_selection(parser: argparse.ArgumentParser) -> None:
         action="store_true",
         help="allow a larger-than-normal selection",
     )
+    return group


 def query_athena_table(table: str, args) -> str:

@@ -134,7 +134,7 @@ def get_match(
         return self._id_pools[res_type].get(res_id)


-def _define_csv_filter(csv_file: str, is_anon: bool) -> config.FilterFunc:
+def _define_csv_filter(csv_file: str, is_anon: bool) -> deid.FilterFunc:
     matcher = CsvMatcher(csv_file, is_anon=is_anon)

     async def check_match(codebook, res):

@@ -145,7 +145,7 @@ async def check_match(codebook, res):

 def _define_regex_filter(
     client: cfs.FhirClient, words: list[str] | None, regexes: list[str] | None
-) -> config.FilterFunc:
+) -> deid.FilterFunc:
     patterns = []
     if regexes:
         patterns.extend(cli_utils.user_regex_to_pattern(regex).pattern for regex in regexes)

@@ -165,7 +165,7 @@ async def res_filter(codebook: deid.Codebook, resource: dict) -> bool:
     return res_filter


-def get_note_filter(client: cfs.FhirClient, args: argparse.Namespace) -> config.FilterFunc:
+def get_note_filter(client: cfs.FhirClient, args: argparse.Namespace) -> deid.FilterFunc:
     """Returns (patient refs to match, resource refs to match)"""
     # Confirm we don't have conflicting arguments. Which we could maybe combine, as a future
     # improvement, but is too much hassle right now)
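
get_note_filter turns the --select-by-* flags into a single deid.FilterFunc. For the word/regex path, the shape is roughly as below; this is a standalone approximation using plain re, with a stand-in for the note-text lookup (the real code resolves note text through the FHIR client and compiles user input via cli_utils.user_regex_to_pattern):

    import re

    def define_text_filter(words: list[str] | None, regexes: list[str] | None):
        patterns = [re.compile(regex, re.IGNORECASE) for regex in (regexes or [])]
        # --select-by-word terms match as whole words
        patterns.extend(re.compile(rf"\b{re.escape(word)}\b", re.IGNORECASE) for word in (words or []))
        if not patterns:
            return None  # no text selection requested; a None filter selects everything

        async def res_filter(codebook, resource: dict) -> bool:
            text = resource.get("_note_text", "")  # stand-in for the real attachment lookup
            return any(pattern.search(text) for pattern in patterns)

        return res_filter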

cumulus_etl/upload_notes/cli.py

Lines changed: 13 additions & 34 deletions

@@ -11,7 +11,7 @@
 from ctakesclient.typesystem import Polarity

 from cumulus_etl import cli_utils, common, deid, errors, fhir, nlp, store
-from cumulus_etl.upload_notes import downloader, labeling, selector
+from cumulus_etl.upload_notes import labeling, selector
 from cumulus_etl.upload_notes.labelstudio import LabelStudioClient, LabelStudioNote

 PHILTER_DISABLE = "disable"

@@ -44,31 +44,14 @@ async def gather_resources(
     """Selects and downloads just the docrefs we need to an export folder."""
     common.print_header("Gathering documents...")

-    # There are three possibilities: we have real IDs, fake IDs, or neither.
-    # Note that we don't support providing both real & fake IDs right now. It's not clear that would be useful.
-    if args.docrefs and args.anon_docrefs:
-        errors.fatal(
-            "You cannot use both --docrefs and --anon-docrefs at the same time.",
-            errors.ARGS_CONFLICT,
-        )
+    note_filter = nlp.get_note_filter(client, args)

-    if root_input.protocol == "https":  # is this a FHIR server?
-        return await downloader.download_resources_from_fhir_server(
-            client,
-            root_input,
-            codebook,
-            id_file=args.docrefs,
-            anon_id_file=args.anon_docrefs,
-            export_to=args.export_to,
-        )
-    else:
-        return selector.select_resources_from_files(
-            root_input,
-            codebook,
-            id_file=args.docrefs,
-            anon_id_file=args.anon_docrefs,
-            export_to=args.export_to,
-        )
+    return await selector.select_resources_from_files(
+        root_input,
+        codebook,
+        note_filter=note_filter,
+        export_to=args.export_to,
+    )


 def datetime_from_resource(resource: dict) -> datetime.datetime | None:

@@ -418,15 +401,10 @@ def define_upload_notes_parser(parser: argparse.ArgumentParser) -> None:
         "(must have note ID, label, and span columns)",
     )

-    docs = parser.add_argument_group("note selection")
-    docs.add_argument(
-        "--anon-docrefs",
-        metavar="PATH",
-        help="CSV file with anonymized patient_id,docref_id columns",
-    )
-    docs.add_argument(
-        "--docrefs", metavar="PATH", help="CSV file with a docref_id column of original IDs"
-    )
+    group = nlp.add_note_selection(parser)
+    # Add some deprecated aliases for some note selection options. Deprecated since Sep 2025.
+    group.add_argument("--anon-docrefs", dest="select_by_anon_csv", help=argparse.SUPPRESS)
+    group.add_argument("--docrefs", dest="select_by_csv", help=argparse.SUPPRESS)

     group = parser.add_argument_group("NLP")
     cli_utils.add_ctakes_override(group)

@@ -478,6 +456,7 @@ async def upload_notes_main(args: argparse.Namespace) -> None:

     args.dir_input = cli_utils.process_input_dir(args.dir_input)
     root_input = store.Root(args.dir_input)
+    store.Root(args.dir_phi, create=True)  # create PHI if needed (very edge case)

     # Auth & read files early for quick error feedback
     client = fhir.create_fhir_client_for_cli(
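
The deprecated flags keep working because they write into the same dest as their replacements, while help=argparse.SUPPRESS hides them from --help output. The pattern in isolation (a standalone demo, not the project's actual parser):

    import argparse

    parser = argparse.ArgumentParser()
    group = parser.add_argument_group("note selection")
    group.add_argument("--select-by-csv", metavar="PATH")
    # Deprecated spelling: same destination, hidden from --help
    group.add_argument("--docrefs", dest="select_by_csv", help=argparse.SUPPRESS)

    args = parser.parse_args(["--docrefs", "ids.csv"])
    assert args.select_by_csv == "ids.csv"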

cumulus_etl/upload_notes/downloader.py

Lines changed: 0 additions & 157 deletions
This file was deleted.
