|
11 | 11 | from ctakesclient.typesystem import Polarity |
12 | 12 |
|
13 | 13 | from cumulus_etl import cli_utils, common, deid, errors, fhir, nlp, store |
14 | | -from cumulus_etl.upload_notes import downloader, labeling, selector |
| 14 | +from cumulus_etl.upload_notes import labeling, selector |
15 | 15 | from cumulus_etl.upload_notes.labelstudio import LabelStudioClient, LabelStudioNote |
16 | 16 |
|
17 | 17 | PHILTER_DISABLE = "disable" |
@@ -44,31 +44,14 @@ async def gather_resources( |
44 | 44 | """Selects and downloads just the docrefs we need to an export folder.""" |
45 | 45 | common.print_header("Gathering documents...") |
46 | 46 |
|
47 | | - # There are three possibilities: we have real IDs, fake IDs, or neither. |
48 | | - # Note that we don't support providing both real & fake IDs right now. It's not clear that would be useful. |
49 | | - if args.docrefs and args.anon_docrefs: |
50 | | - errors.fatal( |
51 | | - "You cannot use both --docrefs and --anon-docrefs at the same time.", |
52 | | - errors.ARGS_CONFLICT, |
53 | | - ) |
| 47 | + note_filter = nlp.get_note_filter(client, args) |
54 | 48 |
|
55 | | - if root_input.protocol == "https": # is this a FHIR server? |
56 | | - return await downloader.download_resources_from_fhir_server( |
57 | | - client, |
58 | | - root_input, |
59 | | - codebook, |
60 | | - id_file=args.docrefs, |
61 | | - anon_id_file=args.anon_docrefs, |
62 | | - export_to=args.export_to, |
63 | | - ) |
64 | | - else: |
65 | | - return selector.select_resources_from_files( |
66 | | - root_input, |
67 | | - codebook, |
68 | | - id_file=args.docrefs, |
69 | | - anon_id_file=args.anon_docrefs, |
70 | | - export_to=args.export_to, |
71 | | - ) |
| 49 | + return await selector.select_resources_from_files( |
| 50 | + root_input, |
| 51 | + codebook, |
| 52 | + note_filter=note_filter, |
| 53 | + export_to=args.export_to, |
| 54 | + ) |
72 | 55 |
|
73 | 56 |
|
74 | 57 | def datetime_from_resource(resource: dict) -> datetime.datetime | None: |
@@ -418,15 +401,10 @@ def define_upload_notes_parser(parser: argparse.ArgumentParser) -> None: |
418 | 401 | "(must have note ID, label, and span columns)", |
419 | 402 | ) |
420 | 403 |
|
421 | | - docs = parser.add_argument_group("note selection") |
422 | | - docs.add_argument( |
423 | | - "--anon-docrefs", |
424 | | - metavar="PATH", |
425 | | - help="CSV file with anonymized patient_id,docref_id columns", |
426 | | - ) |
427 | | - docs.add_argument( |
428 | | - "--docrefs", metavar="PATH", help="CSV file with a docref_id column of original IDs" |
429 | | - ) |
| 404 | + group = nlp.add_note_selection(parser) |
| 405 | + # Add some deprecated aliases for some note selection options. Deprecated since Sep 2025. |
| 406 | + group.add_argument("--anon-docrefs", dest="select_by_anon_csv", help=argparse.SUPPRESS) |
| 407 | + group.add_argument("--docrefs", dest="select_by_csv", help=argparse.SUPPRESS) |
430 | 408 |
|
431 | 409 | group = parser.add_argument_group("NLP") |
432 | 410 | cli_utils.add_ctakes_override(group) |
@@ -478,6 +456,7 @@ async def upload_notes_main(args: argparse.Namespace) -> None: |
478 | 456 |
|
479 | 457 | args.dir_input = cli_utils.process_input_dir(args.dir_input) |
480 | 458 | root_input = store.Root(args.dir_input) |
| 459 | + root_phi = store.Root(args.dir_phi, create=True) |
481 | 460 |
|
482 | 461 | # Auth & read files early for quick error feedback |
483 | 462 | client = fhir.create_fhir_client_for_cli( |
|
0 commit comments