Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cumulus_etl/deid/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""De-identification support"""

from .codebook import Codebook
from .codebook import Codebook, FilterFunc
from .mstool import MSTOOL_CMD
from .philter import Philter
from .scrubber import Scrubber
38 changes: 5 additions & 33 deletions cumulus_etl/deid/codebook.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,10 @@

import binascii
import hmac
import logging
import os
import secrets
import uuid
from collections.abc import Iterable, Iterator
from collections.abc import Awaitable, Callable

from cumulus_etl import common

Expand Down Expand Up @@ -80,22 +79,6 @@ def fake_id(self, resource_type: str | None, real_id: str, caching_allowed: bool
else:
return self.db.resource_hash(real_id)

def real_ids(self, resource_type: str, fake_ids: Iterable[str]) -> Iterator[str]:
"""
Reverse-maps a list of fake IDs into real IDs.

This is an expensive operation, so only a bulk API is provided.
"""
mapping = self.db.get_reverse_mapping(resource_type)
for fake_id in fake_ids:
real_id = mapping.get(fake_id)
if real_id:
yield real_id
else:
logging.warning(
"Real ID not found for anonymous %s ID %s. Ignoring.", resource_type, fake_id
)


###############################################################################
#
Expand Down Expand Up @@ -225,21 +208,6 @@ def _preserved_resource_hash(

return fake_id

def get_reverse_mapping(self, resource_type: str) -> dict[str, str]:
"""
Returns reversed cached mappings for a given resource.

This is used for reverse-engineering anonymous IDs to the original real IDs, for the resources we cache.
"""
mapping = self.cached_mapping.get(resource_type, {})
reverse_mapping = {v: k for k, v in mapping.items()}

# Add any legacy mappings from settings (iteratively, to avoid a spare version in memory)
for k, v in self.settings.get(resource_type, {}).items():
reverse_mapping[v] = k

return reverse_mapping

def resource_hash(self, real_id: str) -> str:
"""
Get a fake ID for an arbitrary FHIR resource ID
Expand Down Expand Up @@ -305,3 +273,7 @@ def save(self) -> bool:
saved = True

return saved


# Used for filtering note resource types (like DocRefs or DxReports)
FilterFunc = Callable[[Codebook, dict], Awaitable[bool]] | None
5 changes: 1 addition & 4 deletions cumulus_etl/etl/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,12 @@

import datetime
import os
from collections.abc import Awaitable, Callable
from socket import gethostname

import cumulus_fhir_support as cfs

from cumulus_etl import common, deid, errors, formats, store

FilterFunc = Callable[[deid.Codebook, dict], Awaitable[bool]] | None


class JobConfig:
"""
Expand Down Expand Up @@ -42,7 +39,7 @@ def __init__(
export_datetime: datetime.datetime | None = None,
export_url: str | None = None,
deleted_ids: dict[str, set[str]] | None = None,
resource_filter: FilterFunc = None,
resource_filter: deid.FilterFunc = None,
):
self._dir_input_orig = dir_input_orig
self.dir_input = dir_input_deid
Expand Down
10 changes: 5 additions & 5 deletions cumulus_etl/nlp/selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,9 @@
import pyathena

from cumulus_etl import cli_utils, deid, errors, fhir, id_handling
from cumulus_etl.etl import config


def add_note_selection(parser: argparse.ArgumentParser) -> None:
def add_note_selection(parser: argparse.ArgumentParser):
group = parser.add_argument_group("note selection")
group.add_argument(
"--select-by-word",
Expand Down Expand Up @@ -47,6 +46,7 @@ def add_note_selection(parser: argparse.ArgumentParser) -> None:
action="store_true",
help="allow a larger-than-normal selection",
)
return group


def query_athena_table(table: str, args) -> str:
Expand Down Expand Up @@ -134,7 +134,7 @@ def get_match(
return self._id_pools[res_type].get(res_id)


def _define_csv_filter(csv_file: str, is_anon: bool) -> config.FilterFunc:
def _define_csv_filter(csv_file: str, is_anon: bool) -> deid.FilterFunc:
matcher = CsvMatcher(csv_file, is_anon=is_anon)

async def check_match(codebook, res):
Expand All @@ -145,7 +145,7 @@ async def check_match(codebook, res):

def _define_regex_filter(
client: cfs.FhirClient, words: list[str] | None, regexes: list[str] | None
) -> config.FilterFunc:
) -> deid.FilterFunc:
patterns = []
if regexes:
patterns.extend(cli_utils.user_regex_to_pattern(regex).pattern for regex in regexes)
Expand All @@ -165,7 +165,7 @@ async def res_filter(codebook: deid.Codebook, resource: dict) -> bool:
return res_filter


def get_note_filter(client: cfs.FhirClient, args: argparse.Namespace) -> config.FilterFunc:
def get_note_filter(client: cfs.FhirClient, args: argparse.Namespace) -> deid.FilterFunc:
"""Returns (patient refs to match, resource refs to match)"""
    # Confirm we don't have conflicting arguments (which we could maybe combine as a future
    # improvement, but that is too much hassle right now).
Expand Down
47 changes: 13 additions & 34 deletions cumulus_etl/upload_notes/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from ctakesclient.typesystem import Polarity

from cumulus_etl import cli_utils, common, deid, errors, fhir, nlp, store
from cumulus_etl.upload_notes import downloader, labeling, selector
from cumulus_etl.upload_notes import labeling, selector
from cumulus_etl.upload_notes.labelstudio import LabelStudioClient, LabelStudioNote

PHILTER_DISABLE = "disable"
Expand Down Expand Up @@ -44,31 +44,14 @@ async def gather_resources(
"""Selects and downloads just the docrefs we need to an export folder."""
common.print_header("Gathering documents...")

# There are three possibilities: we have real IDs, fake IDs, or neither.
# Note that we don't support providing both real & fake IDs right now. It's not clear that would be useful.
if args.docrefs and args.anon_docrefs:
errors.fatal(
"You cannot use both --docrefs and --anon-docrefs at the same time.",
errors.ARGS_CONFLICT,
)
note_filter = nlp.get_note_filter(client, args)

if root_input.protocol == "https": # is this a FHIR server?
return await downloader.download_resources_from_fhir_server(
client,
root_input,
codebook,
id_file=args.docrefs,
anon_id_file=args.anon_docrefs,
export_to=args.export_to,
)
else:
return selector.select_resources_from_files(
root_input,
codebook,
id_file=args.docrefs,
anon_id_file=args.anon_docrefs,
export_to=args.export_to,
)
return await selector.select_resources_from_files(
root_input,
codebook,
note_filter=note_filter,
export_to=args.export_to,
)


def datetime_from_resource(resource: dict) -> datetime.datetime | None:
Expand Down Expand Up @@ -418,15 +401,10 @@ def define_upload_notes_parser(parser: argparse.ArgumentParser) -> None:
"(must have note ID, label, and span columns)",
)

docs = parser.add_argument_group("note selection")
docs.add_argument(
"--anon-docrefs",
metavar="PATH",
help="CSV file with anonymized patient_id,docref_id columns",
)
docs.add_argument(
"--docrefs", metavar="PATH", help="CSV file with a docref_id column of original IDs"
)
group = nlp.add_note_selection(parser)
# Add some deprecated aliases for some note selection options. Deprecated since Sep 2025.
group.add_argument("--anon-docrefs", dest="select_by_anon_csv", help=argparse.SUPPRESS)
group.add_argument("--docrefs", dest="select_by_csv", help=argparse.SUPPRESS)

group = parser.add_argument_group("NLP")
cli_utils.add_ctakes_override(group)
Expand Down Expand Up @@ -478,6 +456,7 @@ async def upload_notes_main(args: argparse.Namespace) -> None:

args.dir_input = cli_utils.process_input_dir(args.dir_input)
root_input = store.Root(args.dir_input)
store.Root(args.dir_phi, create=True) # create PHI if needed (very edge case)

# Auth & read files early for quick error feedback
client = fhir.create_fhir_client_for_cli(
Expand Down
157 changes: 0 additions & 157 deletions cumulus_etl/upload_notes/downloader.py

This file was deleted.

Loading