Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cumulus_etl/deid/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""De-identification support"""

from .codebook import Codebook
from .codebook import Codebook, FilterFunc
from .mstool import MSTOOL_CMD
from .philter import Philter
from .scrubber import Scrubber
38 changes: 5 additions & 33 deletions cumulus_etl/deid/codebook.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,10 @@

import binascii
import hmac
import logging
import os
import secrets
import uuid
from collections.abc import Iterable, Iterator
from collections.abc import Awaitable, Callable

from cumulus_etl import common

Expand Down Expand Up @@ -80,22 +79,6 @@ def fake_id(self, resource_type: str | None, real_id: str, caching_allowed: bool
else:
return self.db.resource_hash(real_id)

def real_ids(self, resource_type: str, fake_ids: Iterable[str]) -> Iterator[str]:
"""
Reverse-maps a list of fake IDs into real IDs.

This is an expensive operation, so only a bulk API is provided.
"""
mapping = self.db.get_reverse_mapping(resource_type)
for fake_id in fake_ids:
real_id = mapping.get(fake_id)
if real_id:
yield real_id
else:
logging.warning(
"Real ID not found for anonymous %s ID %s. Ignoring.", resource_type, fake_id
)


###############################################################################
#
Expand Down Expand Up @@ -225,21 +208,6 @@ def _preserved_resource_hash(

return fake_id

def get_reverse_mapping(self, resource_type: str) -> dict[str, str]:
"""
Returns reversed cached mappings for a given resource.

This is used for reverse-engineering anonymous IDs to the original real IDs, for the resources we cache.
"""
mapping = self.cached_mapping.get(resource_type, {})
reverse_mapping = {v: k for k, v in mapping.items()}

# Add any legacy mappings from settings (iteratively, to avoid a spare version in memory)
for k, v in self.settings.get(resource_type, {}).items():
reverse_mapping[v] = k

return reverse_mapping

def resource_hash(self, real_id: str) -> str:
"""
Get a fake ID for an arbitrary FHIR resource ID
Expand Down Expand Up @@ -305,3 +273,7 @@ def save(self) -> bool:
saved = True

return saved


# Used for filtering note resource types (like DocRefs or DxReports)
FilterFunc = Callable[[Codebook, dict], Awaitable[bool]] | None
5 changes: 1 addition & 4 deletions cumulus_etl/etl/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,12 @@

import datetime
import os
from collections.abc import Awaitable, Callable
from socket import gethostname

import cumulus_fhir_support as cfs

from cumulus_etl import common, deid, errors, formats, store

FilterFunc = Callable[[deid.Codebook, dict], Awaitable[bool]] | None


class JobConfig:
"""
Expand Down Expand Up @@ -42,7 +39,7 @@ def __init__(
export_datetime: datetime.datetime | None = None,
export_url: str | None = None,
deleted_ids: dict[str, set[str]] | None = None,
resource_filter: FilterFunc = None,
resource_filter: deid.FilterFunc = None,
):
self._dir_input_orig = dir_input_orig
self.dir_input = dir_input_deid
Expand Down
10 changes: 5 additions & 5 deletions cumulus_etl/nlp/selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,9 @@
import pyathena

from cumulus_etl import cli_utils, deid, errors, fhir, id_handling
from cumulus_etl.etl import config


def add_note_selection(parser: argparse.ArgumentParser) -> None:
def add_note_selection(parser: argparse.ArgumentParser):
group = parser.add_argument_group("note selection")
group.add_argument(
"--select-by-word",
Expand Down Expand Up @@ -47,6 +46,7 @@ def add_note_selection(parser: argparse.ArgumentParser) -> None:
action="store_true",
help="allow a larger-than-normal selection",
)
return group


def query_athena_table(table: str, args) -> str:
Expand Down Expand Up @@ -134,7 +134,7 @@ def get_match(
return self._id_pools[res_type].get(res_id)


def _define_csv_filter(csv_file: str, is_anon: bool) -> config.FilterFunc:
def _define_csv_filter(csv_file: str, is_anon: bool) -> deid.FilterFunc:
matcher = CsvMatcher(csv_file, is_anon=is_anon)

async def check_match(codebook, res):
Expand All @@ -145,7 +145,7 @@ async def check_match(codebook, res):

def _define_regex_filter(
client: cfs.FhirClient, words: list[str] | None, regexes: list[str] | None
) -> config.FilterFunc:
) -> deid.FilterFunc:
patterns = []
if regexes:
patterns.extend(cli_utils.user_regex_to_pattern(regex).pattern for regex in regexes)
Expand All @@ -165,7 +165,7 @@ async def res_filter(codebook: deid.Codebook, resource: dict) -> bool:
return res_filter


def get_note_filter(client: cfs.FhirClient, args: argparse.Namespace) -> config.FilterFunc:
def get_note_filter(client: cfs.FhirClient, args: argparse.Namespace) -> deid.FilterFunc:
"""Returns (patient refs to match, resource refs to match)"""
    # Confirm we don't have conflicting arguments (which we could maybe combine as a future
    # improvement, but that is too much hassle right now).
Expand Down
47 changes: 13 additions & 34 deletions cumulus_etl/upload_notes/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from ctakesclient.typesystem import Polarity

from cumulus_etl import cli_utils, common, deid, errors, fhir, nlp, store
from cumulus_etl.upload_notes import downloader, labeling, selector
from cumulus_etl.upload_notes import labeling, selector
from cumulus_etl.upload_notes.labelstudio import LabelStudioClient, LabelStudioNote

PHILTER_DISABLE = "disable"
Expand Down Expand Up @@ -44,31 +44,14 @@ async def gather_resources(
"""Selects and downloads just the docrefs we need to an export folder."""
common.print_header("Gathering documents...")

# There are three possibilities: we have real IDs, fake IDs, or neither.
# Note that we don't support providing both real & fake IDs right now. It's not clear that would be useful.
if args.docrefs and args.anon_docrefs:
errors.fatal(
"You cannot use both --docrefs and --anon-docrefs at the same time.",
errors.ARGS_CONFLICT,
)
note_filter = nlp.get_note_filter(client, args)

if root_input.protocol == "https": # is this a FHIR server?
return await downloader.download_resources_from_fhir_server(
client,
root_input,
codebook,
id_file=args.docrefs,
anon_id_file=args.anon_docrefs,
export_to=args.export_to,
)
else:
return selector.select_resources_from_files(
root_input,
codebook,
id_file=args.docrefs,
anon_id_file=args.anon_docrefs,
export_to=args.export_to,
)
return await selector.select_resources_from_files(
root_input,
codebook,
note_filter=note_filter,
export_to=args.export_to,
)


def datetime_from_resource(resource: dict) -> datetime.datetime | None:
Expand Down Expand Up @@ -418,15 +401,10 @@ def define_upload_notes_parser(parser: argparse.ArgumentParser) -> None:
"(must have note ID, label, and span columns)",
)

docs = parser.add_argument_group("note selection")
docs.add_argument(
"--anon-docrefs",
metavar="PATH",
help="CSV file with anonymized patient_id,docref_id columns",
)
docs.add_argument(
"--docrefs", metavar="PATH", help="CSV file with a docref_id column of original IDs"
)
group = nlp.add_note_selection(parser)
# Add some deprecated aliases for some note selection options. Deprecated since Sep 2025.
group.add_argument("--anon-docrefs", dest="select_by_anon_csv", help=argparse.SUPPRESS)
group.add_argument("--docrefs", dest="select_by_csv", help=argparse.SUPPRESS)

group = parser.add_argument_group("NLP")
cli_utils.add_ctakes_override(group)
Expand Down Expand Up @@ -478,6 +456,7 @@ async def upload_notes_main(args: argparse.Namespace) -> None:

args.dir_input = cli_utils.process_input_dir(args.dir_input)
root_input = store.Root(args.dir_input)
store.Root(args.dir_phi, create=True) # create PHI if needed (very edge case)

# Auth & read files early for quick error feedback
client = fhir.create_fhir_client_for_cli(
Expand Down
157 changes: 0 additions & 157 deletions cumulus_etl/upload_notes/downloader.py

This file was deleted.

Loading