Skip to content

Commit 3e6a17e

Browse files
authored
Merge pull request #364 from smart-on-fhir/mikix/mask-valuestring
feat(deid): drop valueString instead of passing it through
2 parents 468dcfb + 741fb34 commit 3e6a17e

File tree

6 files changed

+59
-58
lines changed

6 files changed

+59
-58
lines changed

cumulus_etl/deid/ms-config.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -351,7 +351,7 @@
351351
{"path": "Observation.effective", "method": "keep"},
352352
{"path": "Observation.issued", "method": "keep"},
353353
{"path": "Observation.performer", "method": "keep"},
354-
{"path": "Observation.value", "method": "keep"}, // should run philter on valueString
354+
{"path": "Observation.value", "method": "keep"}, // we drop valueString inside ETL
355355
{"path": "Observation.dataAbsentReason", "method": "keep"},
356356
{"path": "Observation.interpretation", "method": "keep"},
357357
// Skip Observation.note
@@ -363,7 +363,7 @@
363363
{"path": "Observation.hasMember", "method": "keep"},
364364
{"path": "Observation.derivedFrom", "method": "keep"},
365365
{"path": "Observation.component.code", "method": "keep"},
366-
{"path": "Observation.component.value", "method": "keep"}, // should run philter on valueString
366+
{"path": "Observation.component.value", "method": "keep"}, // we drop valueString inside ETL
367367
{"path": "Observation.component.dataAbsentReason", "method": "keep"},
368368
{"path": "Observation.component.interpretation", "method": "keep"},
369369
// Skip Observation.component.referenceRange

cumulus_etl/deid/scrubber.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -406,23 +406,31 @@ def _check_text(self, key: str, value: Any) -> Any:
406406
if isinstance(value, str) and any(
407407
(
408408
# Coding.display is clinically unnecessary but is useful for rendering.
409-
# Since "code" is always present, downstream consumers can & should provide their own display label.
410-
# But we don't remove it entirely, for cases where unexpected codes are used.
411-
# Note that this will definitely over-scrub (like scrubbing "White" from the USCDI race extension),
412-
# but again -- this display value is redundant and rather than try to be smart, we're safely dumb.
409+
# Since "code" is always present, downstream consumers can & should provide their
410+
# own display label. But we don't remove it entirely, for cases where unexpected
411+
# codes are used. Note that this will definitely over-scrub (like scrubbing "White"
412+
# from the USCDI race extension), but again -- this display value is redundant and
413+
# rather than try to be smart, we're safely dumb.
413414
key == "display",
414-
# CodeableConcept.text has clinical value for situations that don't have clear coding yet.
415-
# Think early-days Covid day PCRs. Which is why we let it through in the first place.
415+
# CodeableConcept.text has clinical value for situations that don't have clear
416+
# coding yet, like early-days Covid day PCRs. And text-only codeable concepts show
417+
# up a lot when the EHR allows it. Hence why we normally let it through.
416418
# But we should still scrub it since it is loose text that could hold PHI.
417419
key == "text",
418-
# Observation.valueString has clinical value, but could hold PHI.
419-
# Similarly, extensions might have valueString members (though the only supported ones don't have
420-
# interesting fields there -- race & ethnicity allow for freeform text descriptions).
421-
key == "valueString",
422420
)
423421
):
424422
value = self.scrub_text(value)
425423

424+
if isinstance(value, str) and any(
425+
(
426+
# Observation.valueString has some clinical value, but often holds PHI.
427+
# If we ever want to use this for clinical purposes, we should process it via NLP
428+
# on the ETL side (like we do for DocumentReference attachments).
429+
key == "valueString",
430+
)
431+
):
432+
raise MaskValue
433+
426434
return value
427435

428436
@staticmethod

docs/deid.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,6 @@ There are some freeform text fields that Cumulus ETS asks the Microsoft Anonymiz
155155
These fields are useful for presenting or computing a phenotype:
156156
- `CodeableConcept.text`
157157
- `Coding.display`
158-
- `Observation.valueString` and `Observation.component.valueString`
159158

160159
Although Cumulus wants to largely preserve these fields,
161160
they may contain PHI since they are freeform text fields after all.

pyproject.toml

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,13 @@
22
name = "cumulus-etl"
33
requires-python = ">= 3.10"
44
# These dependencies are mostly pinned to be under the next major version.
5-
# That makes particular sense as long as we don't have official releases yet and our code is used
6-
# by pulling from main directly.
7-
# But now there's a risk of missing a major release and bit-rotting the dependency tree.
5+
# That makes particular sense as long as we don't have official releases yet and our docker
6+
# images are built from main directly every commit.
87
#
9-
# We should either (a) configure a bot to warn us about stale dependencies, or
10-
# (b) once we switch to a more planned release schedule (via docker or similar), just go back to
11-
# open-pinned dependencies so that we're more likely to notice new releases (we'll still have time
12-
# to fix any breakages since users won't immediately see the problem).
8+
# We mitigate the risk of missing a new release by having a bot watch for them.
9+
#
10+
# But if we ever start releasing on PyPI, we could switch to open-pinned dependencies to be
11+
# less annoying to pip users.
1312
dependencies = [
1413
"ctakesclient >= 5.1, < 6",
1514
"cumulus-fhir-support >= 1.2, < 2",

tests/deid/test_deid_philter.py

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -23,21 +23,6 @@ def setUp(self):
2323
{"Coding": {"display": "Patient 012-34-5678"}},
2424
{"Coding": {"display": "Patient ***-**-****"}},
2525
),
26-
(
27-
# philter catches the month for some reason, but correctly leaves the date numbers alone
28-
{"resourceType": "Observation", "valueString": "Born on december 12 2012"},
29-
{"resourceType": "Observation", "valueString": "Born on ******** 12 2012"},
30-
),
31-
(
32-
{
33-
"resourceType": "Observation",
34-
"component": [{"valueString": "Contact at foo@bar.com"}],
35-
},
36-
{
37-
"resourceType": "Observation",
38-
"component": [{"valueString": "Contact at ***@***.***"}],
39-
},
40-
),
4126
)
4227
@ddt.unpack
4328
def test_scrub_resource(self, resource, expected):

tests/deid/test_deid_scrubber.py

Lines changed: 33 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,16 @@
1010
from cumulus_etl.deid.codebook import CodebookDB
1111
from tests import i2b2_mock_data, utils
1212

13+
# Used in a few tests - just define once for cleanliness
14+
MASKED_EXTENSION = {
15+
"extension": [
16+
{
17+
"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason",
18+
"valueCode": "masked",
19+
}
20+
]
21+
}
22+
1323

1424
class TestScrubber(utils.AsyncTestCase):
1525
"""Test case for the Scrubber class"""
@@ -103,22 +113,8 @@ def test_documentreference(self):
103113
self.assertEqual(
104114
docref["content"][0]["attachment"],
105115
{
106-
"_data": {
107-
"extension": [
108-
{
109-
"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason",
110-
"valueCode": "masked",
111-
}
112-
]
113-
},
114-
"_url": {
115-
"extension": [
116-
{
117-
"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason",
118-
"valueCode": "masked",
119-
}
120-
]
121-
},
116+
"_data": MASKED_EXTENSION,
117+
"_url": MASKED_EXTENSION,
122118
},
123119
)
124120
self.assertEqual(
@@ -132,14 +128,28 @@ def test_documentreference(self):
132128
}
133129
]
134130
},
135-
"_url": {
136-
"extension": [
137-
{
138-
"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason",
139-
"valueCode": "masked",
140-
}
141-
]
131+
"_url": MASKED_EXTENSION,
132+
},
133+
)
134+
135+
def test_value_string_is_masked(self):
136+
"""Verify that Observation.*.valueString is masked"""
137+
obs = {
138+
"resourceType": "Observation",
139+
"valueString": "Hello Alice!",
140+
"component": [
141+
{
142+
"valueString": "Also heyo Bob!",
142143
},
144+
],
145+
}
146+
self.assertTrue(Scrubber().scrub_resource(obs))
147+
self.assertEqual(
148+
obs,
149+
{
150+
"resourceType": "Observation",
151+
"_valueString": MASKED_EXTENSION,
152+
"component": [{"_valueString": MASKED_EXTENSION}],
143153
},
144154
)
145155

0 commit comments

Comments
 (0)