Skip to content

Commit 4b782f2

Browse files
committed
feat: mark data/url fields inside DocRef attachments as absent
When we strip the data & url fields from DocumentReference attachments, we previously just deleted the fields and moved on. But in order to do QA analysis on whether those fields were provided in the first place, it's nice to have a record of them. So whenever we delete those fields, we now also leave a data-absent-reason extension in place (on the sibling `_data` / `_url` element) with the value "masked".
1 parent 352ab34 commit 4b782f2

File tree

6 files changed

+104
-15
lines changed

6 files changed

+104
-15
lines changed

cumulus_etl/deid/scrubber.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ class SkipValue(Exception):
2323
pass
2424

2525

26+
class MaskValue(SkipValue):
27+
pass
28+
29+
2630
class Scrubber:
2731
"""
2832
Manages de-identification for FHIR resources.
@@ -160,6 +164,9 @@ def _scrub_node(
160164
inside_extension=inside_extension,
161165
)
162166
)
167+
except MaskValue:
168+
# TODO: (not needed yet) support masking values inside array fields
169+
self._add_data_absent_extension(node, f"_{key}")
163170
except SkipValue:
164171
pass
165172

@@ -224,6 +231,24 @@ def _print_extension_table(self, title: str, table: ExtensionCount) -> None:
224231
indented = rich.padding.Padding.indent(tree, 1)
225232
rich.get_console().print(indented)
226233

234+
def _add_data_absent_extension(self, node: dict, parent: str) -> None:
235+
element = node.setdefault(parent, {})
236+
extensions = element.setdefault("extension", [])
237+
238+
# Check if the value is already marked as absent for any reason - leave it in place.
239+
# (though that would be weird, since the field was present or we wouldn't be in this path)
240+
for extension in extensions:
241+
if extension.get("url") == "http://hl7.org/fhir/StructureDefinition/data-absent-reason":
242+
return
243+
244+
# See https://hl7.org/fhir/extensions/StructureDefinition-data-absent-reason.html
245+
extensions.append(
246+
{
247+
"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason",
248+
"valueCode": "masked",
249+
}
250+
)
251+
227252
###############################################################################
228253
#
229254
# Individual checkers
@@ -402,7 +427,7 @@ def _check_attachments(resource_type: str, node_path: str, key: str, value: Any)
402427
and node_path == "root.content.attachment"
403428
and key in {"data", "url"}
404429
):
405-
raise SkipValue
430+
raise MaskValue
406431

407432
return value
408433

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
{"resourceType":"DocumentReference","id":"228b982ddae20b8da26a212666995acde914b941a4ff7c314adf89d02c3831f0","subject":{"reference":"Patient\/26f4d6d38eaa3347b8bd22bb4bc66ecbff5384926152738d282e841a247bfefb"},"context":{"encounter":[{"reference":"Encounter\/5388b42b262276bfbcb659b1ff937b0e3e5b0ec8901ed3ad53fa387fd6f2589f"}],"period":{"start":"2021-06-23","end":"2021-06-24"}},"type":{"coding":[{"code":"NOTE:149798455","system":"http:\/\/cumulus.smarthealthit.org\/i2b2","display":"Admission MD"}]},"status":"current","content":[{"attachment":{"contentType":"text\/plain"}}]}
2-
{"resourceType":"DocumentReference","id":"dfc45702900136d5fb09b8737853f5c727132882bd6ba0871942685c0b1df588","subject":{"reference":"Patient\/49fbb06b4b49eb49a096cf2a96674fb84a4d52ee74ec25c8f6f26023cb4764a7"},"context":{"encounter":[{"reference":"Encounter\/fb29ea2a68ca2e1e4bbe22bdeedf021d94ec89f7e3d38ecbe908a8f2b3d89687"}],"period":{"start":"2021-06-24","end":"2021-06-25"}},"type":{"coding":[{"code":"NOTE:149798455","system":"http:\/\/cumulus.smarthealthit.org\/i2b2","display":"Admission MD"}]},"status":"current","content":[{"attachment":{"contentType":"text\/plain"}}]}
1+
{"resourceType": "DocumentReference", "id": "228b982ddae20b8da26a212666995acde914b941a4ff7c314adf89d02c3831f0", "subject": {"reference": "Patient/26f4d6d38eaa3347b8bd22bb4bc66ecbff5384926152738d282e841a247bfefb"}, "context": {"encounter": [{"reference": "Encounter/5388b42b262276bfbcb659b1ff937b0e3e5b0ec8901ed3ad53fa387fd6f2589f"}], "period": {"start": "2021-06-23", "end": "2021-06-24"}}, "type": {"coding": [{"code": "NOTE:149798455", "system": "http://cumulus.smarthealthit.org/i2b2", "display": "Admission MD"}]}, "status": "current", "content": [{"attachment": {"contentType": "text/plain", "_data": {"extension": [{"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", "valueCode": "masked"}]}}}]}
2+
{"resourceType": "DocumentReference", "id": "dfc45702900136d5fb09b8737853f5c727132882bd6ba0871942685c0b1df588", "subject": {"reference": "Patient/49fbb06b4b49eb49a096cf2a96674fb84a4d52ee74ec25c8f6f26023cb4764a7"}, "context": {"encounter": [{"reference": "Encounter/fb29ea2a68ca2e1e4bbe22bdeedf021d94ec89f7e3d38ecbe908a8f2b3d89687"}], "period": {"start": "2021-06-24", "end": "2021-06-25"}}, "type": {"coding": [{"code": "NOTE:149798455", "system": "http://cumulus.smarthealthit.org/i2b2", "display": "Admission MD"}]}, "status": "current", "content": [{"attachment": {"contentType": "text/plain", "_data": {"extension": [{"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", "valueCode": "masked"}]}}}]}
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"id":"f29736c29af5b962b3947fd40bed6b8c3e97c642b72aaa08e082fec05148e7dd","content":[{"attachment":{"contentType":"text\/plain"}}],"context":{"encounter":[{"reference":"Encounter\/d30aad4b-4503-8e22-0bc4-621b94398520"}],"period":{"end":"2021-06-24","start":"2021-06-23"}},"status":"current","subject":{"reference":"Patient\/118dc10e-7745-20d7-e98d-7c358a84c15c"},"type":{"coding":[{"code":"NOTE:149798455","display":"Admission MD","system":"http:\/\/cumulus.smarthealthit.org\/i2b2"}]},"resourceType":"DocumentReference"}
1+
{"resourceType": "DocumentReference", "id": "f29736c29af5b962b3947fd40bed6b8c3e97c642b72aaa08e082fec05148e7dd", "content": [{"attachment": {"contentType": "text/plain", "_data": {"extension": [{"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", "valueCode": "masked"}]}}}], "context": {"encounter": [{"reference": "Encounter/d30aad4b-4503-8e22-0bc4-621b94398520"}], "period": {"end": "2021-06-24", "start": "2021-06-23"}}, "status": "current", "subject": {"reference": "Patient/118dc10e-7745-20d7-e98d-7c358a84c15c"}, "type": {"coding": [{"code": "NOTE:149798455", "display": "Admission MD", "system": "http://cumulus.smarthealthit.org/i2b2"}]}}
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"id":"c601849ceffe49dba22ee952533ac87928cd7a472dee6d0390d53c9130519971","content":[{"attachment":{"contentType":"text\/plain"}}],"context":{"encounter":[{"reference":"Encounter\/af1e6186-3f9a-1fa9-3c73-cfa56c84a056"}],"period":{"end":"2021-06-25","start":"2021-06-24"}},"status":"current","subject":{"reference":"Patient\/1de9ea66-70d3-da1f-c735-df5ef7697fb9"},"type":{"coding":[{"code":"NOTE:149798455","display":"Admission MD","system":"http:\/\/cumulus.smarthealthit.org\/i2b2"}]},"resourceType":"DocumentReference"}
1+
{"resourceType": "DocumentReference", "id": "c601849ceffe49dba22ee952533ac87928cd7a472dee6d0390d53c9130519971", "content": [{"attachment": {"contentType": "text/plain", "_data": {"extension": [{"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", "valueCode": "masked"}]}}}], "context": {"encounter": [{"reference": "Encounter/af1e6186-3f9a-1fa9-3c73-cfa56c84a056"}], "period": {"end": "2021-06-25", "start": "2021-06-24"}}, "status": "current", "subject": {"reference": "Patient/1de9ea66-70d3-da1f-c735-df5ef7697fb9"}, "type": {"coding": [{"code": "NOTE:149798455", "display": "Admission MD", "system": "http://cumulus.smarthealthit.org/i2b2"}]}}
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
{"id":"f29736c29af5b962b3947fd40bed6b8c3e97c642b72aaa08e082fec05148e7dd","content":[{"attachment":{"contentType":"text\/plain"}}],"context":{"encounter":[{"reference":"Encounter\/d30aad4b-4503-8e22-0bc4-621b94398520"}],"period":{"end":"2021-06-24","start":"2021-06-23"}},"status":"current","subject":{"reference":"Patient\/118dc10e-7745-20d7-e98d-7c358a84c15c"},"type":{"coding":[{"code":"NOTE:149798455","display":"Admission MD","system":"http:\/\/cumulus.smarthealthit.org\/i2b2"}]},"resourceType":"DocumentReference"}
2-
{"id":"c601849ceffe49dba22ee952533ac87928cd7a472dee6d0390d53c9130519971","content":[{"attachment":{"contentType":"text\/plain"}}],"context":{"encounter":[{"reference":"Encounter\/af1e6186-3f9a-1fa9-3c73-cfa56c84a056"}],"period":{"end":"2021-06-25","start":"2021-06-24"}},"status":"current","subject":{"reference":"Patient\/1de9ea66-70d3-da1f-c735-df5ef7697fb9"},"type":{"coding":[{"code":"NOTE:149798455","display":"Admission MD","system":"http:\/\/cumulus.smarthealthit.org\/i2b2"}]},"resourceType":"DocumentReference"}
1+
{"resourceType": "DocumentReference", "id": "f29736c29af5b962b3947fd40bed6b8c3e97c642b72aaa08e082fec05148e7dd", "content": [{"attachment": {"contentType": "text/plain", "_data": {"extension": [{"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", "valueCode": "masked"}]}}}], "context": {"encounter": [{"reference": "Encounter/d30aad4b-4503-8e22-0bc4-621b94398520"}], "period": {"end": "2021-06-24", "start": "2021-06-23"}}, "status": "current", "subject": {"reference": "Patient/118dc10e-7745-20d7-e98d-7c358a84c15c"}, "type": {"coding": [{"code": "NOTE:149798455", "display": "Admission MD", "system": "http://cumulus.smarthealthit.org/i2b2"}]}}
2+
{"resourceType": "DocumentReference", "id": "c601849ceffe49dba22ee952533ac87928cd7a472dee6d0390d53c9130519971", "content": [{"attachment": {"contentType": "text/plain", "_data": {"extension": [{"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", "valueCode": "masked"}]}}}], "context": {"encounter": [{"reference": "Encounter/af1e6186-3f9a-1fa9-3c73-cfa56c84a056"}], "period": {"end": "2021-06-25", "start": "2021-06-24"}}, "status": "current", "subject": {"reference": "Patient/1de9ea66-70d3-da1f-c735-df5ef7697fb9"}, "type": {"coding": [{"code": "NOTE:149798455", "display": "Admission MD", "system": "http://cumulus.smarthealthit.org/i2b2"}]}}

tests/deid/test_deid_scrubber.py

Lines changed: 72 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -58,13 +58,36 @@ def test_condition(self):
5858

5959
def test_documentreference(self):
6060
"""Test DocumentReference, which is interesting because of its list of encounters and attachments"""
61-
docref = i2b2_mock_data.documentreference()
62-
self.assertEqual("345", docref["id"])
63-
self.assertEqual("Patient/12345", docref["subject"]["reference"])
64-
self.assertEqual(1, len(docref["context"]["encounter"]))
65-
self.assertEqual("Encounter/67890", docref["context"]["encounter"][0]["reference"])
66-
self.assertEqual(1, len(docref["content"]))
67-
self.assertIsNotNone(docref["content"][0]["attachment"]["data"])
61+
docref = {
62+
"resourceType": "DocumentReference",
63+
"id": "345",
64+
"subject": {"reference": "Patient/12345"},
65+
"context": {
66+
"encounter": [{"reference": "Encounter/67890"}],
67+
},
68+
"content": [
69+
{
70+
"attachment": {
71+
"data": "aGVsbG8gd29ybGQ=",
72+
"url": "https://example.com/hello-world",
73+
},
74+
},
75+
{
76+
"attachment": {
77+
"data": "xxx",
78+
"_data": {
79+
"extension": [
80+
{
81+
"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason",
82+
"valueCode": "error",
83+
}
84+
],
85+
},
86+
"url": "https://example.com/hello-world",
87+
},
88+
},
89+
],
90+
}
6891

6992
scrubber = Scrubber()
7093
self.assertTrue(scrubber.scrub_resource(docref))
@@ -77,7 +100,48 @@ def test_documentreference(self):
77100
docref["context"]["encounter"][0]["reference"],
78101
f"Encounter/{scrubber.codebook.fake_id('Encounter', '67890')}",
79102
)
80-
self.assertNotIn("data", docref["content"][0]["attachment"])
103+
self.assertEqual(
104+
docref["content"][0]["attachment"],
105+
{
106+
"_data": {
107+
"extension": [
108+
{
109+
"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason",
110+
"valueCode": "masked",
111+
}
112+
]
113+
},
114+
"_url": {
115+
"extension": [
116+
{
117+
"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason",
118+
"valueCode": "masked",
119+
}
120+
]
121+
},
122+
},
123+
)
124+
self.assertEqual(
125+
docref["content"][1]["attachment"],
126+
{
127+
"_data": {
128+
"extension": [
129+
{
130+
"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason",
131+
"valueCode": "error", # we left this reason in place
132+
}
133+
]
134+
},
135+
"_url": {
136+
"extension": [
137+
{
138+
"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason",
139+
"valueCode": "masked",
140+
}
141+
]
142+
},
143+
},
144+
)
81145

82146
def test_contained_reference(self):
83147
"""Verify that we leave contained references contained but scrubbed"""

0 commit comments

Comments
 (0)