Skip to content

Commit e7c3ac5

Browse files
committed
Add more resources
1 parent 2a2834d commit e7c3ac5

File tree

13 files changed

+104
-27
lines changed

13 files changed

+104
-27
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
repos:
22
- repo: https://github.com/astral-sh/ruff-pre-commit
3-
rev: v0.12.1 # keep in rough sync with pyproject.toml
3+
rev: v0.13.0 # keep in rough sync with pyproject.toml
44
hooks:
55
- name: Ruff formatting
66
id: ruff-format

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ RUN python3 -m nltk.downloader -d /usr/local/share/nltk_data averaged_perceptron
4141
COPY . /app
4242

4343
ARG ETL_VERSION
44-
RUN [ -z "$ETL_VERSION" ] || sed -i "s/0\.0\.0/$ETL_VERSION/" /app/cumulus_etl/__init__.py
44+
RUN [ -z "$ETL_VERSION" ] || sed -i "s/1\!0\.0\.0/$ETL_VERSION/" /app/cumulus_etl/__init__.py
4545
# Print the final version we're using
4646
RUN grep __version__ /app/cumulus_etl/__init__.py
4747

cumulus_etl/deid/ms-config.json

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,27 @@
283283
{"path": "Immunization.protocolApplied.doseNumber", "method": "keep"}, // caution: non-PHI freeform string
284284
{"path": "Immunization.protocolApplied.seriesDoses", "method": "keep"}, // caution: non-PHI freeform string
285285

286+
// ** Location: https://www.hl7.org/fhir/R4/location.html **
287+
// This is not a patient-linked resource, so we aren't as worried about PHI here.
288+
{"path": "Location.identifier.where(system='http://hl7.org/fhir/sid/us-npi')", "method": "keep"},
289+
{"path": "Location.status", "method": "keep"},
290+
{"path": "Location.operationalStatus", "method": "keep"},
291+
{"path": "Location.name", "method": "keep"},
292+
{"path": "Location.alias", "method": "keep"},
293+
// Skip Location.description
294+
{"path": "Location.mode", "method": "keep"},
295+
{"path": "Location.type", "method": "keep"},
296+
// Skip Location.telecom and address
297+
{"path": "Location.physicalType", "method": "keep"},
298+
// Skip Location.position
299+
{"path": "Location.managingOrganization", "method": "keep"},
300+
{"path": "Location.partOf", "method": "keep"},
301+
{"path": "Location.hoursOfOperation.daysOfWeek", "method": "keep"},
302+
{"path": "Location.hoursOfOperation.allDay", "method": "keep"},
303+
{"path": "Location.hoursOfOperation.openingTime", "method": "keep"},
304+
{"path": "Location.hoursOfOperation.closingTime", "method": "keep"},
305+
// Skip Location.availabilityExceptions and endpoint
306+
286307
// ** Medication: https://www.hl7.org/fhir/R4/medication.html **
287308
// Skip Medication.identifier
288309
{"path": "Medication.code", "method": "keep"},
@@ -368,6 +389,17 @@
368389
{"path": "Observation.component.interpretation", "method": "keep"},
369390
// Skip Observation.component.referenceRange
370391

392+
// ** Organization: https://www.hl7.org/fhir/R4/organization.html **
393+
// This is not a patient-linked resource, so we aren't as worried about PHI here.
394+
{"path": "Organization.identifier.where(system='http://hl7.org/fhir/sid/us-npi')", "method": "keep"},
395+
{"path": "Organization.active", "method": "keep"},
396+
{"path": "Organization.type", "method": "keep"},
397+
{"path": "Organization.name", "method": "keep"},
398+
{"path": "Organization.alias", "method": "keep"},
399+
// Skip Organization.telecom and address
400+
{"path": "Organization.partOf", "method": "keep"},
401+
// Skip Organization.contact and endpoint
402+
371403
// ** Patient: https://www.hl7.org/fhir/R4/patient.html **
372404
// Skip Patient.identifier
373405
{"path": "Patient.active", "method": "keep"},
@@ -396,6 +428,44 @@
396428
{"path": "Patient.link.other", "method": "keep"},
397429
{"path": "Patient.link.type", "method": "keep"},
398430

431+
// ** Practitioner: https://www.hl7.org/fhir/R4/practitioner.html **
432+
// This is not a patient-linked resource, so we aren't as worried about PHI here.
433+
// But it's also a person, so exhibit some caution.
434+
{"path": "Practitioner.identifier.where(system='http://hl7.org/fhir/sid/us-npi')", "method": "keep"},
435+
{"path": "Practitioner.active", "method": "keep"},
436+
// Skip Practitioner.name, telecom, and address
437+
{"path": "Practitioner.gender", "method": "keep"},
438+
{"path": "Practitioner.birthDate", "method": "generalize",
439+
// keep just the year for privacy (note: 90+ HIPAA grouping is done downstream in SQL)
440+
"cases": {"true": "$this.toString().replaceMatches('^(?<year>\\\\d+).*', '${year}')"}},
441+
// Skip Practitioner.photo
442+
{"path": "Practitioner.qualification.identifier", "method": "keep"},
443+
{"path": "Practitioner.qualification.code", "method": "keep"},
444+
{"path": "Practitioner.qualification.period", "method": "keep"},
445+
{"path": "Practitioner.qualification.issuer", "method": "keep"},
446+
{"path": "Practitioner.communication", "method": "keep"},
447+
448+
// ** PractitionerRole: https://www.hl7.org/fhir/R4/practitionerrole.html **
449+
// This is not a patient-linked resource, so we aren't as worried about PHI here.
450+
// But it's also closely associated with a person, so exhibit some caution.
451+
{"path": "PractitionerRole.identifier.where(system='http://hl7.org/fhir/sid/us-npi')", "method": "keep"},
452+
{"path": "PractitionerRole.active", "method": "keep"},
453+
{"path": "PractitionerRole.period", "method": "keep"},
454+
{"path": "PractitionerRole.practitioner", "method": "keep"},
455+
{"path": "PractitionerRole.organization", "method": "keep"},
456+
{"path": "PractitionerRole.code", "method": "keep"},
457+
{"path": "PractitionerRole.specialty", "method": "keep"},
458+
{"path": "PractitionerRole.location", "method": "keep"},
459+
{"path": "PractitionerRole.healthcareService", "method": "keep"},
460+
// Skip PractitionerRole.telecom
461+
{"path": "PractitionerRole.availableTime.daysOfWeek", "method": "keep"},
462+
{"path": "PractitionerRole.availableTime.allDay", "method": "keep"},
463+
{"path": "PractitionerRole.availableTime.availableStartTime", "method": "keep"},
464+
{"path": "PractitionerRole.availableTime.availableEndTime", "method": "keep"},
465+
// Skip PractitionerRole.notAvailable.description
466+
{"path": "PractitionerRole.notAvailable.during", "method": "keep"},
467+
// Skip PractitionerRole.availabilityExceptions and endpoint
468+
399469
// ** Procedure: https://www.hl7.org/fhir/R4/procedure.html **
400470
// Skip Procedure.identifier
401471
{"path": "Procedure.instantiatesCanonical", "method": "keep"},

cumulus_etl/deid/scrubber.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,7 @@ def _check_extensions(
329329
"http://open.epic.com/FHIR/STU3/StructureDefinition/patient-preferred-provider-sex",
330330
"http://open.epic.com/FHIR/STU3/StructureDefinition/temperature-in-fahrenheit",
331331
"http://open.epic.com/FHIR/R4/StructureDefinition/patient-preferred-provider-sex",
332+
"https://open.epic.com/fhir/extensions/specialty",
332333
"https://open.epic.com/FHIR/StructureDefinition/patient-merge-target-reference",
333334
"https://open.epic.com/FHIR/StructureDefinition/patient-merge-unmerge-instant",
334335
# A Netherlands extension used by Epic

cumulus_etl/etl/studies/covid_symptom/covid_tasks.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -81,9 +81,7 @@ def is_ed_docref(docref) -> bool:
8181
class BaseCovidCtakesTask(tasks.BaseNlpTask):
8282
"""Covid Symptom study task, to generate symptom lists from ED notes using cTAKES + a polarity check"""
8383

84-
tags: ClassVar = {"covid_symptom", "gpu"}
85-
86-
# Subclasses: set name, tags, and polarity_model yourself
84+
# Subclasses: set name and polarity_model yourself
8785
polarity_model = None
8886

8987
# Use a shared task_version for subclasses, to make sharing the ctakes cache folder easier
@@ -245,7 +243,6 @@ class BaseCovidGptTask(tasks.BaseOpenAiTask):
245243
"""Covid Symptom study task, using GPT"""
246244

247245
outputs: ClassVar = [tasks.OutputTable(resource_type=None)]
248-
tags: ClassVar = {"covid_symptom", "cpu"}
249246
system_prompt = "You are a helpful assistant."
250247
user_prompt = (
251248
"### Instructions ###\n"

cumulus_etl/etl/studies/irae/irae_tasks.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -433,22 +433,18 @@ class BaseIraeTask(tasks.BaseOpenAiTaskWithSpans):
433433
class IraeGpt4oTask(BaseIraeTask):
434434
name = "irae__nlp_gpt4o"
435435
client_class = nlp.Gpt4oModel
436-
tags: ClassVar = {"irae", "cpu"}
437436

438437

439438
class IraeGpt5Task(BaseIraeTask):
440439
name = "irae__nlp_gpt5"
441440
client_class = nlp.Gpt5Model
442-
tags: ClassVar = {"irae", "cpu"}
443441

444442

445443
class IraeGptOss120bTask(BaseIraeTask):
446444
name = "irae__nlp_gpt_oss_120b"
447445
client_class = nlp.GptOss120bModel
448-
tags: ClassVar = {"irae", "gpu"}
449446

450447

451448
class IraeLlama4ScoutTask(BaseIraeTask):
452449
name = "irae__nlp_llama4_scout"
453450
client_class = nlp.Llama4ScoutModel
454-
tags: ClassVar = {"irae", "gpu"}

cumulus_etl/etl/tasks/base.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,6 @@ class EtlTask:
9090
name: ClassVar[str] = None # task & table name
9191
# incoming resource that this task operates on (will be included in bulk exports etc)
9292
resource: ClassVar[str | set[str]] = None
93-
tags: ClassVar[set[str]] = []
9493
# whether this task needs bulk MS tool de-id run on its inputs (NLP tasks usually don't)
9594
needs_bulk_deid: ClassVar[bool] = True
9695

cumulus_etl/etl/tasks/basic_tasks.py

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,39 +15,33 @@
1515
class AllergyIntoleranceTask(tasks.EtlTask):
1616
name: ClassVar = "allergyintolerance"
1717
resource: ClassVar = "AllergyIntolerance"
18-
tags: ClassVar = {"cpu"}
1918

2019

2120
class ConditionTask(tasks.EtlTask):
2221
name: ClassVar = "condition"
2322
resource: ClassVar = "Condition"
24-
tags: ClassVar = {"cpu"}
2523

2624

2725
class DeviceTask(tasks.EtlTask):
2826
name: ClassVar = "device"
2927
resource: ClassVar = "Device"
30-
tags: ClassVar = {"cpu"}
3128

3229

3330
class DiagnosticReportTask(tasks.EtlTask):
3431
name: ClassVar = "diagnosticreport"
3532
resource: ClassVar = "DiagnosticReport"
36-
tags: ClassVar = {"cpu"}
3733

3834

3935
class DocumentReferenceTask(tasks.EtlTask):
4036
name: ClassVar = "documentreference"
4137
resource: ClassVar = "DocumentReference"
42-
tags: ClassVar = {"cpu"}
4338

4439

4540
class EncounterTask(tasks.EtlTask):
4641
"""Processes Encounter FHIR resources"""
4742

4843
name: ClassVar = "encounter"
4944
resource: ClassVar = "Encounter"
50-
tags: ClassVar = {"cpu"}
5145

5246
# Encounters are a little more complicated than normal FHIR resources.
5347
# We also write out a table tying Encounters to a group name, for completion tracking.
@@ -80,15 +74,18 @@ def get_schema(cls, resource_type: str | None, rows: list[dict]) -> pyarrow.Sche
8074
class ImmunizationTask(tasks.EtlTask):
8175
name: ClassVar = "immunization"
8276
resource: ClassVar = "Immunization"
83-
tags: ClassVar = {"cpu"}
77+
78+
79+
class LocationTask(tasks.EtlTask):
80+
name = "location"
81+
resource = "Location"
8482

8583

8684
class MedicationRequestTask(tasks.EtlTask):
8785
"""Write MedicationRequest resources and associated Medication resources"""
8886

8987
name: ClassVar = "medicationrequest"
9088
resource: ClassVar = "MedicationRequest"
91-
tags: ClassVar = {"cpu"}
9289

9390
# We may write to a second Medication table as we go.
9491
# MedicationRequest can have inline medications via CodeableConcepts, or external Medication
@@ -194,22 +191,33 @@ async def read_entries(self, *, progress: rich.progress.Progress = None) -> task
194191
class ObservationTask(tasks.EtlTask):
195192
name: ClassVar = "observation"
196193
resource: ClassVar = "Observation"
197-
tags: ClassVar = {"cpu"}
194+
195+
196+
class OrganizationTask(tasks.EtlTask):
197+
name = "organization"
198+
resource = "Organization"
198199

199200

200201
class PatientTask(tasks.EtlTask):
201202
name: ClassVar = "patient"
202203
resource: ClassVar = "Patient"
203-
tags: ClassVar = {"cpu"}
204+
205+
206+
class PractitionerTask(tasks.EtlTask):
207+
name = "practitioner"
208+
resource = "Practitioner"
209+
210+
211+
class PractitionerRoleTask(tasks.EtlTask):
212+
name = "practitionerrole"
213+
resource = "PractitionerRole"
204214

205215

206216
class ProcedureTask(tasks.EtlTask):
207217
name: ClassVar = "procedure"
208218
resource: ClassVar = "Procedure"
209-
tags: ClassVar = {"cpu"}
210219

211220

212221
class ServiceRequestTask(tasks.EtlTask):
213222
name: ClassVar = "servicerequest"
214223
resource: ClassVar = "ServiceRequest"
215-
tags: ClassVar = {"cpu"}

cumulus_etl/etl/tasks/nlp_task.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ class BaseNlpTask(tasks.EtlTask):
3535
# maybe add a group_field? (remember to call self.seen_docrefs.add() if so)
3636
tasks.OutputTable(resource_type=None)
3737
]
38-
tags: ClassVar = {"gpu"} # maybe a study identifier?
3938

4039
# Task Version
4140
# The "task_version" field is a simple integer that gets incremented any time an NLP-relevant parameter is changed.

cumulus_etl/etl/tasks/task_factory.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,12 @@ def get_default_tasks() -> list[type[AnyTask]]:
6363
basic_tasks.DiagnosticReportTask,
6464
basic_tasks.DocumentReferenceTask,
6565
basic_tasks.ImmunizationTask,
66+
basic_tasks.LocationTask,
6667
basic_tasks.MedicationRequestTask,
6768
basic_tasks.ObservationTask,
69+
basic_tasks.OrganizationTask,
70+
basic_tasks.PractitionerTask,
71+
basic_tasks.PractitionerRoleTask,
6872
basic_tasks.ProcedureTask,
6973
basic_tasks.ServiceRequestTask,
7074
]

0 commit comments

Comments
 (0)