Skip to content

feat/pii date scrubbing #88

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions llm_gateway/pii_scrubber.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,11 +127,28 @@ def scrub_postal_codes(text: str) -> str:
)


def scrub_dates(text: str) -> str:
    """
    Replace numeric dates in text with a redaction marker

    A date is recognised as three digit groups joined by ``-``, ``/`` or
    ``.``: either two groups of one or two digits followed by a four-digit
    group (e.g. ``1/1/2000``), or a four-digit group followed by two groups
    of one or two digits (e.g. ``2000-01-01``). The match must start and end
    on a word boundary.

    :param text: Input text to scrub
    :type text: str
    :return: Input text with every matched date replaced by ``[REDACTED DATE]``
    :rtype: str
    """
    date_pattern = re.compile(
        r"\b(?:\d{1,2}[-/.]\d{1,2}[-/.]\d{4}|\d{4}[-/.]\d{1,2}[-/.]\d{1,2})\b"
    )
    return date_pattern.sub("[REDACTED DATE]", text)


# Scrubber functions gathered in one list. The order of the entries is
# deliberate (see the note on the last entry).
ALL_SCRUBBERS = [
    scrub_phone_numbers,
    scrub_credit_card_numbers,
    scrub_email_addresses,
    scrub_postal_codes,
    scrub_dates,
    # Keep the SIN scrubber at the end since it's over-eager.
    # NOTE(review): presumably the list is applied in order, so the more
    # specific scrubbers above redact their matches first -- confirm
    # against the code that consumes this list.
    scrub_sin_numbers,
]
30 changes: 30 additions & 0 deletions tests/test_pii_scrubber.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
from llm_gateway.pii_scrubber import (
scrub_all,
scrub_credit_card_numbers,
scrub_dates,
scrub_email_addresses,
scrub_phone_numbers,
scrub_postal_codes,
Expand Down Expand Up @@ -145,6 +146,33 @@ def test_scrub_postal_codes(test_postal: str):
assert scrub_postal_codes(test_text) == expected_text


@pytest.mark.parametrize(
    argnames=["test_date"],
    # One single-element tuple per case, built from a flat list of samples
    # covering "/", "-" and "." separators in both year-last and year-first
    # order, plus one date without zero padding.
    argvalues=[
        (sample,)
        for sample in (
            "01/01/2000",
            "02/02/2000",
            "2000/03/03",
            "2000/04/04",
            "05-05-2000",
            "06-06-2000",
            "2000-07-07",
            "2000-08-08",
            "09.09.2000",
            "10.10.2000",
            "2000.11.11",
            "2000.12.12",
            "1/1/2000",
        )
    ],
)
def test_scrub_dates(test_date: str):
    """Check that a date embedded in a sentence is replaced by the marker."""

    template = "My birthday is {0}."
    scrubbed = scrub_dates(template.format(test_date))
    assert scrubbed == template.format("[REDACTED DATE]")


def test_scrub_all_dict():
"""Test that scrub_all works on a dict."""

Expand Down Expand Up @@ -218,6 +246,7 @@ def to_dict(self):
"My credit card number is 1234-5678-9012-3456",
"The user's email is email@123.123.123.123, AKA email@domain.ca",
"The user's postal code is A1A 1A1, AKA a1a1A1",
"The user's birthday is 1/1/2000",
],
)

Expand All @@ -228,6 +257,7 @@ def to_dict(self):
"My credit card number is [REDACTED CREDIT CARD NUMBER]",
"The user's email is [REDACTED EMAIL ADDRESS], AKA [REDACTED EMAIL ADDRESS]",
"The user's postal code is [REDACTED POSTAL CODE], AKA [REDACTED POSTAL CODE]",
"The user's birthday is [REDACTED DATE]",
]

# Truncate the result. called_with contains an extra message - the mock response,
Expand Down