From cf496b59aee098b6338bf710604c626a9fa63421 Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 17 Mar 2025 20:50:38 -0400 Subject: [PATCH 1/4] feat: implement date scrubbing --- llm_gateway/pii_scrubber.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/llm_gateway/pii_scrubber.py b/llm_gateway/pii_scrubber.py index ab02500..f309ae4 100644 --- a/llm_gateway/pii_scrubber.py +++ b/llm_gateway/pii_scrubber.py @@ -127,11 +127,28 @@ def scrub_postal_codes(text: str) -> str: ) +def scrub_dates(text: str) -> str: + """ + Scrub dates in text + + :param text: Input text to scrub + :type text: str + :return: Input text with any datess scrubbed + :rtype: str + """ + return re.sub( + r"\b(?:\d{1,2}[-/.]\d{1,2}[-/.]\d{4}|\d{4}[-/.]\d{1,2}[-/.]\d{1,2})\b", + "[REDACTED DATE]", + text, + ) + + ALL_SCRUBBERS = [ scrub_phone_numbers, scrub_credit_card_numbers, scrub_email_addresses, scrub_postal_codes, + scrub_dates, # move sin scrubber to the end since it's over-eager scrub_sin_numbers, ] From e8157897125de6573f99221577d62dc752fc3bc2 Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 17 Mar 2025 20:51:00 -0400 Subject: [PATCH 2/4] feat: add date scrub tests --- tests/test_pii_scrubber.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/test_pii_scrubber.py b/tests/test_pii_scrubber.py index 48e7913..e0fba92 100644 --- a/tests/test_pii_scrubber.py +++ b/tests/test_pii_scrubber.py @@ -41,6 +41,7 @@ from llm_gateway.pii_scrubber import ( scrub_all, scrub_credit_card_numbers, + scrub_dates, scrub_email_addresses, scrub_phone_numbers, scrub_postal_codes, @@ -145,6 +146,33 @@ def test_scrub_postal_codes(test_postal: str): assert scrub_postal_codes(test_text) == expected_text +@pytest.mark.parametrize( + argnames=["test_date"], + argvalues=[ + ("08/13/2004",), + ("13/08/2004",), + ("2004/13/08",), + ("2004/08/13",), + ("08-13-2004",), + ("13-08-2004",), + ("2004-13-08",), + ("2004-08-13",), + ("08.13.2004",), + ("13.08.2004",), + ("2004.13.08",), + ("2004.08.13",), + ("8/4/2004",), + ], +) +def test_scrub_dates(test_date: str): + """Test date scrubbing.""" + + format_str = "My birthday is {0}." + test_text = format_str.format(test_date) + expected_text = format_str.format("[REDACTED DATE]") + assert scrub_dates(test_text) == expected_text + + def test_scrub_all_dict(): """Test that scrub_all works on a dict.""" From 14de4306b1f46e32f13a776e2d13cb63d5213006 Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 17 Mar 2025 21:13:52 -0400 Subject: [PATCH 3/4] chore: update comments --- llm_gateway/pii_scrubber.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_gateway/pii_scrubber.py b/llm_gateway/pii_scrubber.py index f309ae4..a9285a7 100644 --- a/llm_gateway/pii_scrubber.py +++ b/llm_gateway/pii_scrubber.py @@ -133,7 +133,7 @@ def scrub_dates(text: str) -> str: :param text: Input text to scrub :type text: str - :return: Input text with any datess scrubbed + :return: Input text with any dates scrubbed :rtype: str """ return re.sub( From 2b12423f810e6d2e22b0bacd85372113aa66325a Mon Sep 17 00:00:00 2001 From: Colin Date: Tue, 18 Mar 2025 09:45:58 -0400 Subject: [PATCH 4/4] chore: update date tests --- tests/test_pii_scrubber.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/tests/test_pii_scrubber.py b/tests/test_pii_scrubber.py index e0fba92..450a2f9 100644 --- a/tests/test_pii_scrubber.py +++ b/tests/test_pii_scrubber.py @@ -149,19 +149,19 @@ def test_scrub_postal_codes(test_postal: str): @pytest.mark.parametrize( argnames=["test_date"], argvalues=[ - ("08/13/2004",), - ("13/08/2004",), - ("2004/13/08",), - ("2004/08/13",), - ("08-13-2004",), - ("13-08-2004",), - ("2004-13-08",), - ("2004-08-13",), - ("08.13.2004",), - ("13.08.2004",), - ("2004.13.08",), - ("2004.08.13",), - ("8/4/2004",), + ("01/01/2000",), + ("02/02/2000",), + ("2000/03/03",), + ("2000/04/04",), + ("05-05-2000",), + ("06-06-2000",), + ("2000-07-07",), + ("2000-08-08",), + ("09.09.2000",), + ("10.10.2000",), + ("2000.11.11",), + ("2000.12.12",), + ("1/1/2000",), ], ) def test_scrub_dates(test_date: str): @@ -246,6 +246,7 @@ def to_dict(self): "My credit card number is 1234-5678-9012-3456", "The user's email is email@123.123.123.123, AKA email@domain.ca", "The user's postal code is A1A 1A1, AKA a1a1A1", + "The user's birthday is 1/1/2000", ], ) @@ -256,6 +257,7 @@ def to_dict(self): "My credit card number is [REDACTED CREDIT CARD NUMBER]", "The user's email is [REDACTED EMAIL ADDRESS], AKA [REDACTED EMAIL ADDRESS]", "The user's postal code is [REDACTED POSTAL CODE], AKA [REDACTED POSTAL CODE]", + "The user's birthday is [REDACTED DATE]", ] # Truncate the result. called_with contains an extra message - the mock response,