diff --git a/picard/util/__init__.py b/picard/util/__init__.py index 739ee1647b..24cfd4e586 100644 --- a/picard/util/__init__.py +++ b/picard/util/__init__.py @@ -85,6 +85,7 @@ gettext as _, gettext_constants, ) +from picard.util.datestr_util import sanitize_date # noqa: F401 winreg = None @@ -323,29 +324,6 @@ def format_time(ms, display_zero=False): return "%d:%02d:%02d" % (hours, minutes, seconds) -def sanitize_date(datestr): - """Sanitize date format. - - e.g.: "1980-00-00" -> "1980" - "1980- - " -> "1980" - "1980-00-23" -> "1980-00-23" - ... - """ - date = [] - for num in reversed(datestr.split("-")): - try: - num = int(num.strip()) - except ValueError: - if num == '': - num = 0 - else: - break - if num or (num == 0 and date): - date.append(num) - date.reverse() - return ("", "%04d", "%04d-%02d", "%04d-%02d-%02d")[len(date)] % tuple(date) - - def replace_win32_incompat(string, repl="_", replacements=None): # noqa: E302 """Replace win32 filename incompatible characters from ``string`` by ``repl``.""" diff --git a/picard/util/datestr_util.py b/picard/util/datestr_util.py new file mode 100644 index 0000000000..1277860a02 --- /dev/null +++ b/picard/util/datestr_util.py @@ -0,0 +1,248 @@ +# -*- coding: utf-8 -*- +# +# Picard, the next-generation MusicBrainz tagger +# +# Copyright (C) 2025 The MusicBrainz Team +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +"""Utility functions for date str/obj.""" + +import re + + +def _int_or_none(value: str) -> int | None: + value = value.strip() + if value == "": + return 0 + try: + return int(value) + except ValueError: + return None + + +def _clamp_year(value: int | None) -> int | None: + if value is None: + return None + if value < 0: + return None + if value > 9999: + return None + return value + + +def _is_valid_month(value: int) -> bool: + return 0 <= value <= 12 + + +def _is_valid_day(value: int) -> bool: + return 0 <= value <= 31 + + +def _parse_pure_year(value: str) -> str | None: + m = re.fullmatch(r"(\d{4})", value) + if not m: + return None + year = _clamp_year(int(m.group(1))) + if year in (None, 0): + return "" + return f"{year:04d}" + + +def _format_from_components(year: int | None, month: int | None, day: int | None) -> str: + y = _clamp_year(year if year is not None else -1) + if y is None: + return "" + # y could be 0 here for partial unknown dates; treat specially below + if month is None and day is None: + return "" if y == 0 else f"{y:04d}" + + m = month if month is not None else 0 + d = day if day is not None else 0 + if not _is_valid_month(m) or not _is_valid_day(d): + return "" + + if d == 0: + if m == 0: + return "" if y == 0 else f"{y:04d}" + return f"{y:04d}-{m:02d}" + + if m == 0: + return f"{y:04d}-00-{d:02d}" + + return f"{y:04d}-{m:02d}-{d:02d}" + + +def _parse_iso_like(value: str) -> str | None: + if "-" not in value: + return None + parts = value.split("-") + if not (1 <= len(parts) <= 3): + return None + + parsed: list[int] = [] + for part in reversed(parts): + num = _int_or_none(part) + if num is None: + return None + if num or (num == 0 and parsed): + parsed.append(num) + parsed.reverse() + + if len(parsed) == 1: + return 
_format_from_components(parsed[0], None, None) + if len(parsed) == 2: + y, m = parsed + return _format_from_components(y, m, None) + if len(parsed) == 3: + y, m, d = parsed + # Handle swapped middle and last if clearly YYYY-DD-MM + if m > 12 and d <= 12: + m, d = d, m + return _format_from_components(y, m, d) + return None + + +def _parse_slash_separated(value: str) -> str | None: + # We only expect 3 tokens (two slashes) + try: + tokens = value.split("/", 2) + a = _int_or_none(tokens[0]) + b = _int_or_none(tokens[1]) + c = _int_or_none(tokens[2]) + except IndexError: + return None + + if a is None or b is None or c is None or not (0 < c <= 9999): + return None + y = _clamp_year(c) or None + if y is None: + return "" + if a > 12 and _is_valid_month(b) and _is_valid_day(a): + d, m = a, b + elif b > 12 and _is_valid_month(a) and _is_valid_day(b): + m, d = a, b + else: + return None + if not _is_valid_month(m) or not _is_valid_day(d) or m == 0: + return "" + if d == 0: + return f"{y:04d}-{m:02d}" + return f"{y:04d}-{m:02d}-{d:02d}" + + +def _parse_compact_eight(value: str) -> str | None: + m = re.fullmatch(r"(\d{4})(\d{2})(\d{2})", value) + if not m: + return None + + y = _clamp_year(int(m.group(1))) + if y is None: + return "" + + a = int(m.group(2)) + b = int(m.group(3)) + if _is_valid_month(a) and _is_valid_day(b) and a != 0: + return f"{y:04d}-{a:02d}-{b:02d}" + if _is_valid_month(b) and _is_valid_day(a) and b != 0: + return f"{y:04d}-{b:02d}-{a:02d}" + if a == 0 and _is_valid_day(b): + return f"{y:04d}" + if _is_valid_month(a) and b == 0: + if a == 0: + return f"{y:04d}" + return f"{y:04d}-{a:02d}" + return "" + + +def sanitize_date(datestr: str, disable_sanitization: bool = False) -> str: + """Normalize a date string with optional sanitization bypass. + + Parameters + ---------- + datestr : str + Raw date string to normalize. 
Supported inputs include: + - "YYYY", "YYYY-MM", "YYYY-MM-DD" + - "YYYY-DD-MM" (normalized to "YYYY-MM-DD" when unambiguous) + - "DD/MM/YYYY" and "MM/DD/YYYY" (disambiguated heuristically) + - "YYYYMMDD" and "YYYYDDMM" + Unknown components may be provided as "00" (or left empty for + hyphenated inputs, e.g. "2006--"), meaning the component is unknown. + disable_sanitization : bool, default False + If True, returns the input unchanged. + + Returns + ------- + str + Normalized date in one of: "YYYY", "YYYY-MM", or "YYYY-MM-DD". + Returns an empty string if the input cannot be normalized when + sanitization is enabled. + + Notes + ----- + - Unknown components are preserved without shifting other components. + For example, month "00" with a known day stays as month "00": + "2005-00-12" -> "2005-00-12" (not "2005-12"). + - Day "00" drops the day component: "2005-12-00" -> "2005-12". + - Month "00" drops only the month when no day is given: "2005-00" -> "2005". + - Fully unknown date "0000-00-00" normalizes to an empty string, but + partially unknown values such as "0000-00-23" are preserved. 
+ + Examples + -------- + >>> sanitize_date('2005-12-00') + '2005-12' + >>> sanitize_date('2005-00-12') + '2005-00-12' + >>> sanitize_date('31/12/2005') + '2005-12-31' + >>> sanitize_date('20051231') + '2005-12-31' + >>> sanitize_date('20053112') + '2005-12-31' + >>> sanitize_date('2006--') + '2006' + """ + if disable_sanitization: + return datestr + + if not datestr or not isinstance(datestr, str): + return "" + + value = datestr.strip() + if value == "": + return "" + + # 1) Pure year + result = _parse_pure_year(value) + if result is not None: + return result + + # 2) Hyphen-separated ISO-like + result = _parse_iso_like(value) + if result is not None: + return result + + # 3) Slash-separated + result = _parse_slash_separated(value) + if result is not None: + return result + + # 4) Compact YYYYMMDD or YYYYDDMM + result = _parse_compact_eight(value) + if result is not None: + return result + + return "" diff --git a/test/test_dateutil_normalize.py b/test/test_dateutil_normalize.py new file mode 100644 index 0000000000..cbe28e0166 --- /dev/null +++ b/test/test_dateutil_normalize.py @@ -0,0 +1,108 @@ +# -*- coding: utf-8 -*- +# +# Picard, the next-generation MusicBrainz tagger +# +# Copyright (C) 2025 The MusicBrainz Team +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
"""Test sanitize_date function."""

from collections.abc import Callable

from picard.util import sanitize_date

import pytest


@pytest.fixture
def norm() -> Callable[[str], str]:
    """Provide the function under test under a short name."""
    return sanitize_date


@pytest.mark.parametrize(
    ("src", "expected"),
    [
        ("", ""),
        ("0", ""),
        ("0000", ""),
        ("2006", "2006"),
        ("2006--", "2006"),
        ("2006-00-02", "2006-00-02"),
        ("2006 ", "2006"),
        ("2006 02", ""),
        ("2006.02", ""),
        ("2006-02", "2006-02"),
        ("2006-02-00", "2006-02"),
        ("2006-00-00", "2006"),
        ("2006-02-23", "2006-02-23"),
        ("2006-00-23", "2006-00-23"),
        ("0000-00-23", "0000-00-23"),
        ("0000-02", "0000-02"),
        ("--23", "0000-00-23"),
    ],
)
def test_sanitize_date_basic(norm: Callable, src: str, expected: str) -> None:
    """Hyphenated and bare-year inputs normalize as expected."""
    assert norm(src) == expected


@pytest.mark.parametrize(
    ("src", "expected"),
    [
        ("2005-12-00", "2005-12"),
        ("2005-00-12", "2005-00-12"),  # bugfix: don't shift 00 month
        ("0000-00-12", "0000-00-12"),  # bugfix: don't become 0012
        ("0000-00-00", ""),
    ],
)
def test_sanitize_date_bug_cases(norm: Callable, src: str, expected: str) -> None:
    """Unknown ("00") components keep their position or are dropped."""
    assert norm(src) == expected


@pytest.mark.parametrize(
    ("src", "expected"),
    [
        ("31/12/2005", "2005-12-31"),
        ("12/31/2005", "2005-12-31"),
        ("20051231", "2005-12-31"),
        ("20053112", "2005-12-31"),
        # Middle component > 12 can only be a day, so it is read as YYYY-DD-MM.
        ("2006-13-01", "2006-01-13"),
    ],
)
def test_sanitize_date_other_formats(norm: Callable, src: str, expected: str) -> None:
    """Slash-separated, compact and day/month-swapped inputs are recognized."""
    assert norm(src) == expected


# With a single argname each parametrize entry must be the bare value.
# A 1-tuple such as ("nonsense",) would be passed as a tuple and rejected by
# the non-str guard, so the test would pass without exercising the parsers.
@pytest.mark.parametrize(
    "src",
    [
        "nonsense",
        "2006/13/01",
        "2006-13-32",
        "05/06/2005",  # day/month order is ambiguous
    ],
)
def test_sanitize_date_invalid(norm: Callable, src: str) -> None:
    """Inputs that cannot be normalized yield an empty string."""
    assert norm(src) == ""


@pytest.mark.parametrize(
    "src",
    [
        "0000-00-00",
        "2005-00-12",
        "2005-12-00",
    ],
)
def test_disable_sanitization_returns_input(src: str) -> None:
    """With sanitization disabled the input is returned unchanged."""
    assert sanitize_date(src, disable_sanitization=True) == src