Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 1 addition & 23 deletions picard/util/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
gettext as _,
gettext_constants,
)
from picard.util.datestr_util import sanitize_date # noqa: F401


winreg = None
Expand Down Expand Up @@ -323,29 +324,6 @@ def format_time(ms, display_zero=False):
return "%d:%02d:%02d" % (hours, minutes, seconds)


def sanitize_date(datestr):
    """Sanitize date format.

    The string is split on "-" and read from the last component backwards.
    Trailing components that are zero or blank are dropped, so only the
    known leading part of the date is kept.

    e.g.: "1980-00-00" -> "1980"
          "1980- - " -> "1980"
          "1980-00-23" -> "1980-00-23"
          ...

    Reading stops at the first component that is neither blank nor a
    number; everything before it is ignored. Input with more than three
    numeric components cannot be formatted as a date and yields "".
    """
    date = []
    for part in reversed(datestr.split("-")):
        # Strip before testing for blankness, otherwise a component such as
        # " " is treated as garbage instead of as an unknown value.
        part = part.strip()
        if part == "":
            num = 0
        else:
            try:
                num = int(part)
            except ValueError:
                break
        # Skip trailing zeros, but keep a zero once a later component exists.
        if num or date:
            date.append(num)
    if len(date) > 3:
        # No matching format below; avoid an IndexError on the tuple lookup.
        return ""
    date.reverse()
    return ("", "%04d", "%04d-%02d", "%04d-%02d-%02d")[len(date)] % tuple(date)


def replace_win32_incompat(string, repl="_", replacements=None): # noqa: E302
"""Replace win32 filename incompatible characters from ``string`` by
``repl``."""
Expand Down
248 changes: 248 additions & 0 deletions picard/util/datestr_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@
# -*- coding: utf-8 -*-
#
# Picard, the next-generation MusicBrainz tagger
#
# Copyright (C) 2025 The MusicBrainz Team
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

"""Utility functions for date str/obj."""

import re


def _int_or_none(value: str) -> int | None:
value = value.strip()
if value == "":
return 0
try:
return int(value)
except ValueError:
return None


def _clamp_year(value: int | None) -> int | None:
if value is None:
return None
if value < 0:
return None
if value > 9999:
return None
return value


def _is_valid_month(value: int) -> bool:
    """Tell whether ``value`` lies in 0..12 (0 is allowed for an unknown month)."""
    lowest, highest = 0, 12
    return lowest <= value <= highest


def _is_valid_day(value: int) -> bool:
    """Tell whether ``value`` lies in 0..31 (0 is allowed for an unknown day).

    The check is independent of the month; month lengths are not considered.
    """
    lowest, highest = 0, 31
    return lowest <= value <= highest


def _parse_pure_year(value: str) -> str | None:
    """Handle input that consists of exactly four digits.

    Returns ``None`` when ``value`` is not a bare four-digit year, so the
    caller can try another format. Otherwise returns the zero-padded year,
    or an empty string for year 0.
    """
    match = re.fullmatch(r"(\d{4})", value)
    if match is None:
        return None
    year = _clamp_year(int(match.group(1)))
    if year is None or year == 0:
        return ""
    return f"{year:04d}"


def _format_from_components(year: int | None, month: int | None, day: int | None) -> str:
    """Build the normalized date string from numeric components.

    ``None`` and ``0`` both mean "unknown" for month and day. Returns an
    empty string when the year is missing or out of range, when month or
    day is out of range, or when every component is zero. An unknown day is
    dropped; an unknown month is kept as "00" when a day is present.
    """
    # A missing year is turned into -1 so the range check rejects it.
    checked_year = _clamp_year(-1 if year is None else year)
    if checked_year is None:
        return ""

    # Year 0 on its own carries no information, so it formats as empty.
    year_only = "" if checked_year == 0 else f"{checked_year:04d}"
    if month is None and day is None:
        return year_only

    month_num = 0 if month is None else month
    day_num = 0 if day is None else day
    if not (_is_valid_month(month_num) and _is_valid_day(day_num)):
        return ""

    if day_num != 0:
        if month_num == 0:
            # Known day with unknown month: keep the month slot as "00".
            return f"{checked_year:04d}-00-{day_num:02d}"
        return f"{checked_year:04d}-{month_num:02d}-{day_num:02d}"

    if month_num != 0:
        return f"{checked_year:04d}-{month_num:02d}"
    return year_only


def _parse_iso_like(value: str) -> str | None:
if "-" not in value:
return None
parts = value.split("-")
if not (1 <= len(parts) <= 3):
return None

parsed: list[int] = []
for part in reversed(parts):
num = _int_or_none(part)
if num is None:
return None
if num or (num == 0 and parsed):
parsed.append(num)
parsed.reverse()

if len(parsed) == 1:
return _format_from_components(parsed[0], None, None)
if len(parsed) == 2:
y, m = parsed
return _format_from_components(y, m, None)
if len(parsed) == 3:
y, m, d = parsed
# Handle swapped middle and last if clearly YYYY-DD-MM
if m > 12 and d <= 12:
m, d = d, m
return _format_from_components(y, m, d)
return None


def _parse_slash_separated(value: str) -> str | None:
    """Handle "DD/MM/YYYY" and "MM/DD/YYYY" input.

    The order of day and month is decided from the numbers themselves: a
    value above 12 can only be the day. When both numbers are equal the
    order does not matter. Any other combination is ambiguous.

    Returns ``None`` when ``value`` does not have exactly three
    slash-separated numeric tokens, when the year is outside 1..9999, or
    when day and month cannot be told apart. Returns an empty string when
    the month is 0. Otherwise returns "YYYY-MM-DD".
    """
    tokens = value.split("/")
    if len(tokens) != 3:
        return None

    first, second, year = (_int_or_none(token) for token in tokens)
    if first is None or second is None or year is None:
        return None
    if not (0 < year <= 9999):
        return None

    if first > 12 and _is_valid_month(second) and _is_valid_day(first):
        day, month = first, second
    elif second > 12 and _is_valid_month(first) and _is_valid_day(second):
        month, day = first, second
    elif first == second and 1 <= first <= 12:
        # Both readings give the same date, e.g. "12/12/2005".
        month = day = first
    else:
        return None

    # The day is always known here (> 12 or equal to a non-zero month), so
    # only an unknown month remains to be rejected.
    if month == 0:
        return ""
    return f"{year:04d}-{month:02d}-{day:02d}"


def _parse_compact_eight(value: str) -> str | None:
m = re.fullmatch(r"(\d{4})(\d{2})(\d{2})", value)
if not m:
return None

y = _clamp_year(int(m.group(1)))
if y is None:
return ""

a = int(m.group(2))
b = int(m.group(3))
if _is_valid_month(a) and _is_valid_day(b) and a != 0:
return f"{y:04d}-{a:02d}-{b:02d}"
if _is_valid_month(b) and _is_valid_day(a) and b != 0:
return f"{y:04d}-{b:02d}-{a:02d}"
if a == 0 and _is_valid_day(b):
return f"{y:04d}"
if _is_valid_month(a) and b == 0:
if a == 0:
return f"{y:04d}"
return f"{y:04d}-{a:02d}"
return ""


def sanitize_date(datestr: str, disable_sanitization: bool = False) -> str:
    """Bring a date string into a canonical form.

    Parameters
    ----------
    datestr : str
        The raw date. Recognized layouts:
        - "YYYY", "YYYY-MM", "YYYY-MM-DD"
        - "YYYY-DD-MM" (rewritten as "YYYY-MM-DD" when unambiguous)
        - "DD/MM/YYYY" and "MM/DD/YYYY" (told apart heuristically)
        - "YYYYMMDD" and "YYYYDDMM"
        A component that is not known may be written as "00"; in the
        hyphenated layouts it may also be left empty, e.g. "2006--".
    disable_sanitization : bool, default False
        When True, ``datestr`` is handed back untouched.

    Returns
    -------
    str
        The date as "YYYY", "YYYY-MM", or "YYYY-MM-DD". With sanitization
        enabled, input that cannot be normalized gives an empty string.

    Notes
    -----
    - Known components are never shifted into the place of unknown ones.
      A month of "00" next to a known day stays "00":
      "2005-00-12" -> "2005-00-12" (not "2005-12").
    - A day of "00" removes the day: "2005-12-00" -> "2005-12".
    - A month of "00" without a day removes the month: "2005-00" -> "2005".
    - The fully unknown date "0000-00-00" becomes an empty string, while
      partially unknown values such as "0000-00-23" are kept.

    Examples
    --------
    >>> sanitize_date('2005-12-00')
    '2005-12'
    >>> sanitize_date('2005-00-12')
    '2005-00-12'
    >>> sanitize_date('31/12/2005')
    '2005-12-31'
    >>> sanitize_date('20051231')
    '2005-12-31'
    >>> sanitize_date('20053112')
    '2005-12-31'
    >>> sanitize_date('2006--')
    '2006'
    """
    if disable_sanitization:
        return datestr

    if not (datestr and isinstance(datestr, str)):
        return ""

    value = datestr.strip()
    if not value:
        return ""

    # Tried in this order; a parser returns None when the input is not in
    # its format, and the first non-None answer is final.
    parsers = (
        _parse_pure_year,  # 1) Pure year
        _parse_iso_like,  # 2) Hyphen-separated ISO-like
        _parse_slash_separated,  # 3) Slash-separated
        _parse_compact_eight,  # 4) Compact YYYYMMDD or YYYYDDMM
    )
    for parse in parsers:
        result = parse(value)
        if result is not None:
            return result

    return ""
108 changes: 108 additions & 0 deletions test/test_dateutil_normalize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# -*- coding: utf-8 -*-
#
# Picard, the next-generation MusicBrainz tagger
#
# Copyright (C) 2025 The MusicBrainz Team
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

"""Test sanitize_date function."""

from collections.abc import Callable

from picard.util import sanitize_date

import pytest


@pytest.fixture
def norm() -> Callable[[str], str]:
    """Provide the function under test so cases can call it as ``norm(src)``."""
    function_under_test = sanitize_date
    return function_under_test


@pytest.mark.parametrize(
    ("src", "expected"),
    [
        # Empty and all-zero input
        ("", ""),
        ("0", ""),
        ("0000", ""),
        # Year with blank, zero or unsupported trailing parts
        ("2006", "2006"),
        ("2006--", "2006"),
        ("2006-00-02", "2006-00-02"),
        ("2006 ", "2006"),
        ("2006 02", ""),
        ("2006.02", ""),
        # Year-month and full dates
        ("2006-02", "2006-02"),
        ("2006-02-00", "2006-02"),
        ("2006-00-00", "2006"),
        ("2006-02-23", "2006-02-23"),
        ("2006-00-23", "2006-00-23"),
        # Unknown year
        ("0000-00-23", "0000-00-23"),
        ("0000-02", "0000-02"),
        ("--23", "0000-00-23"),
    ],
)
def test_sanitize_date_basic(norm: Callable, src: str, expected: str) -> None:
    """Bare-year and hyphenated inputs normalize to the expected string."""
    actual = norm(src)
    assert actual == expected


@pytest.mark.parametrize(
    ("src", "expected"),
    [
        ("2005-12-00", "2005-12"),
        ("2005-00-12", "2005-00-12"),  # bugfix: don't shift 00 month
        ("0000-00-12", "0000-00-12"),  # bugfix: don't become 0012
        ("0000-00-00", ""),
    ],
)
def test_sanitize_date_bug_cases(norm: Callable, src: str, expected: str) -> None:
    """Zero components are dropped or kept in place, never shifted."""
    actual = norm(src)
    assert actual == expected


@pytest.mark.parametrize(
    ("src", "expected"),
    [
        # Slash-separated, day first and month first
        ("31/12/2005", "2005-12-31"),
        ("12/31/2005", "2005-12-31"),
        # Compact eight digits, month first and day first
        ("20051231", "2005-12-31"),
        ("20053112", "2005-12-31"),
    ],
)
def test_sanitize_date_other_formats(norm: Callable, src: str, expected: str) -> None:
    """Slash-separated and compact inputs normalize to "YYYY-MM-DD"."""
    actual = norm(src)
    assert actual == expected


@pytest.mark.parametrize(
    "src",
    [
        # With a single argname each entry is passed as-is, so these must be
        # plain strings. A 1-tuple would reach sanitize_date as a non-str
        # and be rejected for that reason alone.
        "nonsense",
        "2006/13/01",
        # "2006-13-01" is not listed: it is read as YYYY-DD-MM and
        # normalizes to "2006-01-13". Month 13 with day 13 has no valid reading.
        "2006-13-13",
    ],
)
def test_sanitize_date_invalid(norm: Callable, src: str) -> None:
    """Input that matches no supported layout normalizes to an empty string."""
    assert norm(src) == ""


@pytest.mark.parametrize(
    "src",
    [
        "0000-00-00",
        "2005-00-12",
        "2005-12-00",
    ],
)
def test_disable_sanitization_returns_input(src: str) -> None:
    """With sanitization disabled the input comes back unchanged."""
    untouched = sanitize_date(src, disable_sanitization=True)
    assert untouched == src
Loading