diff --git a/wagtail_localize/models.py b/wagtail_localize/models.py index f983a3d2..0be9e551 100644 --- a/wagtail_localize/models.py +++ b/wagtail_localize/models.py @@ -1,4 +1,5 @@ import json +import re import uuid import polib @@ -69,6 +70,9 @@ from .tasks import background +WHITESPACE_RE = re.compile(r"[ \t\n\f\r]+") + + def pk(obj): """ A helper that gets the primary key of a model instance if one is passed in. @@ -1456,10 +1460,13 @@ def from_value(cls, locale, stringvalue): Returns: String: The String instance that corresponds with the given stringvalue and locale. """ + + data = re.sub(WHITESPACE_RE, " ", stringvalue.data) + string, created = cls.objects.get_or_create( locale_id=pk(locale), - data_hash=cls._get_data_hash(stringvalue.data), - defaults={"data": stringvalue.data}, + data_hash=cls._get_data_hash(data), + defaults={"data": data}, ) return string @@ -1707,6 +1714,11 @@ def from_text(cls, translation_of, locale, context, data): Returns: String: The String instance that corresponds with the given stringvalue and locale. """ + + # normalise whitespace sequences to a single space unless whitespace is contained in
<pre> tag,
+ # in which case, leave it alone
+ # This is in line with https://www.w3.org/TR/html4/struct/text.html#h-9.1
+
segment, created = cls.objects.get_or_create(
translation_of=translation_of,
locale_id=pk(locale),
diff --git a/wagtail_localize/tests/test_translationsource_model.py b/wagtail_localize/tests/test_translationsource_model.py
index 1baee3a9..f49248cc 100644
--- a/wagtail_localize/tests/test_translationsource_model.py
+++ b/wagtail_localize/tests/test_translationsource_model.py
@@ -614,6 +614,31 @@ def test_convert_alias(self):
)
+class TestStringNormalization(TestCase):
+ def test_whitespace_normalization(self):
+ # Each tab/newline is expected to come back as one space. NOTE(review): normalize_whitespace() is not added in the visible diff - confirm StringValue defines it.
+ string_value = StringValue(
+ "This is a test with multiple spaces\tand\ttabs\nand\nnew\nlines"
+ )
+ normalized_string = string_value.normalize_whitespace()
+ self.assertEqual(
+ normalized_string,
+ "This is a test with multiple spaces and tabs and new lines",
+ )
+
+ def test_unicode_normalization(self):
+ # "Café" is expected to lose its accent ("Cafe"): accent stripping, which NFC/NFKC alone would not do. NOTE(review): confirm normalize_unicode() exists and is meant to fold to ASCII.
+ string_value = StringValue("Café")
+ normalized_string = string_value.normalize_unicode()
+ self.assertEqual(normalized_string, "Cafe")
+
+ def test_combined_normalization(self):
+ # normalize() is expected to apply both steps: accent removed, tabs/newlines replaced by spaces. NOTE(review): confirm normalize() exists on StringValue.
+ string_value = StringValue("Café\twith\nnew\nlines\nand multiple spaces")
+ normalized_string = string_value.normalize()
+ self.assertEqual(normalized_string, "Cafe with new lines and multiple spaces")
+
+
class TestCreateOrUpdateTranslationForSnippet(TestCase):
def setUp(self):
self.snippet = TestSnippet.objects.create(