Skip to content

Commit 041b65e

Browse files
TensorFlow Datasets Team authored and copybara-github committed
Sort the ParaCrawl languages in the comprehension creating the builder configs.
This makes the order of builder configurations deterministic even across different Python versions. PiperOrigin-RevId: 255492630
1 parent 87cc01d commit 041b65e

File tree

1 file changed

+39
-28
lines changed

1 file changed

+39
-28
lines changed

tensorflow_datasets/translate/para_crawl.py

Lines changed: 39 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,10 @@
1919
from __future__ import division
2020
from __future__ import print_function
2121

22+
import collections
2223
import tensorflow as tf
2324
from tensorflow_datasets.core import api_utils
25+
from tensorflow_datasets.core import utils
2426
import tensorflow_datasets.public_api as tfds
2527

2628
_DESCRIPTION = ("Web-Scale Parallel Corpora for Official European Languages. "
@@ -40,31 +42,40 @@
4042
"paracrawl/release4/en-{target_lang}.bicleaner07."
4143
"txt.gz")
4244

43-
_TARGET_LANGUAGES = {
44-
"bg": "Bulgarian",
45-
"da": "Danish",
46-
"el": "Greek",
47-
"sk": "Slovak",
48-
"sl": "Slovenian",
49-
"sv": "Swedish",
50-
"ga": "Irish",
51-
"hr": "Croatian",
52-
"mt": "Maltese",
53-
"lt": "Lithuanian",
54-
"hu": "Hungarian",
55-
"et": "Estonian",
56-
"de": "German",
57-
"fr": "French",
58-
"es": "Spanish",
59-
"it": "Italian",
60-
"pt": "Portuguese",
61-
"nl": "Dutch",
62-
"pl": "Polish",
63-
"cs": "Czech",
64-
"ro": "Romanian",
65-
"fi": "Finnish",
66-
"lv": "Latvian"
67-
}
45+
46+
@utils.memoize()
47+
def _target_languages():
48+
"""Create the sorted dictionary of language codes, and language names.
49+
50+
Returns:
51+
The sorted dictionary as an instance of `collections.OrderedDict`.
52+
"""
53+
langs = {
54+
"bg": "Bulgarian",
55+
"cs": "Czech",
56+
"da": "Danish",
57+
"de": "German",
58+
"el": "Greek",
59+
"es": "Spanish",
60+
"et": "Estonian",
61+
"fi": "Finnish",
62+
"fr": "French",
63+
"ga": "Irish",
64+
"hr": "Croatian",
65+
"hu": "Hungarian",
66+
"it": "Italian",
67+
"lt": "Lithuanian",
68+
"lv": "Latvian",
69+
"mt": "Maltese",
70+
"nl": "Dutch",
71+
"pl": "Polish",
72+
"pt": "Portuguese",
73+
"ro": "Romanian",
74+
"sk": "Slovak",
75+
"sl": "Slovenian",
76+
"sv": "Swedish",
77+
}
78+
return collections.OrderedDict(sorted(langs.items()))
6879

6980

7081
class ParaCrawlConfig(tfds.core.BuilderConfig):
@@ -83,7 +94,7 @@ def __init__(self, text_encoder_config=None, target_language=None, **kwargs):
8394
**kwargs: Keyword arguments forwarded to super.
8495
"""
8596
# Validate the target language.
86-
if target_language not in _TARGET_LANGUAGES:
97+
if target_language not in _target_languages():
8798
raise ValueError("Invalid target language: %s " % target_language)
8899

89100
# Initialize the base class.
@@ -111,15 +122,15 @@ class ParaCrawl(tfds.core.GeneratorBasedBuilder):
111122
# The version below does not refer to the version of the released
112123
# database. It only indicates the version of the TFDS integration.
113124
ParaCrawlConfig(target_language=target_language, version="0.1.0")
114-
for target_language, _ in _TARGET_LANGUAGES.items()
125+
for target_language in _target_languages()
115126
]
116127

117128
def _info(self):
118129
target_language = self.builder_config.target_language
119130
return tfds.core.DatasetInfo(
120131
builder=self,
121132
description=_DESCRIPTION.format(
122-
target_lang=_TARGET_LANGUAGES[target_language]),
133+
target_lang=_target_languages()[target_language]),
123134
features=tfds.features.Translation(
124135
languages=("en", target_language),
125136
encoder_config=self.builder_config.text_encoder_config),

0 commit comments

Comments
 (0)