19
19
from __future__ import division
20
20
from __future__ import print_function
21
21
22
+ import collections
22
23
import tensorflow as tf
23
24
from tensorflow_datasets .core import api_utils
25
+ from tensorflow_datasets .core import utils
24
26
import tensorflow_datasets .public_api as tfds
25
27
26
28
_DESCRIPTION = ("Web-Scale Parallel Corpora for Official European Languages. "
40
42
"paracrawl/release4/en-{target_lang}.bicleaner07."
41
43
"txt.gz" )
42
44
43
- _TARGET_LANGUAGES = {
44
- "bg" : "Bulgarian" ,
45
- "da" : "Danish" ,
46
- "el" : "Greek" ,
47
- "sk" : "Slovak" ,
48
- "sl" : "Slovenian" ,
49
- "sv" : "Swedish" ,
50
- "ga" : "Irish" ,
51
- "hr" : "Croatian" ,
52
- "mt" : "Maltese" ,
53
- "lt" : "Lithuanian" ,
54
- "hu" : "Hungarian" ,
55
- "et" : "Estonian" ,
56
- "de" : "German" ,
57
- "fr" : "French" ,
58
- "es" : "Spanish" ,
59
- "it" : "Italian" ,
60
- "pt" : "Portuguese" ,
61
- "nl" : "Dutch" ,
62
- "pl" : "Polish" ,
63
- "cs" : "Czech" ,
64
- "ro" : "Romanian" ,
65
- "fi" : "Finnish" ,
66
- "lv" : "Latvian"
67
- }
45
+
46
@utils.memoize()
def _target_languages():
    """Build the table of supported target languages, ordered by code.

    Returns:
      A `collections.OrderedDict` whose keys are the two-letter language
      codes in ascending order and whose values are the corresponding
      English language names.
    """
    # (language code, language name) pairs; ordering is enforced below, so
    # the order in which entries are listed here does not matter.
    code_name_pairs = (
        ("bg", "Bulgarian"),
        ("cs", "Czech"),
        ("da", "Danish"),
        ("de", "German"),
        ("el", "Greek"),
        ("es", "Spanish"),
        ("et", "Estonian"),
        ("fi", "Finnish"),
        ("fr", "French"),
        ("ga", "Irish"),
        ("hr", "Croatian"),
        ("hu", "Hungarian"),
        ("it", "Italian"),
        ("lt", "Lithuanian"),
        ("lv", "Latvian"),
        ("mt", "Maltese"),
        ("nl", "Dutch"),
        ("pl", "Polish"),
        ("pt", "Portuguese"),
        ("ro", "Romanian"),
        ("sk", "Slovak"),
        ("sl", "Slovenian"),
        ("sv", "Swedish"),
    )
    ordered_pairs = sorted(code_name_pairs)
    return collections.OrderedDict(ordered_pairs)
68
79
69
80
70
81
class ParaCrawlConfig (tfds .core .BuilderConfig ):
@@ -83,7 +94,7 @@ def __init__(self, text_encoder_config=None, target_language=None, **kwargs):
83
94
**kwargs: Keyword arguments forwarded to super.
84
95
"""
85
96
# Validate the target language.
86
- if target_language not in _TARGET_LANGUAGES :
97
+ if target_language not in _target_languages () :
87
98
raise ValueError ("Invalid target language: %s " % target_language )
88
99
89
100
# Initialize the base class.
@@ -111,15 +122,15 @@ class ParaCrawl(tfds.core.GeneratorBasedBuilder):
111
122
# The version below does not refer to the version of the released
112
123
# database. It only indicates the version of the TFDS integration.
113
124
ParaCrawlConfig (target_language = target_language , version = "0.1.0" )
114
- for target_language , _ in _TARGET_LANGUAGES . items ()
125
+ for target_language in _target_languages ()
115
126
]
116
127
117
128
def _info (self ):
118
129
target_language = self .builder_config .target_language
119
130
return tfds .core .DatasetInfo (
120
131
builder = self ,
121
132
description = _DESCRIPTION .format (
122
- target_lang = _TARGET_LANGUAGES [target_language ]),
133
+ target_lang = _target_languages () [target_language ]),
123
134
features = tfds .features .Translation (
124
135
languages = ("en" , target_language ),
125
136
encoder_config = self .builder_config .text_encoder_config ),
0 commit comments