Skip to content

Commit 0ebd37f

Browse files
pierrot0 authored and copybara-github committed
imdb dataset: S3 version (issue #737).
PiperOrigin-RevId: 257574363
1 parent 41fbb33 commit 0ebd37f

File tree

2 files changed

+15
-4
lines changed

2 files changed

+15
-4
lines changed

tensorflow_datasets/text/imdb.py

Lines changed: 11 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -66,6 +66,9 @@ def __init__(self, text_encoder_config=None, **kwargs):
6666
super(IMDBReviewsConfig, self).__init__(
6767
version=tfds.core.Version(
6868
"0.1.0", experiments={tfds.core.Experiment.S3: False}),
69+
supported_versions=[
70+
tfds.core.Version("1.0.0"),
71+
],
6972
**kwargs)
7073
self.text_encoder_config = (
7174
text_encoder_config or tfds.features.text.TextEncoderConfig())
@@ -118,8 +121,8 @@ def _info(self):
118121
)
119122

120123
def _vocab_text_gen(self, archive):
121-
for ex in self._generate_examples(archive,
122-
os.path.join("aclImdb", "train")):
124+
for ex in self._generate_examples(
125+
archive, os.path.join("aclImdb", "train"), keys=False):
123126
yield ex["text"]
124127

125128
def _split_generators(self, dl_manager):
@@ -149,7 +152,7 @@ def _split_generators(self, dl_manager):
149152
"labeled": False}),
150153
]
151154

152-
def _generate_examples(self, archive, directory, labeled=True):
155+
def _generate_examples(self, archive, directory, labeled=True, keys=True):
153156
"""Generate IMDB examples."""
154157
# For labeled examples, extract the label from the path.
155158
reg_path = "(?P<label>neg|pos)" if labeled else "unsup"
@@ -161,7 +164,11 @@ def _generate_examples(self, archive, directory, labeled=True):
161164
continue
162165
text = imdb_f.read().strip()
163166
label = res.groupdict()["label"] if labeled else -1
164-
yield {
167+
record = {
165168
"text": text,
166169
"label": label,
167170
}
171+
if keys and self.version.implements(tfds.core.Experiment.S3):
172+
yield path, record
173+
else:
174+
yield record

tensorflow_datasets/text/imdb_test.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -33,5 +33,9 @@ class IMDBReviewsTest(testing.DatasetBuilderTestCase):
3333
DL_EXTRACT_RESULT = "aclImdb_v1.tar.gz"
3434

3535

36+
class IMDBReviewsS3Test(IMDBReviewsTest):
37+
VERSION = "experimental_latest"
38+
39+
3640
if __name__ == "__main__":
3741
testing.test_main()

0 commit comments

Comments
 (0)