Commit 43ef317

pierrot0 authored and copybara-github committed

TFDS: fix wikipedia: use id instead of title to key records, make pipeline fast.

PiperOrigin-RevId: 289153769
1 parent 9bc6103 · commit 43ef317

File tree

6 files changed: +22 additions, -213 deletions

tensorflow_datasets/testing/test_data/fake_examples/wikipedia/enwiki_fake.xml

0 additions, 103 deletions. This file was deleted.

tensorflow_datasets/testing/test_data/fake_examples/wikipedia/enwiki_fake.xml.bz2

Binary file not shown.

tensorflow_datasets/testing/test_data/fake_examples/wikipedia/enwiki_fake2.xml

0 additions, 103 deletions. This file was deleted.

tensorflow_datasets/testing/test_data/fake_examples/wikipedia/enwiki_fake2.xml.bz2

Binary file not shown.

tensorflow_datasets/text/wikipedia.py

21 additions, 6 deletions
@@ -29,6 +29,12 @@
 import tensorflow.compat.v2 as tf
 import tensorflow_datasets.public_api as tfds
 
+if six.PY3:
+  import bz2  # pylint:disable=g-import-not-at-top
+else:
+  # py2's built-in bz2 package does not support reading from file objects.
+  import bz2file as bz2  # pylint:disable=g-import-not-at-top
+
 
 _CITATION = """\
 @ONLINE {wikidump,
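The conditional import exists because the loader below wraps an already-open file object in a bz2 decompressor: Python 3's built-in bz2 supports that, while Python 2's does not, hence the bz2file backport. Together with the switch from download_and_extract() to download() in the next hunk, the dumps stay compressed on disk and are decompressed as a stream. A minimal Python 3 sketch of that read path, with a hypothetical local file standing in for a downloaded dump:

import bz2
import codecs

# Hypothetical path for illustration; the real code gets it from
# dl_manager.download(), which leaves the dump compressed on disk.
path = "enwiki_fake.xml.bz2"

with open(path, "rb") as f:             # tf.io.gfile.GFile in the real code
  f = bz2.BZ2File(filename=f)           # decompress lazily, as a stream
  utf_f = codecs.getreader("utf-8")(f)  # decode bytes to text incrementally
  print(utf_f.read(80))                 # first decoded characters of the XML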
@@ -179,7 +185,7 @@ def _base_url(lang):
       xml_urls.append(_base_url(lang) + fname)
 
     # Use dictionary since testing mock always returns the same result.
-    downloaded_files = dl_manager.download_and_extract({"xml": xml_urls})
+    downloaded_files = dl_manager.download({"xml": xml_urls})
 
     return [
         tfds.core.SplitGenerator(  # pylint:disable=g-complex-comprehension
@@ -196,38 +202,46 @@ def _extract_content(filepath):
       """Extracts article content from a single WikiMedia XML file."""
       logging.info("generating examples from = %s", filepath)
       with tf.io.gfile.GFile(filepath, "rb") as f:
+        f = bz2.BZ2File(filename=f)
         if six.PY3:
           # Workaround due to:
           # https://github.com/tensorflow/tensorflow/issues/33563
           utf_f = codecs.getreader("utf-8")(f)
         else:
           utf_f = f
-        for _, elem in etree.iterparse(utf_f, events=("end",)):
+
+        # To clear root, to free-up more memory than just `elem.clear()`.
+        context = etree.iterparse(utf_f, events=("end",))
+        context = iter(context)
+        unused_event, root = next(context)
+        for unused_event, elem in context:
           if not elem.tag.endswith("page"):
             continue
           namespace = elem.tag[:-4]
           title = elem.find("./{0}title".format(namespace)).text
           ns = elem.find("./{0}ns".format(namespace)).text
+          id_ = elem.find("./{0}id".format(namespace)).text
 
           # Filter pages that are not in the "main" namespace.
           if ns != "0":
+            root.clear()
             continue
 
           raw_content = elem.find(
               "./{0}revision/{0}text".format(namespace)).text
-          elem.clear()
+          root.clear()
 
           # Filter redirects.
           if raw_content is None or raw_content.lower().startswith("#redirect"):
             beam.metrics.Metrics.counter(language, "filtered-redirects").inc()
             continue
 
           beam.metrics.Metrics.counter(language, "extracted-examples").inc()
-          yield (title, raw_content)
+          yield (id_, title, raw_content)
 
     def _clean_content(inputs):
       """Cleans raw wikicode to extract text."""
-      title, raw_content = inputs
+      id_, title, raw_content = inputs
       try:
         text = _parse_and_clean_wikicode(raw_content)
       except (
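The rewritten loop is the standard ElementTree recipe for constant-memory parsing of very large XML: keep a handle to the root from the first iterparse event, then clear it after each processed element so every already-parsed subtree is released, not just the current one. A self-contained sketch of the pattern on a toy document (tag names and data are illustrative only; this sketch uses start events to obtain the root):

import io
import xml.etree.ElementTree as etree

# Toy two-page dump, purely for illustration.
FAKE_XML = b"""<mediawiki>
  <page><title>A</title><ns>0</ns><id>1</id></page>
  <page><title>B</title><ns>0</ns><id>2</id></page>
</mediawiki>"""

def iter_pages(fileobj):
  # "start" fires before children are parsed, so the first event
  # hands us the root element itself.
  context = etree.iterparse(fileobj, events=("start", "end"))
  unused_event, root = next(context)
  for event, elem in context:
    if event != "end" or elem.tag != "page":
      continue
    yield elem.findtext("id"), elem.findtext("title")
    # Clearing the root (not just elem) drops every already-parsed
    # subtree, keeping memory flat however large the dump is.
    root.clear()

for id_, title in iter_pages(io.BytesIO(FAKE_XML)):
  print(id_, title)  # -> "1 A", then "2 B"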
@@ -242,7 +256,7 @@ def _clean_content(inputs):
 
       beam.metrics.Metrics.counter(language, "cleaned-examples").inc()
 
-      yield title, {
+      yield id_, {
           "title": title,
           "text": text
       }
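Switching the record key from title to the dump's page id is the other half of the commit message: page ids are unique by construction, whereas duplicate or re-emitted pages would produce colliding title keys downstream. A toy illustration of that failure mode, on hypothetical data:

# Hypothetical (id, title, text) triples: two records sharing a title,
# e.g. the same page emitted twice.
records = [
    ("101", "Foo", "text a"),
    ("205", "Foo", "text b"),
]

def count_collisions(records, key_index):
  seen, collisions = set(), 0
  for rec in records:
    key = rec[key_index]
    collisions += key in seen
    seen.add(key)
  return collisions

print(count_collisions(records, key_index=1))  # title keys: 1 collision
print(count_collisions(records, key_index=0))  # id keys:    0 collisions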
@@ -251,6 +265,7 @@ def _clean_content(inputs):
         pipeline
         | beam.Create(filepaths)
         | beam.FlatMap(_extract_content)
+        | beam.transforms.Reshuffle()
         | beam.FlatMap(_clean_content)
     )
 
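The inserted Reshuffle is the "make pipeline fast" part of the commit message: without a shuffle boundary, Beam runners typically fuse the two FlatMap steps into one stage, so the expensive wikicode cleaning can use at most as many workers as there are dump files. Reshuffling redistributes the extracted pages first. A runnable toy pipeline with stand-in functions (names are hypothetical, not the TFDS builder):

import apache_beam as beam

def _fake_extract(path):
  # Stand-in for _extract_content: one input file fans out to many pages.
  for i in range(3):
    yield ("%s-%d" % (path, i), "title", "raw wikicode")

def _fake_clean(inputs):
  # Stand-in for _clean_content: the per-page heavy work.
  id_, title, raw = inputs
  yield id_, {"title": title, "text": raw.upper()}

with beam.Pipeline() as pipeline:
  _ = (
      pipeline
      | beam.Create(["dump1.xml.bz2", "dump2.xml.bz2"])
      | beam.FlatMap(_fake_extract)
      | beam.transforms.Reshuffle()  # break fusion: rebalance pages across workers
      | beam.FlatMap(_fake_clean)
      | beam.Map(print))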

tensorflow_datasets/text/wikipedia_test.py

1 addition, 1 deletion

@@ -31,7 +31,7 @@ class WikipediaTest(testing.DatasetBuilderTestCase):
 
   DL_EXTRACT_RESULT = {
       "info": "dumpstatus.json",
-      "xml": ["enwiki_fake.xml", "enwiki_fake2.xml"]
+      "xml": ["enwiki_fake.xml.bz2", "enwiki_fake2.xml.bz2"]
   }
 
   SPLITS = {
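Since the builder now calls download() rather than download_and_extract(), the mocked download results must point at the compressed fixtures, hence the .bz2 names. One plausible way such fixtures could be produced from the old XML files (an assumption, not part of this commit):

import bz2

# Assumes the old fake XML files are in the current directory.
for name in ["enwiki_fake.xml", "enwiki_fake2.xml"]:
  with open(name, "rb") as src, bz2.open(name + ".bz2", "wb") as dst:
    dst.write(src.read())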
