Commit 43ef317

pierrot0 authored and copybara-github committed

TFDS: fix wikipedia: use id instead of title to key records, make pipeline fast.

PiperOrigin-RevId: 289153769
1 parent 9bc6103 · commit 43ef317

File tree

6 files changed: +22 additions, -213 deletions

tensorflow_datasets/testing/test_data/fake_examples/wikipedia/enwiki_fake.xml

0 additions, 103 deletions. This file was deleted.

tensorflow_datasets/testing/test_data/fake_examples/wikipedia/enwiki_fake.xml.bz2

Binary file not shown.

tensorflow_datasets/testing/test_data/fake_examples/wikipedia/enwiki_fake2.xml

0 additions, 103 deletions. This file was deleted.

tensorflow_datasets/testing/test_data/fake_examples/wikipedia/enwiki_fake2.xml.bz2

Binary file not shown.

tensorflow_datasets/text/wikipedia.py

21 additions, 6 deletions
@@ -29,6 +29,12 @@
 import tensorflow.compat.v2 as tf
 import tensorflow_datasets.public_api as tfds
 
+if six.PY3:
+  import bz2  # pylint:disable=g-import-not-at-top
+else:
+  # py2's built-in bz2 package does not support reading from file objects.
+  import bz2file as bz2  # pylint:disable=g-import-not-at-top
+
 
 _CITATION = """\
 @ONLINE {wikidump,
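The conditional import exists because the loader below wraps an already-open file object in a bz2 decompressor: Python 3's built-in bz2 supports that, while Python 2's does not, hence the bz2file backport. Together with the switch from download_and_extract() to download() in the next hunk, the dumps stay compressed on disk and are decompressed as a stream. A minimal Python 3 sketch of that read path, with a hypothetical local file standing in for a downloaded dump:

import bz2
import codecs

# Hypothetical path for illustration; the real code gets it from
# dl_manager.download(), which leaves the dump compressed on disk.
path = "enwiki_fake.xml.bz2"

with open(path, "rb") as f:             # tf.io.gfile.GFile in the real code
  f = bz2.BZ2File(filename=f)           # decompress lazily, as a stream
  utf_f = codecs.getreader("utf-8")(f)  # decode bytes to text incrementally
  print(utf_f.read(80))                 # first decoded characters of the XML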
@@ -179,7 +185,7 @@ def _base_url(lang):
       xml_urls.append(_base_url(lang) + fname)
 
     # Use dictionary since testing mock always returns the same result.
-    downloaded_files = dl_manager.download_and_extract({"xml": xml_urls})
+    downloaded_files = dl_manager.download({"xml": xml_urls})
 
     return [
         tfds.core.SplitGenerator(  # pylint:disable=g-complex-comprehension
@@ -196,38 +202,46 @@ def _extract_content(filepath):
       """Extracts article content from a single WikiMedia XML file."""
       logging.info("generating examples from = %s", filepath)
       with tf.io.gfile.GFile(filepath, "rb") as f:
+        f = bz2.BZ2File(filename=f)
         if six.PY3:
           # Workaround due to:
           # https://github.com/tensorflow/tensorflow/issues/33563
           utf_f = codecs.getreader("utf-8")(f)
         else:
           utf_f = f
-        for _, elem in etree.iterparse(utf_f, events=("end",)):
+
+        # To clear root, to free-up more memory than just `elem.clear()`.
+        context = etree.iterparse(utf_f, events=("end",))
+        context = iter(context)
+        unused_event, root = next(context)
+        for unused_event, elem in context:
           if not elem.tag.endswith("page"):
             continue
           namespace = elem.tag[:-4]
           title = elem.find("./{0}title".format(namespace)).text
           ns = elem.find("./{0}ns".format(namespace)).text
+          id_ = elem.find("./{0}id".format(namespace)).text
 
           # Filter pages that are not in the "main" namespace.
           if ns != "0":
+            root.clear()
             continue
 
           raw_content = elem.find(
               "./{0}revision/{0}text".format(namespace)).text
-          elem.clear()
+          root.clear()
 
           # Filter redirects.
           if raw_content is None or raw_content.lower().startswith("#redirect"):
             beam.metrics.Metrics.counter(language, "filtered-redirects").inc()
             continue
 
           beam.metrics.Metrics.counter(language, "extracted-examples").inc()
-          yield (title, raw_content)
+          yield (id_, title, raw_content)
 
     def _clean_content(inputs):
       """Cleans raw wikicode to extract text."""
-      title, raw_content = inputs
+      id_, title, raw_content = inputs
       try:
         text = _parse_and_clean_wikicode(raw_content)
       except (
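The rewritten loop is the standard ElementTree recipe for constant-memory parsing of very large XML: keep a handle to the root from the first iterparse event, then clear it after each processed element so every already-parsed subtree is released, not just the current one. A self-contained sketch of the pattern on a toy document (tag names and data are illustrative only; this sketch uses start events to obtain the root):

import io
import xml.etree.ElementTree as etree

# Toy two-page dump, purely for illustration.
FAKE_XML = b"""<mediawiki>
  <page><title>A</title><ns>0</ns><id>1</id></page>
  <page><title>B</title><ns>0</ns><id>2</id></page>
</mediawiki>"""

def iter_pages(fileobj):
  # "start" fires before children are parsed, so the first event
  # hands us the root element itself.
  context = etree.iterparse(fileobj, events=("start", "end"))
  unused_event, root = next(context)
  for event, elem in context:
    if event != "end" or elem.tag != "page":
      continue
    yield elem.findtext("id"), elem.findtext("title")
    # Clearing the root (not just elem) drops every already-parsed
    # subtree, keeping memory flat however large the dump is.
    root.clear()

for id_, title in iter_pages(io.BytesIO(FAKE_XML)):
  print(id_, title)  # -> "1 A", then "2 B"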
@@ -242,7 +256,7 @@ def _clean_content(inputs):
 
       beam.metrics.Metrics.counter(language, "cleaned-examples").inc()
 
-      yield title, {
+      yield id_, {
           "title": title,
           "text": text
       }
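Switching the record key from title to the dump's page id is the other half of the commit message: page ids are unique by construction, whereas duplicate or re-emitted pages would produce colliding title keys downstream. A toy illustration of that failure mode, on hypothetical data:

# Hypothetical (id, title, text) triples: two records sharing a title,
# e.g. the same page emitted twice.
records = [
    ("101", "Foo", "text a"),
    ("205", "Foo", "text b"),
]

def count_collisions(records, key_index):
  seen, collisions = set(), 0
  for rec in records:
    key = rec[key_index]
    collisions += key in seen
    seen.add(key)
  return collisions

print(count_collisions(records, key_index=1))  # title keys: 1 collision
print(count_collisions(records, key_index=0))  # id keys:    0 collisions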
@@ -251,6 +265,7 @@ def _clean_content(inputs):
         pipeline
         | beam.Create(filepaths)
         | beam.FlatMap(_extract_content)
+        | beam.transforms.Reshuffle()
         | beam.FlatMap(_clean_content)
     )
 
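The inserted Reshuffle is the "make pipeline fast" part of the commit message: without a shuffle boundary, Beam runners typically fuse the two FlatMap steps into one stage, so the expensive wikicode cleaning can use at most as many workers as there are dump files. Reshuffling redistributes the extracted pages first. A runnable toy pipeline with stand-in functions (names are hypothetical, not the TFDS builder):

import apache_beam as beam

def _fake_extract(path):
  # Stand-in for _extract_content: one input file fans out to many pages.
  for i in range(3):
    yield ("%s-%d" % (path, i), "title", "raw wikicode")

def _fake_clean(inputs):
  # Stand-in for _clean_content: the per-page heavy work.
  id_, title, raw = inputs
  yield id_, {"title": title, "text": raw.upper()}

with beam.Pipeline() as pipeline:
  _ = (
      pipeline
      | beam.Create(["dump1.xml.bz2", "dump2.xml.bz2"])
      | beam.FlatMap(_fake_extract)
      | beam.transforms.Reshuffle()  # break fusion: rebalance pages across workers
      | beam.FlatMap(_fake_clean)
      | beam.Map(print))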

tensorflow_datasets/text/wikipedia_test.py

1 addition, 1 deletion

@@ -31,7 +31,7 @@ class WikipediaTest(testing.DatasetBuilderTestCase):
 
   DL_EXTRACT_RESULT = {
       "info": "dumpstatus.json",
-      "xml": ["enwiki_fake.xml", "enwiki_fake2.xml"]
+      "xml": ["enwiki_fake.xml.bz2", "enwiki_fake2.xml.bz2"]
   }
 
   SPLITS = {
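Since the builder now calls download() rather than download_and_extract(), the mocked download results must point at the compressed fixtures, hence the .bz2 names. One plausible way such fixtures could be produced from the old XML files (an assumption, not part of this commit):

import bz2

# Assumes the old fake XML files are in the current directory.
for name in ["enwiki_fake.xml", "enwiki_fake2.xml"]:
  with open(name, "rb") as src, bz2.open(name + ".bz2", "wb") as dst:
    dst.write(src.read())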
