Skip to content

Commit f567c68

Browse files
adarob authored and copybara-github committed
Filter empty wikipedia pages after cleaning.
PiperOrigin-RevId: 280464382
1 parent 828684e commit f567c68

File tree

1 file changed

+5
-1
lines changed

1 file changed

+5
-1
lines changed

tensorflow_datasets/text/wikipedia.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -116,7 +116,7 @@ class Wikipedia(tfds.core.BeamBasedBuilder):
116116
BUILDER_CONFIGS = [
117117
WikipediaConfig( # pylint:disable=g-complex-comprehension
118118
version=tfds.core.Version(
119-
"0.0.3", experiments={tfds.core.Experiment.S3: False}),
119+
"0.0.4", experiments={tfds.core.Experiment.S3: False}),
120120
language=lang,
121121
date="20190301",
122122
) for lang in WIKIPEDIA_LANGUAGES
@@ -224,6 +224,10 @@ def _clean_content(inputs):
224224
logging.error("mwparserfromhell ParseError: %s", e)
225225
return
226226

227+
if not text:
228+
beam.metrics.Metrics.counter(language, "empty-clean-examples").inc()
229+
return
230+
227231
beam.metrics.Metrics.counter(language, "cleaned-examples").inc()
228232

229233
yield {

0 commit comments

Comments
 (0)