
Commit dc484fd

adarob authored and copybara-github committed
Add C4 validation set.

Changes to avoid hanging on DataFlow:
1. Filter candidate pages from C4 with any word > 1k characters.
2. Modify deduplication for efficiency.

PiperOrigin-RevId: 290978909
1 parent 81fb84c commit dc484fd
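
The first change is easy to picture outside the Beam pipeline: a line is dropped whenever any of its whitespace-separated words exceeds the length cap, which keeps pathological tokens away from the downstream processing that the commit message says was hanging on DataFlow. A minimal standalone sketch of that idea, assuming plain Python strings (the helper name and the 1000-character cap mirror `_MAX_WORD_LENGTH` in the c4_utils.py diff below; the sample lines are invented):

```python
_MAX_WORD_LENGTH = 1000  # Mirrors the constant added in c4_utils.py.


def line_has_too_long_word(line, max_word_length=_MAX_WORD_LENGTH):
  """Returns True if any whitespace-separated token is longer than the cap.

  Extremely long "words" (e.g. giant base64 blobs) are the kind of input the
  commit message flags as causing hangs on DataFlow.
  """
  return any(len(word) > max_word_length for word in line.split())


# Invented examples, just to show the behavior.
print(line_has_too_long_word("A normal sentence."))       # False
print(line_has_too_long_word("junk" + "x" * 2000 + "!"))  # True
```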

File tree

8 files changed: +85 -43 lines changed


tensorflow_datasets/testing/test_data/fake_examples/c4/cc_0.warc.wet

Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@ publisher: Common Crawl
 
 WARC/1.0
 WARC-Type: conversion
-WARC-Target-URI: http://fake/url/1
+WARC-Target-URI: http://fake.com/url/1
 WARC-Date: 2019-04-18T10:38:25Z
 WARC-Record-ID: <urn:uuid:23d84b81-bffc-4ed9-90ae-55a429dbf541>
 WARC-Refers-To: <urn:uuid:716838d3-9c47-4879-9767-a891453e47c0>
@@ -31,7 +31,7 @@ And here is some more text. One more sentence to make this long enough.
 
 WARC/1.0
 WARC-Type: conversion
-WARC-Target-URI: http://fake/url/2
+WARC-Target-URI: http://fake.com/url/2
 WARC-Date: 2019-04-18T11:05:28Z
 WARC-Record-ID: <urn:uuid:1c03ff34-fe12-4dd7-9857-149952095cbb>
 WARC-Refers-To: <urn:uuid:7da148a2-62dd-46ba-9d15-16d52764dc64>
Binary file not shown.

tensorflow_datasets/testing/test_data/fake_examples/c4/cc_1.warc.wet

Lines changed: 4 additions & 4 deletions
@@ -17,7 +17,7 @@ publisher: Common Crawl
 
 WARC/1.0
 WARC-Type: conversion
-WARC-Target-URI: http://fake/url/3
+WARC-Target-URI: http://fake.com/url/370
 WARC-Date: 2019-04-18T10:17:44Z
 WARC-Record-ID: <urn:uuid:82b6e950-7336-4422-a6e8-e982fb766421>
 WARC-Refers-To: <urn:uuid:741ebaf8-8ad6-4b15-80a9-5c70c2edab4d>
@@ -27,12 +27,12 @@ Content-Length: 6873
 
 Wow! This is so exciting!
 Here, we have a third. Yes, a third!
-THREE FAKE WEBPAGES, HA HA HA!
+THREE FAKE WEBPAGES, HA HA HA! Also, its URL makes it end up in validation.
 
 
 WARC/1.0
 WARC-Type: conversion
-WARC-Target-URI: http://fake/url/4
+WARC-Target-URI: http://fake.com/url/4
 WARC-Date: 2019-04-18T10:17:44Z
 WARC-Record-ID: <urn:uuid:82b6e950-7336-4422-a6e8-e982fb766421>
 WARC-Refers-To: <urn:uuid:741ebaf8-8ad6-4b15-80a9-5c70c2edab4d>
@@ -47,7 +47,7 @@ What is the badword? It is "butt".
 
 WARC/1.0
 WARC-Type: conversion
-WARC-Target-URI: http://fake/url/1
+WARC-Target-URI: http://fake.com/url/1
 WARC-Date: 2019-04-18T10:38:25Z
 WARC-Record-ID: <urn:uuid:23d84b81-bffc-4ed9-90ae-55a429dbf541>
 WARC-Refers-To: <urn:uuid:716838d3-9c47-4879-9767-a891453e47c0>
Binary file not shown.

tensorflow_datasets/text/c4.py

Lines changed: 30 additions & 13 deletions
@@ -47,8 +47,7 @@
   eprint = {1910.10683},
 }
 """
-_VERSION = tfds.core.Version(
-    "2.0.0", "New split API (https://tensorflow.org/datasets/splits)")
+_VERSION = tfds.core.Version("2.2.0")
 
 _SUPPORTED_VERSIONS = [
     tfds.core.Version(
@@ -174,7 +173,7 @@ def _info(self):
         "https://github.com/google-research/text-to-text-transfer-transformer#datasets",
     )
 
-  def _split_generators(self, dl_manager):
+  def _split_generators(self, dl_manager, pipeline):
     dl_manager.download_checksums(_CHECKSUMS_URL)
 
     # We will automatically down the default CC version(s), but others need to
@@ -223,15 +222,28 @@ def _split_generators(self, dl_manager):
                    len(wet_files), cc_version)
       file_paths["wet_files"].extend(wet_files)
 
+    page_content_pcollection = self._get_page_content(pipeline, file_paths)
     return [
         tfds.core.SplitGenerator(
             name=tfds.Split.TRAIN,
-            gen_kwargs={"file_paths": file_paths},
-        )
+            gen_kwargs=dict(
+                split="train",
+                page_content=page_content_pcollection,
+                hashed_url_predicate=lambda x: x % 1000 != 0  # 99.9%
+            ),
+        ),
+        tfds.core.SplitGenerator(
+            name=tfds.Split.VALIDATION,
+            gen_kwargs=dict(
+                split="validation",
+                page_content=page_content_pcollection,
+                hashed_url_predicate=lambda x: x % 1000 == 0  # 0.1%
+            ),
+        ),
     ]
 
-  def _build_pcollection(self, pipeline, file_paths):
-    """Build PCollection of examples in the raw (text) form."""
+  def _get_page_content(self, pipeline, file_paths):
+    """Build PCollection of un-split page content."""
     beam = tfds.core.lazy_imports.apache_beam
 
     # Parse WET files and filter by length.
@@ -294,11 +306,14 @@ def _build_pcollection(self, pipeline, file_paths):
     page_content |= beam.Filter(
         c4_utils.is_language, language=self.builder_config.lang)
 
-    # Emit final examples.
-    # Output: {"url": url, "text": text, "content-type": content-type,\
-    #          "content-length": content-length, "timestamp": timestamp}
+    return page_content
+
+  def _build_pcollection(
+      self, unused_pipeline, split, page_content, hashed_url_predicate):
+    beam = tfds.core.lazy_imports.apache_beam
+
     def _emit_examples(el):
-      c4_utils.get_counter_inc_fn("emit-examples")("emitted")
+      c4_utils.get_counter_inc_fn(split)("examples")
       _, features = el
       return features["url"], {
           "url": features["url"],
@@ -307,5 +322,7 @@ def _emit_examples(el):
           "content-length": features["content-length"],
           "timestamp": features["timestamp"]
       }
-
-    return page_content | beam.Map(_emit_examples)
+    return (page_content
+            | beam.Filter(
+                c4_utils.get_hashed_url_filter_fn(hashed_url_predicate))
+            | beam.Map(_emit_examples))
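
The split routing above needs no randomness: each page's URL is hashed, and the hash modulo 1000 decides the split, so roughly 0.1% of pages land in validation and the assignment is stable across reruns. A standalone sketch of that predicate logic, assuming plain Python strings rather than Beam PCollection elements (the function name below is made up; the real pipeline composes `get_hashed_url_filter_fn` with the lambdas passed from `_split_generators`, and whether a given URL lands in validation depends on its actual MD5 digest):

```python
import hashlib


def hashed_url_split(url):
  """Sketch of the routing: MD5 the URL, bucket 0 of 1000 -> validation."""
  val = int(hashlib.md5(url.encode("utf-8")).hexdigest(), 16)
  return "validation" if val % 1000 == 0 else "train"


# URLs taken from the fake WET files above; the printed split is simply
# whatever bucket each URL's digest falls into.
for url in ["http://fake.com/url/1", "http://fake.com/url/4",
            "http://fake.com/url/370"]:
  print(url, "->", hashed_url_split(url))
```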

tensorflow_datasets/text/c4_test.py

Lines changed: 4 additions & 2 deletions
@@ -39,15 +39,17 @@ class C4Test(testing.DatasetBuilderTestCase):
       "badwords": "badwords.txt",
   }
   SPLITS = {
-      "train": 2,
+      "train": 1,
+      "validation": 1,
   }
 
 
 class C4NoCleanTest(C4Test):
   # GzipFile + GFile and TextIOWrapper are broken for py2.
   BUILDER_CONFIG_NAMES_TO_TEST = ["en.noclean"] if six.PY3 else []
   SPLITS = {
-      "train": 4,
+      "train": 3,
+      "validation": 1,
   }
 
tensorflow_datasets/text/c4_utils.py

Lines changed: 39 additions & 16 deletions
@@ -42,6 +42,7 @@
 # Filters
 _MIN_WORDS_PER_LINE = 5
 _MIN_NUM_SENTENCES = 3
+_MAX_WORD_LENGTH = 1000
 _END_MARKS = (".", "?", "!", "\"")
 _ELLIPSIS = "..."
 _POLICY_SUBSTRINGS = [
@@ -59,6 +60,15 @@ def counter_inc_fn(counter, amt=1):
   return counter_inc_fn
 
 
+def get_hashed_url_filter_fn(predicate_fn):
+  def filter_fn(el):
+    url, _ = el
+    val = int(
+        hashlib.md5(tf.compat.as_text(url).encode("utf-8")).hexdigest(), 16)
+    return predicate_fn(val)
+  return filter_fn
+
+
 def _load_sentence_tokenizer():
   """Returns a sentence tokenization function."""
   nltk = tfds.core.lazy_imports.nltk
@@ -134,7 +144,8 @@ def clean_page(url_and_features,
                badwords_regex=None,
                counter_inc_fn=None,
                min_words_per_line=_MIN_WORDS_PER_LINE,
-               min_num_sentences=_MIN_NUM_SENTENCES):
+               min_num_sentences=_MIN_NUM_SENTENCES,
+               max_word_length=_MAX_WORD_LENGTH):
   """Cleans a CommonCrawl page, yielding nothing if it should be skipped.
 
   Cleaning removes lines with no end marks or with too few words. After line
@@ -152,6 +163,8 @@ def clean_page(url_and_features,
       removed.
     min_num_sentences: int, the minimum number of sentences a page needs to not
       be skipped.
+    max_word_length: int, the maximum number of characters allowed in a word.
+      Lines containing a word with too many characters are removed.
   Yields:
     The url and cleaned text for the page.
   """
@@ -165,8 +178,17 @@ def clean_page(url_and_features,
   valid_lines = []
   num_sentences = 0
 
+  def line_has_too_long_word(line):
+    for word in line.split():
+      if len(word) > max_word_length:
+        return True
+    return False
+
   for line in lines:
     line = line.strip()
+    if line_has_too_long_word(line):
+      counter_inc_fn("lines-with-too-long-word")
+      continue
     line = citation_regex.sub("", line)
     if not line.endswith(_END_MARKS) or line.endswith(_ELLIPSIS):
       counter_inc_fn("lines-no-endmark")
@@ -177,15 +199,15 @@ def clean_page(url_and_features,
     line_lower = line.lower()
     # Remove documents which contain lorem ipsum
     if "lorem ipsum" in line_lower:
-      counter_inc_fn("filtered-url-loremipsum")
+      counter_inc_fn("filtered-page-loremipsum")
       return
     # Remove "javascript must be enabled" notices
     if "javascript" in line_lower:
       counter_inc_fn("lines-javascript")
       continue
     # Remove docs which probably contain javascript code
     if "{" in line:
-      counter_inc_fn("filtered-url-squigglybracket")
+      counter_inc_fn("filtered-page-squigglybracket")
       return
     # Remove policy lines
     if any(p in line_lower for p in _POLICY_SUBSTRINGS):
@@ -195,14 +217,14 @@ def clean_page(url_and_features,
     if badwords_regex:
       badwords_found = badwords_regex.search(line_lower)
       if badwords_found is not None:
-        counter_inc_fn("filtered-url-badword")
+        counter_inc_fn("filtered-page-badword")
         return
     num_sentences += len(_get_sentences(line))
     valid_lines.append(line)
     counter_inc_fn("lines-valid")
 
   if num_sentences < min_num_sentences:
-    counter_inc_fn("filtered-url-toofewsentences")
+    counter_inc_fn("filtered-page-toofewsentences")
     return
   counter_inc_fn("emitted-clean-pages")
   features["text"] = "\n".join(valid_lines).strip()
@@ -223,20 +245,21 @@ def _emit_url_to_lines(page):
     yield _hash_line(line), url
 
 
-def _emit_line_to_urls(el, counter_inc_fn, skip_n=1):
-  """Emits (hashed) line to all but `skip_n` urls."""
+def _emit_line_to_urls(el, counter_inc_fn):
+  """Emits (hashed) line to all but one url."""
   line, urls = el
+  # Materialize urls as a list.
+  urls = list(urls)
   # Hash urls and sort to have a consistent, but unbiased, selection when the
   # same urls exist for multiple lines.
-  sorted_urls = sorted(
+  skip_url = min(
       urls,
       key=lambda x: hashlib.md5(tf.compat.as_text(x).encode("utf-8")).
       hexdigest())
-  del sorted_urls[:skip_n]
-  if sorted_urls:
-    counter_inc_fn("emitted-lines-duplicate")
-  for url in sorted_urls:
+  for url in urls:
+    if url != skip_url:
       yield url, line
+  counter_inc_fn("emitted-line-duplicate", amt=len(urls)-1)
 
 
 def _remove_lines_from_text(
@@ -398,11 +421,11 @@ def dedupe_urls(el):
 
 
 def is_valid_length(el, max_length=1.9e5):
-  """Returns False iff page's content is too long."""
+  """Returns False iff page's text is too long."""
   counter_inc_fn = get_counter_inc_fn("is-valid-length")
-  _, content = el
-  if len(content) > max_length:
-    counter_inc_fn("filtered-url-contenttoolong")
+  _, page = el
+  if len(page["text"]) > max_length:
+    counter_inc_fn("filtered-page-contenttoolong")
     return False
   counter_inc_fn("valid-length")
   return True
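
The `_emit_line_to_urls` rewrite above is the efficiency fix named in the commit message: instead of sorting every URL list just to drop one element, the pipeline picks the single URL with the smallest MD5 digest as the keeper and emits a removal record for every other URL, and the duplicate counter is bumped once with `amt=len(urls)-1` instead of once per group. A standalone sketch of that selection, assuming plain Python strings rather than Beam elements (the function name, signature, and example URLs below are invented for illustration):

```python
import hashlib


def emit_line_to_urls(line, urls):
  """Sketch of the reworked dedup step: yield (url, line) for every URL
  except the one chosen to keep the duplicated line."""
  urls = list(urls)  # The pipeline hands in an iterable; materialize it once.
  # A single min() pass replaces the old full sort; the URL with the smallest
  # MD5 digest is the consistent, but unbiased, keeper of the line.
  skip_url = min(
      urls, key=lambda u: hashlib.md5(u.encode("utf-8")).hexdigest())
  for url in urls:
    if url != skip_url:
      yield url, line  # Downstream, this URL drops the line as a duplicate.


# Invented example: the same line appears on three pages; two of them get
# removal records, one keeps the line.
dupes = ["http://a.example/page", "http://b.example/page",
         "http://c.example/page"]
for url, line in emit_line_to_urls("this exact sentence is repeated.", dupes):
  print(url, "drops:", line)
```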

tensorflow_datasets/text/c4_utils_test.py

Lines changed: 6 additions & 6 deletions
@@ -98,7 +98,7 @@ def test_clean_page_toofewsentences(self):
     self.assertEqual(None, clean_en)
     self.assertEqual({
         "lines-valid": 2,
-        "filtered-url-toofewsentences": 1
+        "filtered-page-toofewsentences": 1
     }, dict(counters))
 
   def test_clean_page_squigglybracket(self):
@@ -114,7 +114,7 @@ def test_clean_page_squigglybracket(self):
     })
     self.assertEqual(None, clean_en)
     self.assertEqual({
-        "filtered-url-squigglybracket": 1,
+        "filtered-page-squigglybracket": 1,
         "lines-valid": 3
     }, dict(counters))
 
@@ -130,7 +130,7 @@ def test_clean_page_loremipsum(self):
         "timestamp": FAKE_TIMESTAMP
     })
     self.assertEqual(None, clean_en)
-    self.assertEqual({"filtered-url-loremipsum": 1}, dict(counters))
+    self.assertEqual({"filtered-page-loremipsum": 1}, dict(counters))
 
   def test_clean_page_badwords(self):
     padding_text = """This page starts out with some text.
@@ -155,15 +155,15 @@ def test_clean_page_badwords(self):
         },
         {
             "lines-valid": 3,
-            "filtered-url-badword": 1
+            "filtered-page-badword": 1
         },
         {
             "lines-valid": 3,
-            "filtered-url-badword": 1
+            "filtered-page-badword": 1
         },
         {
             "lines-valid": 3,
-            "filtered-url-badword": 1
+            "filtered-page-badword": 1
         },
     ]
     for final_sentence, output_should_be_none, expected_counter in zip(
