
Commit 42f5bf8

adarob authored and copybara-github committed
Reduce disk usage for C4 generation by deduping on lines (instead of sentence windows) and using hashed text as key.
PiperOrigin-RevId: 281192218
1 parent bda77c7 commit 42f5bf8
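The disk saving comes from two changes visible in the diff below: each page now emits one record per line rather than one per sliding window of sentences, and the shuffle key is a fixed-size MD5 digest of the normalized line rather than the text itself. A minimal sketch of the hashed-key idea (the sample line is purely illustrative):

import hashlib

line = "Published by Universities."
# Normalize roughly the way the pipeline does (strip + lower-case), then hash.
key = hashlib.md5(line.strip().lower().encode("utf-8")).hexdigest()
# The key is always a 32-character hex digest, no matter how long the line is.
print(key, len(key))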

3 files changed: +93, -183 lines changed


tensorflow_datasets/text/c4.py

Lines changed: 4 additions & 2 deletions
@@ -48,9 +48,11 @@
   }
   """
 _VERSION = tfds.core.Version(
-    "1.0.1", experiments={tfds.core.Experiment.S3: False})
+    "1.1.0", experiments={tfds.core.Experiment.S3: False})
 _SUPPORTED_VERSIONS = [
-    tfds.core.Version("1.0.0", experiments={tfds.core.Experiment.S3: False})]
+    tfds.core.Version("1.0.0", experiments={tfds.core.Experiment.S3: False}),
+    tfds.core.Version("1.0.1", experiments={tfds.core.Experiment.S3: False}),
+]
 
 _DOWNLOAD_HOST = "https://commoncrawl.s3.amazonaws.com"
 _WET_PATH_URL = "https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-{cc_version}/wet.paths.gz"
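The default version therefore moves to 1.1.0 (line-based dedup), while 1.0.0 and 1.0.1 remain listed in `_SUPPORTED_VERSIONS`. A sketch of how a caller could pin the older output, assuming the standard TFDS `name/config:version` selector and that data for that version has already been generated:

import tensorflow_datasets as tfds

# Default now resolves to the new 1.1.0 line-deduped data.
builder = tfds.builder("c4/en")

# Explicitly request the previous sentence-window dedup output instead.
legacy_builder = tfds.builder("c4/en:1.0.1")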

tensorflow_datasets/text/c4_utils.py

Lines changed: 50 additions & 63 deletions
@@ -209,115 +209,102 @@ def clean_page(url_and_features,
     yield url, features
 
 
-def _emit_url_to_sentences(page, max_window_size):
-  """Emits url to all (lower-cased) sentences grouped by sliding window."""
+def _hash_line(line):
+  m = hashlib.md5()
+  m.update(tf.compat.as_text(line).encode("utf-8").strip().lower())
+  return m.hexdigest()
+
+
+def _emit_url_to_lines(page):
+  """Emits url to all (lower-cased, hashed) lines."""
   url, features = page
   text = features["text"]
-  for sentences in _get_sentences_by_line(text, lower=True):
-    # We don't want to emit windows where all "sentences" are just endmarks
-    # (e.g., "! ! !").
-    is_solo_endmark = [w in _END_MARKS for w in sentences]
-    for i in range(len(sentences) - min(len(sentences), max_window_size) + 1):
-      if not all(is_solo_endmark[i:i+max_window_size]):
-        yield tuple(sentences[i:i+max_window_size]), url
-
-
-def _emit_sentences_to_urls(el, counter_inc_fn, skip_n=1):
-  """Emits sentences to all but `skip_n` urls."""
-  sentences, urls = el
+  for line in text.split("\n"):
+    yield _hash_line(line), url
+
+
+def _emit_line_to_urls(el, counter_inc_fn, skip_n=1):
+  """Emits (hashed) line to all but `skip_n` urls."""
+  line, urls = el
   # Hash urls and sort to have a consistent, but unbiased, selection when the
-  # same urls exist for multiple sentences.
+  # same urls exist for multiple lines.
   sorted_urls = sorted(
       urls,
       key=lambda x: hashlib.md5(tf.compat.as_text(x).encode("utf-8")).
      hexdigest())
   del sorted_urls[:skip_n]
   if sorted_urls:
-    counter_inc_fn("emitted-sentences-duplicate")
-    logging.info(
-        "Emitting sentences to %d documents: %s", len(sorted_urls), sentences)
+    counter_inc_fn("emitted-lines-duplicate")
   for url in sorted_urls:
-    yield url, sentences
+    yield url, line
 
 
-def _remove_sentences_from_text(
-    el, counter_inc_fn, max_window_size,
-    min_num_sentences=_MIN_NUM_SENTENCES):
-  """Removes matching sentence windows from the page.
+def _remove_lines_from_text(
+    el, counter_inc_fn, min_num_sentences=_MIN_NUM_SENTENCES):
+  """Removes matching lines from the page.
 
   Process the result of a join containing a single value for 'features' and zero
-  or more values for 'sentences'. Each value in 'sentences' is a tuple
-  containing a window of one or more sentences.
+  or more values for 'lines'. Each value in 'lines' is a lower-cased, hashed
+  line.
 
   If a line has fewer sentences than `max_window_size`, the full line is
   compared for a match.
 
   Args:
-    el: `(string, {'features': [string], 'sentences': [tuple(string)]})`,
+    el: `(string, {'features': features_dict, 'lines': [string]})`,
       element containing the result of a join on key with both the page text
-      and lower-cased sentence windows to remove.
+      and lower-cased, hashed lines to remove.
     counter_inc_fn: function, a function taking the name of a counter to be
      incremented and the (optional) amount.
-    max_window_size: int, the maximum size of a sentence window to slide across
-      lines.
    min_num_sentences: int, the minimum number of sentences a page needs to not
      be skipped.
 
   Yields:
     url: The URL of the page.
-    features: The page features with sentences removed.
+    features: The page features with lines removed from text.
   """
   url, join_values = el
   features = join_values["features"]
 
-  assert len(features) == 1, "Invalid page count (%d) for %s" % (len(features),
-                                                                 url)
+  assert len(features) == 1, "Invalid page count (%d) for %s" % (
+      len(features), url)
   features = features[0]
   text = features["text"]
-  sentences_to_remove = set(join_values["sentences"])
-  sentences_by_line = _get_sentences_by_line(text, lower=False)
-  new_sentences_by_line = []
-  for line_sentences in sentences_by_line:
-    indices_to_remove = set()
-    for i in range(
-        len(line_sentences) - min(len(line_sentences), max_window_size) + 1):
-      sentence_window = tuple(
-          s.lower() for s in line_sentences[i:i+max_window_size])
-      if sentence_window in sentences_to_remove:
-        indices_to_remove.update(range(i, i+len(sentence_window)))
-    counter_inc_fn("filtered-sentence-duplicate", len(indices_to_remove))
-    new_line_sentences = [
-        s for i, s in enumerate(line_sentences) if i not in indices_to_remove]
-    if new_line_sentences:
-      new_sentences_by_line.append(new_line_sentences)
-  if sum(len(sents) for sents in new_sentences_by_line) < min_num_sentences:
+  lines_to_remove = set(join_values["lines"])
+  new_lines = []
+  for line in text.split("\n"):
+    if _hash_line(line) in lines_to_remove:
+      counter_inc_fn("filtered-lines-duplicate")
+    else:
+      new_lines.append(line)
+  new_text = "\n".join(new_lines)
+  if len(_get_sentences(new_text)) < min_num_sentences:
     counter_inc_fn("filtered-doc-toofewsentences")
     return
-  features["text"] = "\n".join(" ".join(sent) for sent in new_sentences_by_line)
-  yield (url, features)
+  new_features = features.copy()
+  new_features["text"] = new_text
+  yield (url, new_features)
 
 
-def remove_duplicate_text(pages, sentence_window_size=3):
-  """Utility to remove duplicate sentence windows across text documents."""
-  # Output: url, sentence
+def remove_duplicate_text(pages):
+  """Utility to remove duplicate lines across text documents."""
+  # Output: url, lines
   beam = tfds.core.lazy_imports.apache_beam
-  counter_inc_fn = get_counter_inc_fn("dedupe-sentences")
-  sentences_to_remove = (
+  counter_inc_fn = get_counter_inc_fn("dedupe-lines")
+  lines_to_remove = (
       pages
-      | beam.FlatMap(_emit_url_to_sentences,
-                     max_window_size=sentence_window_size)
+      | beam.FlatMap(_emit_url_to_lines)
      | "group_sentences" >> beam.GroupByKey()
-      | beam.FlatMap(_emit_sentences_to_urls, counter_inc_fn=counter_inc_fn))
+      | beam.FlatMap(_emit_line_to_urls, counter_inc_fn=counter_inc_fn))
 
   # Output: url, text
   final_docs = ({
       "features": pages,
-      "sentences": sentences_to_remove
+      "lines": lines_to_remove
   }
-               | "group_text_and_sentences_by_url" >> beam.CoGroupByKey()
+               | "group_features_and_lines_by_url" >> beam.CoGroupByKey()
                | beam.FlatMap(
-                   _remove_sentences_from_text,
-                   max_window_size=sentence_window_size,
+                   _remove_lines_from_text,
                   counter_inc_fn=counter_inc_fn))
 
   return final_docs
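In plain terms, the new pipeline keys every line by an MD5 hash of its stripped, lower-cased text, keeps the line on the URL whose own MD5 hash sorts first, and drops it from every other page. A minimal, Beam-free sketch of that logic (helper names are illustrative, and the `min_num_sentences` page filter is omitted):

import hashlib

def hash_line(line):
  # Key on stripped, lower-cased text so trivially different copies collide.
  return hashlib.md5(line.strip().lower().encode("utf-8")).hexdigest()

def dedupe_lines(pages, skip_n=1):
  """pages: iterable of (url, text). Yields (url, text) with duplicate lines removed."""
  pages = list(pages)
  # Group URLs by hashed line, mirroring _emit_url_to_lines + GroupByKey.
  urls_by_line = {}
  for url, text in pages:
    for line in text.split("\n"):
      urls_by_line.setdefault(hash_line(line), []).append(url)
  # Keep each line on the first `skip_n` URLs (ordered by hashed URL) and mark
  # it for removal everywhere else, mirroring _emit_line_to_urls.
  to_remove = set()
  for line_hash, urls in urls_by_line.items():
    ordered = sorted(set(urls), key=lambda u: hashlib.md5(u.encode("utf-8")).hexdigest())
    for url in ordered[skip_n:]:
      to_remove.add((url, line_hash))
  # Rebuild each page without its removed lines, mirroring _remove_lines_from_text.
  for url, text in pages:
    kept = [line for line in text.split("\n") if (url, hash_line(line)) not in to_remove]
    yield url, "\n".join(kept)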

tensorflow_datasets/text/c4_utils_test.py

Lines changed: 39 additions & 118 deletions
@@ -25,6 +25,7 @@
 
 import six
 from tensorflow_datasets import testing
+from tensorflow_datasets.core.lazy_imports_lib import lazy_imports
 from tensorflow_datasets.text import c4_utils
 
 EN_TEXT = """This line has enough words and ends in punctuation, Dr. Roberts!
@@ -230,125 +231,45 @@ def test_clean_page_policy(self):
     self.assertEqual(expected_clean_text, out["text"])
     self.assertEqual(expected_counters, dict(counters))
 
-  def test_emit_url_to_sentences(self):
-    # Try with punkt language (en).
-    expected_sentences = (
-        ("this line has enough words and ends in punctuation, dr. roberts!",),
-        ("\"open access.", "powered by scholars.",
-         "published by universities.\""),
-        ("sentence 1.", "sentence 2.", "sentence 3."),
-        ("sentence 2.", "sentence 3.", "sentence 4."),
-        ("another sentence.", ".", "."),
-    )
-    results = c4_utils._emit_url_to_sentences(("url", {
-        "text":
-            EXPECTED_CLEAN_EN +
-            "\nSentence 1. Sentence 2. Sentence 3. Sentence 4."
-            "\nAnother sentence. . . ? ? ! ! !",
-        "content-type":
-            FAKE_CONTENT_TYPE,
-        "content-length":
-            FAKE_CONTENT_LENGTH,
-        "timestamp":
-            FAKE_TIMESTAMP
-    }),
-                                              max_window_size=3)
-    ret_sentences, ret_urls = zip(*results)
-    self.assertEqual(("url",) * len(expected_sentences), ret_urls)
-    self.assertEqual(expected_sentences, ret_sentences)
-
-  def test_emit_sentences_to_urls(self):
-    counters, counter_inc_fn = _get_counters()
-    urls = ["urlA", "urlB", "urlC", "urlD"]
-    sentence = "test sentence."
-    expected_urls = ("urlA", "urlD")
-    results = c4_utils._emit_sentences_to_urls((sentence, urls),
-                                               counter_inc_fn,
-                                               skip_n=2)
-    ret_urls, ret_sentences = zip(*results)
-    self.assertEqual(expected_urls, ret_urls)
-    self.assertEqual((sentence,) * 2, ret_sentences)
-    self.assertEqual({"emitted-sentences-duplicate": 1}, dict(counters))
-
-  def test_remove_sentences_from_page(self):
-    counters, counter_inc_fn = _get_counters()
-    sentences_to_remove = [
-        ("this line has enough words and ends in punctuation, dr. roberts!",),
-        ("sentence 1.", "sentence 2.", "sentence 3."),
-        ("sentence 3.", "sentence 4."),  # no match
-        ("sentence 1.", "sentence 3.", "sentence 4."),  # no match
-        ("sentence 3.", "sentence 4.", "sentence 5."),  # no match
+  def test_remove_duplicate_text(self):
+    import apache_beam.testing.util as beam_testing_util  # pylint:disable=g-import-not-at-top
+    beam = lazy_imports.apache_beam
+    input_urls_and_text = [
+        ("url/1-0",
+         "This is a duplicated line.\nThis is a unique line.\n"
+         "This one comes first and so it stays."),
+        ("url/2-1",
+         "This is 2nd unique line.\nThis one comes second so it is removed "
+         "even though the capitalizaiton is different.\n"
+         "this is a Duplicated line. "),
+        ("url/3-4",
+         "This is a 3rd unique line.\nThis is a duplicated line.\n"
+         "This one comes third and so it is removed. But the page stays "
+         "because there are still 3 sentences remaining."),
+        ("url/4-4",
+         "This is a 4th unique line.\nThis is a duplicated line.\n"
+         "This one comes third and so it is removed, and the page is too "
+         "since there aren't enough sentences left."),
     ]
-    text = (
-        EXPECTED_CLEAN_EN + "\nSentence 1. Sentence 2. Sentence 3. Sentence 4.")
-    expected_features = {
-        "text": ("\"Open Access. Powered by Scholars. "
-                 "Published by Universities.\"\nSentence 4."),
-        "content-type": FAKE_CONTENT_TYPE,
-        "content-length": FAKE_CONTENT_LENGTH,
-        "timestamp": FAKE_TIMESTAMP
-    }
-    result = list(
-        c4_utils._remove_sentences_from_text(("url", {
-            "features": [{
-                "text": text,
-                "content-type": FAKE_CONTENT_TYPE,
-                "content-length": FAKE_CONTENT_LENGTH,
-                "timestamp": FAKE_TIMESTAMP
-            }],
-            "sentences": sentences_to_remove
-        }),
-                                             max_window_size=3,
-                                             counter_inc_fn=counter_inc_fn))
-    self.assertEqual([("url", expected_features)], result)
-    self.assertEqual({"filtered-sentence-duplicate": 4}, dict(counters))
-
-    counters.clear()
-    sentences_to_remove.append(("sentence 2.", "sentence 3.", "sentence 4."))
-    expected_features = {
-        "text":
-            ("\"Open Access. Powered by Scholars. Published by Universities.\""
-            ),
-        "content-type": FAKE_CONTENT_TYPE,
-        "content-length": FAKE_CONTENT_LENGTH,
-        "timestamp": FAKE_TIMESTAMP
-    }
-    result = list(
-        c4_utils._remove_sentences_from_text(("url", {
-            "features": [{
-                "text": text,
-                "content-type": FAKE_CONTENT_TYPE,
-                "content-length": FAKE_CONTENT_LENGTH,
-                "timestamp": FAKE_TIMESTAMP
-            }],
-            "sentences": sentences_to_remove
-        }),
-                                             counter_inc_fn=counter_inc_fn,
-                                             max_window_size=3,
-                                             min_num_sentences=3))
-    self.assertEqual([("url", expected_features)], result)
-    self.assertEqual({"filtered-sentence-duplicate": 5}, dict(counters))
-
-    counters.clear()
-    result = list(
-        c4_utils._remove_sentences_from_text(("url", {
-            "features": [{
-                "text": text,
-                "content-type": FAKE_CONTENT_TYPE,
-                "content-length": FAKE_CONTENT_LENGTH,
-                "timestamp": FAKE_TIMESTAMP
-            }],
-            "sentences": sentences_to_remove
-        }),
-                                             counter_inc_fn=counter_inc_fn,
-                                             max_window_size=3,
-                                             min_num_sentences=4))
-    self.assertEqual([], result)
-    self.assertEqual(
-        {
-            "filtered-sentence-duplicate": 5,
-            "filtered-doc-toofewsentences": 1
-        }, dict(counters))
+    expected_urls_and_text = [
+        ("url/1-0",
+         "This is a duplicated line.\nThis is a unique line.\n"
+         "This one comes first and so it stays."),
+        ("url/3-4",
+         "This is a 3rd unique line.\n"
+         "This one comes third and so it is removed. But the page stays "
+         "because there are still 3 sentences remaining."),
+    ]
+    with beam.Pipeline() as pipeline:
+      pages = pipeline | beam.Create([
+          (url, {"text": text}) for url, text in input_urls_and_text
+      ])
+      deduped_pages = c4_utils.remove_duplicate_text(pages)
+      beam_testing_util.assert_that(
+          deduped_pages,
+          beam_testing_util.equal_to([
+              (url, {"text": text}) for url, text in expected_urls_and_text
+          ]))
 
   def test_split_wet_file(self):
     if six.PY2:
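The new test runs the whole dedup pipeline in memory: `assert_that` from `apache_beam.testing.util` attaches a verification step to the pipeline, and `equal_to` checks the resulting PCollection against the expected pairs (order-independently) when the pipeline executes at the end of the `with` block. A minimal, self-contained sketch of that testing pattern, using toy data unrelated to C4:

import apache_beam as beam
from apache_beam.testing.util import assert_that, equal_to

with beam.Pipeline() as pipeline:
  # Build a small PCollection and apply the transform under test.
  squares = (pipeline
             | beam.Create([1, 2, 3])
             | beam.Map(lambda x: x * x))
  # The assertion runs when the pipeline finishes; it raises if the contents differ.
  assert_that(squares, equal_to([1, 4, 9]))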
