
Commit dc484fd

adarob authored and copybara-github committed
Add C4 validation set.

Changes to avoid hanging on DataFlow:
1. Filter candidate pages from C4 with any word > 1k characters.
2. Modify deduplication for efficiency.

PiperOrigin-RevId: 290978909
1 parent 81fb84c commit dc484fd
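
The first change is easy to picture outside the Beam pipeline: a line is dropped whenever any of its whitespace-separated words exceeds the length cap, which keeps pathological tokens away from the downstream processing that the commit message says was hanging on DataFlow. A minimal standalone sketch of that idea, assuming plain Python strings (the helper name and the 1000-character cap mirror `_MAX_WORD_LENGTH` in the c4_utils.py diff below; the sample lines are invented):

```python
_MAX_WORD_LENGTH = 1000  # Mirrors the constant added in c4_utils.py.


def line_has_too_long_word(line, max_word_length=_MAX_WORD_LENGTH):
  """Returns True if any whitespace-separated token is longer than the cap.

  Extremely long "words" (e.g. giant base64 blobs) are the kind of input the
  commit message flags as causing hangs on DataFlow.
  """
  return any(len(word) > max_word_length for word in line.split())


# Invented examples, just to show the behavior.
print(line_has_too_long_word("A normal sentence."))       # False
print(line_has_too_long_word("junk" + "x" * 2000 + "!"))  # True
```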

File tree

8 files changed: +85 -43 lines changed


tensorflow_datasets/testing/test_data/fake_examples/c4/cc_0.warc.wet

Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@ publisher: Common Crawl
 
 WARC/1.0
 WARC-Type: conversion
-WARC-Target-URI: http://fake/url/1
+WARC-Target-URI: http://fake.com/url/1
 WARC-Date: 2019-04-18T10:38:25Z
 WARC-Record-ID: <urn:uuid:23d84b81-bffc-4ed9-90ae-55a429dbf541>
 WARC-Refers-To: <urn:uuid:716838d3-9c47-4879-9767-a891453e47c0>
@@ -31,7 +31,7 @@ And here is some more text. One more sentence to make this long enough.
 
 WARC/1.0
 WARC-Type: conversion
-WARC-Target-URI: http://fake/url/2
+WARC-Target-URI: http://fake.com/url/2
 WARC-Date: 2019-04-18T11:05:28Z
 WARC-Record-ID: <urn:uuid:1c03ff34-fe12-4dd7-9857-149952095cbb>
 WARC-Refers-To: <urn:uuid:7da148a2-62dd-46ba-9d15-16d52764dc64>
Binary file not shown.

tensorflow_datasets/testing/test_data/fake_examples/c4/cc_1.warc.wet

Lines changed: 4 additions & 4 deletions
@@ -17,7 +17,7 @@ publisher: Common Crawl
 
 WARC/1.0
 WARC-Type: conversion
-WARC-Target-URI: http://fake/url/3
+WARC-Target-URI: http://fake.com/url/370
 WARC-Date: 2019-04-18T10:17:44Z
 WARC-Record-ID: <urn:uuid:82b6e950-7336-4422-a6e8-e982fb766421>
 WARC-Refers-To: <urn:uuid:741ebaf8-8ad6-4b15-80a9-5c70c2edab4d>
@@ -27,12 +27,12 @@ Content-Length: 6873
 
 Wow! This is so exciting!
 Here, we have a third. Yes, a third!
-THREE FAKE WEBPAGES, HA HA HA!
+THREE FAKE WEBPAGES, HA HA HA! Also, its URL makes it end up in validation.
 
 
 WARC/1.0
 WARC-Type: conversion
-WARC-Target-URI: http://fake/url/4
+WARC-Target-URI: http://fake.com/url/4
 WARC-Date: 2019-04-18T10:17:44Z
 WARC-Record-ID: <urn:uuid:82b6e950-7336-4422-a6e8-e982fb766421>
 WARC-Refers-To: <urn:uuid:741ebaf8-8ad6-4b15-80a9-5c70c2edab4d>
@@ -47,7 +47,7 @@ What is the badword? It is "butt".
 
 WARC/1.0
 WARC-Type: conversion
-WARC-Target-URI: http://fake/url/1
+WARC-Target-URI: http://fake.com/url/1
 WARC-Date: 2019-04-18T10:38:25Z
 WARC-Record-ID: <urn:uuid:23d84b81-bffc-4ed9-90ae-55a429dbf541>
 WARC-Refers-To: <urn:uuid:716838d3-9c47-4879-9767-a891453e47c0>
Binary file not shown.

tensorflow_datasets/text/c4.py

Lines changed: 30 additions & 13 deletions
@@ -47,8 +47,7 @@
   eprint = {1910.10683},
 }
 """
-_VERSION = tfds.core.Version(
-    "2.0.0", "New split API (https://tensorflow.org/datasets/splits)")
+_VERSION = tfds.core.Version("2.2.0")
 
 _SUPPORTED_VERSIONS = [
     tfds.core.Version(
@@ -174,7 +173,7 @@ def _info(self):
         "https://github.com/google-research/text-to-text-transfer-transformer#datasets",
     )
 
-  def _split_generators(self, dl_manager):
+  def _split_generators(self, dl_manager, pipeline):
     dl_manager.download_checksums(_CHECKSUMS_URL)
 
     # We will automatically down the default CC version(s), but others need to
@@ -223,15 +222,28 @@ def _split_generators(self, dl_manager):
                    len(wet_files), cc_version)
       file_paths["wet_files"].extend(wet_files)
 
+    page_content_pcollection = self._get_page_content(pipeline, file_paths)
     return [
         tfds.core.SplitGenerator(
             name=tfds.Split.TRAIN,
-            gen_kwargs={"file_paths": file_paths},
-        )
+            gen_kwargs=dict(
+                split="train",
+                page_content=page_content_pcollection,
+                hashed_url_predicate=lambda x: x % 1000 != 0  # 99.9%
+            ),
+        ),
+        tfds.core.SplitGenerator(
+            name=tfds.Split.VALIDATION,
+            gen_kwargs=dict(
+                split="validation",
+                page_content=page_content_pcollection,
+                hashed_url_predicate=lambda x: x % 1000 == 0  # 0.1%
+            ),
+        ),
     ]
 
-  def _build_pcollection(self, pipeline, file_paths):
-    """Build PCollection of examples in the raw (text) form."""
+  def _get_page_content(self, pipeline, file_paths):
+    """Build PCollection of un-split page content."""
     beam = tfds.core.lazy_imports.apache_beam
 
     # Parse WET files and filter by length.
@@ -294,11 +306,14 @@ def _build_pcollection(self, pipeline, file_paths):
     page_content |= beam.Filter(
         c4_utils.is_language, language=self.builder_config.lang)
 
-    # Emit final examples.
-    # Output: {"url": url, "text": text, "content-type": content-type,\
-    #          "content-length": content-length, "timestamp": timestamp}
+    return page_content
+
+  def _build_pcollection(
+      self, unused_pipeline, split, page_content, hashed_url_predicate):
+    beam = tfds.core.lazy_imports.apache_beam
+
     def _emit_examples(el):
-      c4_utils.get_counter_inc_fn("emit-examples")("emitted")
+      c4_utils.get_counter_inc_fn(split)("examples")
       _, features = el
       return features["url"], {
           "url": features["url"],
@@ -307,5 +322,7 @@ def _emit_examples(el):
           "content-length": features["content-length"],
           "timestamp": features["timestamp"]
       }
-
-    return page_content | beam.Map(_emit_examples)
+    return (page_content
+            | beam.Filter(
+                c4_utils.get_hashed_url_filter_fn(hashed_url_predicate))
+            | beam.Map(_emit_examples))
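
The split routing above needs no randomness: each page's URL is hashed, and the hash modulo 1000 decides the split, so roughly 0.1% of pages land in validation and the assignment is stable across reruns. A standalone sketch of that predicate logic, assuming plain Python strings rather than Beam PCollection elements (the function name below is made up; the real pipeline composes `get_hashed_url_filter_fn` with the lambdas passed from `_split_generators`, and whether a given URL lands in validation depends on its actual MD5 digest):

```python
import hashlib


def hashed_url_split(url):
  """Sketch of the routing: MD5 the URL, bucket 0 of 1000 -> validation."""
  val = int(hashlib.md5(url.encode("utf-8")).hexdigest(), 16)
  return "validation" if val % 1000 == 0 else "train"


# URLs taken from the fake WET files above; the printed split is simply
# whatever bucket each URL's digest falls into.
for url in ["http://fake.com/url/1", "http://fake.com/url/4",
            "http://fake.com/url/370"]:
  print(url, "->", hashed_url_split(url))
```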

tensorflow_datasets/text/c4_test.py

Lines changed: 4 additions & 2 deletions
@@ -39,15 +39,17 @@ class C4Test(testing.DatasetBuilderTestCase):
       "badwords": "badwords.txt",
   }
   SPLITS = {
-      "train": 2,
+      "train": 1,
+      "validation": 1,
   }
 
 
 class C4NoCleanTest(C4Test):
   # GzipFile + GFile and TextIOWrapper are broken for py2.
   BUILDER_CONFIG_NAMES_TO_TEST = ["en.noclean"] if six.PY3 else []
   SPLITS = {
-      "train": 4,
+      "train": 3,
+      "validation": 1,
   }
 
tensorflow_datasets/text/c4_utils.py

Lines changed: 39 additions & 16 deletions
@@ -42,6 +42,7 @@
 # Filters
 _MIN_WORDS_PER_LINE = 5
 _MIN_NUM_SENTENCES = 3
+_MAX_WORD_LENGTH = 1000
 _END_MARKS = (".", "?", "!", "\"")
 _ELLIPSIS = "..."
 _POLICY_SUBSTRINGS = [
@@ -59,6 +60,15 @@ def counter_inc_fn(counter, amt=1):
   return counter_inc_fn
 
 
+def get_hashed_url_filter_fn(predicate_fn):
+  def filter_fn(el):
+    url, _ = el
+    val = int(
+        hashlib.md5(tf.compat.as_text(url).encode("utf-8")).hexdigest(), 16)
+    return predicate_fn(val)
+  return filter_fn
+
+
 def _load_sentence_tokenizer():
   """Returns a sentence tokenization function."""
   nltk = tfds.core.lazy_imports.nltk
@@ -134,7 +144,8 @@ def clean_page(url_and_features,
                badwords_regex=None,
                counter_inc_fn=None,
                min_words_per_line=_MIN_WORDS_PER_LINE,
-               min_num_sentences=_MIN_NUM_SENTENCES):
+               min_num_sentences=_MIN_NUM_SENTENCES,
+               max_word_length=_MAX_WORD_LENGTH):
   """Cleans a CommonCrawl page, yielding nothing if it should be skipped.
 
   Cleaning removes lines with no end marks or with too few words. After line
@@ -152,6 +163,8 @@ def clean_page(url_and_features,
       removed.
     min_num_sentences: int, the minimum number of sentences a page needs to not
       be skipped.
+    max_word_length: int, the maximum number of characters allowed in a word.
+      Lines containing a word with too many characters are removed.
   Yields:
     The url and cleaned text for the page.
   """
@@ -165,8 +178,17 @@ def clean_page(url_and_features,
   valid_lines = []
   num_sentences = 0
 
+  def line_has_too_long_word(line):
+    for word in line.split():
+      if len(word) > max_word_length:
+        return True
+    return False
+
   for line in lines:
     line = line.strip()
+    if line_has_too_long_word(line):
+      counter_inc_fn("lines-with-too-long-word")
+      continue
     line = citation_regex.sub("", line)
     if not line.endswith(_END_MARKS) or line.endswith(_ELLIPSIS):
       counter_inc_fn("lines-no-endmark")
@@ -177,15 +199,15 @@ def clean_page(url_and_features,
     line_lower = line.lower()
     # Remove documents which contain lorem ipsum
     if "lorem ipsum" in line_lower:
-      counter_inc_fn("filtered-url-loremipsum")
+      counter_inc_fn("filtered-page-loremipsum")
       return
     # Remove "javascript must be enabled" notices
     if "javascript" in line_lower:
       counter_inc_fn("lines-javascript")
       continue
     # Remove docs which probably contain javascript code
     if "{" in line:
-      counter_inc_fn("filtered-url-squigglybracket")
+      counter_inc_fn("filtered-page-squigglybracket")
       return
     # Remove policy lines
     if any(p in line_lower for p in _POLICY_SUBSTRINGS):
@@ -195,14 +217,14 @@ def clean_page(url_and_features,
     if badwords_regex:
       badwords_found = badwords_regex.search(line_lower)
       if badwords_found is not None:
-        counter_inc_fn("filtered-url-badword")
+        counter_inc_fn("filtered-page-badword")
         return
     num_sentences += len(_get_sentences(line))
     valid_lines.append(line)
     counter_inc_fn("lines-valid")
 
   if num_sentences < min_num_sentences:
-    counter_inc_fn("filtered-url-toofewsentences")
+    counter_inc_fn("filtered-page-toofewsentences")
     return
   counter_inc_fn("emitted-clean-pages")
   features["text"] = "\n".join(valid_lines).strip()
@@ -223,20 +245,21 @@ def _emit_url_to_lines(page):
     yield _hash_line(line), url
 
 
-def _emit_line_to_urls(el, counter_inc_fn, skip_n=1):
-  """Emits (hashed) line to all but `skip_n` urls."""
+def _emit_line_to_urls(el, counter_inc_fn):
+  """Emits (hashed) line to all but one url."""
   line, urls = el
+  # Materialize urls as a list.
+  urls = list(urls)
   # Hash urls and sort to have a consistent, but unbiased, selection when the
   # same urls exist for multiple lines.
-  sorted_urls = sorted(
+  skip_url = min(
       urls,
       key=lambda x: hashlib.md5(tf.compat.as_text(x).encode("utf-8")).
       hexdigest())
-  del sorted_urls[:skip_n]
-  if sorted_urls:
-    counter_inc_fn("emitted-lines-duplicate")
-  for url in sorted_urls:
+  for url in urls:
+    if url != skip_url:
       yield url, line
+  counter_inc_fn("emitted-line-duplicate", amt=len(urls)-1)
 
 
 def _remove_lines_from_text(
@@ -398,11 +421,11 @@ def dedupe_urls(el):
 
 
 def is_valid_length(el, max_length=1.9e5):
-  """Returns False iff page's content is too long."""
+  """Returns False iff page's text is too long."""
   counter_inc_fn = get_counter_inc_fn("is-valid-length")
-  _, content = el
-  if len(content) > max_length:
-    counter_inc_fn("filtered-url-contenttoolong")
+  _, page = el
+  if len(page["text"]) > max_length:
+    counter_inc_fn("filtered-page-contenttoolong")
     return False
   counter_inc_fn("valid-length")
   return True
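
The `_emit_line_to_urls` rewrite above is the efficiency fix named in the commit message: instead of sorting every URL list just to drop one element, the pipeline picks the single URL with the smallest MD5 digest as the keeper and emits a removal record for every other URL, and the duplicate counter is bumped once with `amt=len(urls)-1` instead of once per group. A standalone sketch of that selection, assuming plain Python strings rather than Beam elements (the function name, signature, and example URLs below are invented for illustration):

```python
import hashlib


def emit_line_to_urls(line, urls):
  """Sketch of the reworked dedup step: yield (url, line) for every URL
  except the one chosen to keep the duplicated line."""
  urls = list(urls)  # The pipeline hands in an iterable; materialize it once.
  # A single min() pass replaces the old full sort; the URL with the smallest
  # MD5 digest is the consistent, but unbiased, keeper of the line.
  skip_url = min(
      urls, key=lambda u: hashlib.md5(u.encode("utf-8")).hexdigest())
  for url in urls:
    if url != skip_url:
      yield url, line  # Downstream, this URL drops the line as a duplicate.


# Invented example: the same line appears on three pages; two of them get
# removal records, one keeps the line.
dupes = ["http://a.example/page", "http://b.example/page",
         "http://c.example/page"]
for url, line in emit_line_to_urls("this exact sentence is repeated.", dupes):
  print(url, "drops:", line)
```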

tensorflow_datasets/text/c4_utils_test.py

Lines changed: 6 additions & 6 deletions
@@ -98,7 +98,7 @@ def test_clean_page_toofewsentences(self):
     self.assertEqual(None, clean_en)
     self.assertEqual({
         "lines-valid": 2,
-        "filtered-url-toofewsentences": 1
+        "filtered-page-toofewsentences": 1
     }, dict(counters))
 
   def test_clean_page_squigglybracket(self):
@@ -114,7 +114,7 @@ def test_clean_page_squigglybracket(self):
     })
     self.assertEqual(None, clean_en)
     self.assertEqual({
-        "filtered-url-squigglybracket": 1,
+        "filtered-page-squigglybracket": 1,
         "lines-valid": 3
     }, dict(counters))
 
@@ -130,7 +130,7 @@ def test_clean_page_loremipsum(self):
         "timestamp": FAKE_TIMESTAMP
     })
     self.assertEqual(None, clean_en)
-    self.assertEqual({"filtered-url-loremipsum": 1}, dict(counters))
+    self.assertEqual({"filtered-page-loremipsum": 1}, dict(counters))
 
   def test_clean_page_badwords(self):
     padding_text = """This page starts out with some text.
@@ -155,15 +155,15 @@ def test_clean_page_badwords(self):
         },
         {
             "lines-valid": 3,
-            "filtered-url-badword": 1
+            "filtered-page-badword": 1
         },
         {
             "lines-valid": 3,
-            "filtered-url-badword": 1
+            "filtered-page-badword": 1
         },
         {
             "lines-valid": 3,
-            "filtered-url-badword": 1
+            "filtered-page-badword": 1
         },
     ]
     for final_sentence, output_should_be_none, expected_counter in zip(
