 25 |  25 | 
 26 |  26 | import six
 27 |  27 | from tensorflow_datasets import testing
    |  28 | +from tensorflow_datasets.core.lazy_imports_lib import lazy_imports
 28 |  29 | from tensorflow_datasets.text import c4_utils
 29 |  30 | 
 30 |  31 | EN_TEXT = """This line has enough words and ends in punctuation, Dr. Roberts!
@@ -230,125 +231,45 @@ def test_clean_page_policy(self):
230 | 231 |     self.assertEqual(expected_clean_text, out["text"])
231 | 232 |     self.assertEqual(expected_counters, dict(counters))
232 | 233 | 
233 |     | -  def test_emit_url_to_sentences(self):
234 |     | -    # Try with punkt language (en).
235 |     | -    expected_sentences = (
236 |     | -        ("this line has enough words and ends in punctuation, dr. roberts!",),
237 |     | -        ("\"open access.", "powered by scholars.",
238 |     | -         "published by universities.\""),
239 |     | -        ("sentence 1.", "sentence 2.", "sentence 3."),
240 |     | -        ("sentence 2.", "sentence 3.", "sentence 4."),
241 |     | -        ("another sentence.", ".", "."),
242 |     | -    )
243 |     | -    results = c4_utils._emit_url_to_sentences(("url", {
244 |     | -        "text":
245 |     | -            EXPECTED_CLEAN_EN +
246 |     | -            "\nSentence 1. Sentence 2. Sentence 3. Sentence 4."
247 |     | -            "\nAnother sentence. . . ? ? ! ! !",
248 |     | -        "content-type":
249 |     | -            FAKE_CONTENT_TYPE,
250 |     | -        "content-length":
251 |     | -            FAKE_CONTENT_LENGTH,
252 |     | -        "timestamp":
253 |     | -            FAKE_TIMESTAMP
254 |     | -    }),
255 |     | -                                              max_window_size=3)
256 |     | -    ret_sentences, ret_urls = zip(*results)
257 |     | -    self.assertEqual(("url",) * len(expected_sentences), ret_urls)
258 |     | -    self.assertEqual(expected_sentences, ret_sentences)
259 |     | -
260 |     | -  def test_emit_sentences_to_urls(self):
261 |     | -    counters, counter_inc_fn = _get_counters()
262 |     | -    urls = ["urlA", "urlB", "urlC", "urlD"]
263 |     | -    sentence = "test sentence."
264 |     | -    expected_urls = ("urlA", "urlD")
265 |     | -    results = c4_utils._emit_sentences_to_urls((sentence, urls),
266 |     | -                                               counter_inc_fn,
267 |     | -                                               skip_n=2)
268 |     | -    ret_urls, ret_sentences = zip(*results)
269 |     | -    self.assertEqual(expected_urls, ret_urls)
270 |     | -    self.assertEqual((sentence,) * 2, ret_sentences)
271 |     | -    self.assertEqual({"emitted-sentences-duplicate": 1}, dict(counters))
272 |     | -
273 |     | -  def test_remove_sentences_from_page(self):
274 |     | -    counters, counter_inc_fn = _get_counters()
275 |     | -    sentences_to_remove = [
276 |     | -        ("this line has enough words and ends in punctuation, dr. roberts!",),
277 |     | -        ("sentence 1.", "sentence 2.", "sentence 3."),
278 |     | -        ("sentence 3.", "sentence 4."),  # no match
279 |     | -        ("sentence 1.", "sentence 3.", "sentence 4."),  # no match
280 |     | -        ("sentence 3.", "sentence 4.", "sentence 5."),  # no match
    | 234 | +  def test_remove_duplicate_text(self):
    | 235 | +    import apache_beam.testing.util as beam_testing_util  # pylint:disable=g-import-not-at-top
    | 236 | +    beam = lazy_imports.apache_beam
    | 237 | +    input_urls_and_text = [
    | 238 | +        ("url/1-0",
    | 239 | +         "This is a duplicated line.\nThis is a unique line.\n"
    | 240 | +         "This one comes first and so it stays."),
    | 241 | +        ("url/2-1",
    | 242 | +         "This is 2nd unique line.\nThis one comes second so it is removed "
    | 243 | +         "even though the capitalization is different.\n"
    | 244 | +         "this is a Duplicated line. "),
    | 245 | +        ("url/3-4",
    | 246 | +         "This is a 3rd unique line.\nThis is a duplicated line.\n"
    | 247 | +         "This one comes third and so it is removed. But the page stays "
    | 248 | +         "because there are still 3 sentences remaining."),
    | 249 | +        ("url/4-4",
    | 250 | +         "This is a 4th unique line.\nThis is a duplicated line.\n"
    | 251 | +         "This one comes third and so it is removed, and the page is too "
    | 252 | +         "since there aren't enough sentences left."),
281 | 253 |     ]
282 |     | -    text = (
283 |     | -        EXPECTED_CLEAN_EN + "\nSentence 1. Sentence 2. Sentence 3. Sentence 4.")
284 |     | -    expected_features = {
285 |     | -        "text": ("\"Open Access. Powered by Scholars. "
286 |     | -                 "Published by Universities.\"\nSentence 4."),
287 |     | -        "content-type": FAKE_CONTENT_TYPE,
288 |     | -        "content-length": FAKE_CONTENT_LENGTH,
289 |     | -        "timestamp": FAKE_TIMESTAMP
290 |     | -    }
291 |     | -    result = list(
292 |     | -        c4_utils._remove_sentences_from_text(("url", {
293 |     | -            "features": [{
294 |     | -                "text": text,
295 |     | -                "content-type": FAKE_CONTENT_TYPE,
296 |     | -                "content-length": FAKE_CONTENT_LENGTH,
297 |     | -                "timestamp": FAKE_TIMESTAMP
298 |     | -            }],
299 |     | -            "sentences": sentences_to_remove
300 |     | -        }),
301 |     | -                                             max_window_size=3,
302 |     | -                                             counter_inc_fn=counter_inc_fn))
303 |     | -    self.assertEqual([("url", expected_features)], result)
304 |     | -    self.assertEqual({"filtered-sentence-duplicate": 4}, dict(counters))
305 |     | -
306 |     | -    counters.clear()
307 |     | -    sentences_to_remove.append(("sentence 2.", "sentence 3.", "sentence 4."))
308 |     | -    expected_features = {
309 |     | -        "text":
310 |     | -            ("\"Open Access. Powered by Scholars. Published by Universities.\""
311 |     | -            ),
312 |     | -        "content-type": FAKE_CONTENT_TYPE,
313 |     | -        "content-length": FAKE_CONTENT_LENGTH,
314 |     | -        "timestamp": FAKE_TIMESTAMP
315 |     | -    }
316 |     | -    result = list(
317 |     | -        c4_utils._remove_sentences_from_text(("url", {
318 |     | -            "features": [{
319 |     | -                "text": text,
320 |     | -                "content-type": FAKE_CONTENT_TYPE,
321 |     | -                "content-length": FAKE_CONTENT_LENGTH,
322 |     | -                "timestamp": FAKE_TIMESTAMP
323 |     | -            }],
324 |     | -            "sentences": sentences_to_remove
325 |     | -        }),
326 |     | -                                             counter_inc_fn=counter_inc_fn,
327 |     | -                                             max_window_size=3,
328 |     | -                                             min_num_sentences=3))
329 |     | -    self.assertEqual([("url", expected_features)], result)
330 |     | -    self.assertEqual({"filtered-sentence-duplicate": 5}, dict(counters))
331 |     | -
332 |     | -    counters.clear()
333 |     | -    result = list(
334 |     | -        c4_utils._remove_sentences_from_text(("url", {
335 |     | -            "features": [{
336 |     | -                "text": text,
337 |     | -                "content-type": FAKE_CONTENT_TYPE,
338 |     | -                "content-length": FAKE_CONTENT_LENGTH,
339 |     | -                "timestamp": FAKE_TIMESTAMP
340 |     | -            }],
341 |     | -            "sentences": sentences_to_remove
342 |     | -        }),
343 |     | -                                             counter_inc_fn=counter_inc_fn,
344 |     | -                                             max_window_size=3,
345 |     | -                                             min_num_sentences=4))
346 |     | -    self.assertEqual([], result)
347 |     | -    self.assertEqual(
348 |     | -        {
349 |     | -            "filtered-sentence-duplicate": 5,
350 |     | -            "filtered-doc-toofewsentences": 1
351 |     | -        }, dict(counters))
    | 254 | +    expected_urls_and_text = [
    | 255 | +        ("url/1-0",
    | 256 | +         "This is a duplicated line.\nThis is a unique line.\n"
    | 257 | +         "This one comes first and so it stays."),
    | 258 | +        ("url/3-4",
    | 259 | +         "This is a 3rd unique line.\n"
    | 260 | +         "This one comes third and so it is removed. But the page stays "
    | 261 | +         "because there are still 3 sentences remaining."),
    | 262 | +    ]
    | 263 | +    with beam.Pipeline() as pipeline:
    | 264 | +      pages = pipeline | beam.Create([
    | 265 | +          (url, {"text": text}) for url, text in input_urls_and_text
    | 266 | +      ])
    | 267 | +      deduped_pages = c4_utils.remove_duplicate_text(pages)
    | 268 | +      beam_testing_util.assert_that(
    | 269 | +          deduped_pages,
    | 270 | +          beam_testing_util.equal_to([
    | 271 | +              (url, {"text": text}) for url, text in expected_urls_and_text
    | 272 | +          ]))
352 | 273 | 
353 | 274 |   def test_split_wet_file(self):
354 | 275 |     if six.PY2: