Commit b8bc927

yaozhaogoogle authored and copybara-github committed
update CNN/DailyMail to cased version.
PiperOrigin-RevId: 291217261
1 parent 6fc3078 commit b8bc927
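
With this change the default version becomes 3.0.0, so newly prepared data keeps the original casing of articles and highlights. A minimal sketch of how that surfaces to users (not part of this commit), assuming the standard tfds.load entry point and the default plain_text config; 2.0.0 remains listed under the supported versions in the diff below, so the older lowercased data can still be pinned explicitly:

import tensorflow_datasets as tfds

# Default now resolves to 3.0.0: article and highlight text keep their casing.
cased = tfds.load('cnn_dailymail/plain_text', split='train')

# Explicitly pin the previous release to get the lowercased text
# (assumes the 'name/config:version' syntax available in tfds.load).
lowercased = tfds.load('cnn_dailymail/plain_text:2.0.0', split='train')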

File tree

2 files changed (+47, -36 lines)

tensorflow_datasets/summarization/cnn_dailymail.py

Lines changed: 42 additions & 31 deletions
@@ -23,7 +23,6 @@
 import tensorflow.compat.v2 as tf
 import tensorflow_datasets.public_api as tfds
 
-
 _DESCRIPTION = """\
 CNN/DailyMail non-anonymized summarization dataset.
 
@@ -63,11 +62,16 @@
 
 _DL_URLS = {
     # pylint: disable=line-too-long
-    'cnn_stories': 'https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfTHk4NFg2SndKcjQ',
-    'dm_stories': 'https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfM1BxdkxVaTY2bWs',
-    'test_urls': 'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_test.txt',
-    'train_urls': 'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_train.txt',
-    'val_urls': 'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_val.txt',
+    'cnn_stories':
+        'https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfTHk4NFg2SndKcjQ',
+    'dm_stories':
+        'https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfM1BxdkxVaTY2bWs',
+    'test_urls':
+        'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_test.txt',
+    'train_urls':
+        'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_train.txt',
+    'val_urls':
+        'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_val.txt',
     # pylint: enable=line-too-long
 }
 
@@ -77,12 +81,14 @@
     tfds.core.Version('0.0.2', experiments={tfds.core.Experiment.S3: False}),
     # Same data as 0.0.2
     tfds.core.Version('1.0.0',
-                      'New split API (https://tensorflow.org/datasets/splits)')]
+                      'New split API (https://tensorflow.org/datasets/splits)'),
+    # Having the model predict newline separators makes it easier to evaluate
+    # using summary-level ROUGE.
+    tfds.core.Version('2.0.0', 'Separate target sentences with newline.')
+]
 
-# Having the model predict newline separators makes it easier to evaluate
-# using summary-level ROUGE.
-_DEFAULT_VERSION = tfds.core.Version('2.0.0',
-                                     'Separate target sentences with newline.')
+# Using cased version.
+_DEFAULT_VERSION = tfds.core.Version('3.0.0', 'Using cased version.')
 
 
 class CnnDailymailConfig(tfds.core.BuilderConfig):
@@ -109,6 +115,7 @@ def __init__(self, text_encoder_config=None, **kwargs):
 def _get_url_hashes(path):
   """Get hashes of urls in file."""
   urls = _read_text_file(path)
+
   def url_hash(u):
     h = hashlib.sha1()
     try:
@@ -117,6 +124,7 @@ def url_hash(u):
       logging.error('Cannot hash url: %s', u)
     h.update(u)
     return h.hexdigest()
+
   return {url_hash(u): True for u in urls}
 
 
@@ -158,8 +166,10 @@ def _subset_filenames(dl_paths, split):
 DM_SINGLE_CLOSE_QUOTE = u'\u2019'  # unicode
 DM_DOUBLE_CLOSE_QUOTE = u'\u201d'
 # acceptable ways to end a sentence
-END_TOKENS = ['.', '!', '?', '...', "'", '`', '"',
-              DM_SINGLE_CLOSE_QUOTE, DM_DOUBLE_CLOSE_QUOTE, ')']
+END_TOKENS = [
+    '.', '!', '?', '...', "'", '`', '"', DM_SINGLE_CLOSE_QUOTE,
+    DM_DOUBLE_CLOSE_QUOTE, ')'
+]
 
 
 def _read_text_file(text_file):
@@ -177,19 +187,22 @@ def _get_art_abs(story_file, tfds_version):
 
   lines = _read_text_file(story_file)
 
-  # Lowercase everything
-  lines = [line.lower() for line in lines]
+  # The github code lowercase the text and we removed it in 3.0.0.
 
   # Put periods on the ends of lines that are missing them
   # (this is a problem in the dataset because many image captions don't end in
   # periods; consequently they end up in the body of the article as run-on
   # sentences)
   def fix_missing_period(line):
     """Adds a period to a line that is missing a period."""
-    if '@highlight' in line: return line
-    if not line: return line
-    if line[-1] in END_TOKENS: return line
+    if '@highlight' in line:
+      return line
+    if not line:
+      return line
+    if line[-1] in END_TOKENS:
+      return line
     return line + ' .'
+
   lines = [fix_missing_period(line) for line in lines]
 
   # Separate out article and abstract sentences
@@ -247,10 +260,12 @@ def _info(self):
         builder=self,
         description=_DESCRIPTION,
         features=tfds.features.FeaturesDict({
-            _ARTICLE: tfds.features.Text(
-                encoder_config=self.builder_config.text_encoder_config),
-            _HIGHLIGHTS: tfds.features.Text(
-                encoder_config=self.builder_config.text_encoder_config),
+            _ARTICLE:
+                tfds.features.Text(
+                    encoder_config=self.builder_config.text_encoder_config),
+            _HIGHLIGHTS:
+                tfds.features.Text(
+                    encoder_config=self.builder_config.text_encoder_config),
         }),
         supervised_keys=(_ARTICLE, _HIGHLIGHTS),
         homepage='https://github.com/abisee/cnn-dailymail',
@@ -278,17 +293,16 @@ def _split_generators(self, dl_manager):
             name=tfds.Split.TRAIN,
             num_shards=100,
             gen_kwargs={'files': train_files}),
-
         tfds.core.SplitGenerator(
             name=tfds.Split.VALIDATION,
             num_shards=10,
-            gen_kwargs={'files': _subset_filenames(dl_paths,
-                                                   tfds.Split.VALIDATION)}),
+            gen_kwargs={
+                'files': _subset_filenames(dl_paths, tfds.Split.VALIDATION)
+            }),
         tfds.core.SplitGenerator(
             name=tfds.Split.TEST,
             num_shards=10,
-            gen_kwargs={'files': _subset_filenames(dl_paths,
-                                                   tfds.Split.TEST)})
+            gen_kwargs={'files': _subset_filenames(dl_paths, tfds.Split.TEST)})
     ]
 
   def _generate_examples(self, files):
@@ -297,7 +311,4 @@ def _generate_examples(self, files):
       if not article or not highlights:
         continue
       fname = os.path.basename(p)
-      yield fname, {
-          _ARTICLE: article,
-          _HIGHLIGHTS: highlights
-      }
+      yield fname, {_ARTICLE: article, _HIGHLIGHTS: highlights}
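
The version note in the diff above keeps highlights newline-separated because summary-level ROUGE is computed over sentence splits. A minimal sketch of that evaluation step (not part of this commit or of TFDS), assuming the separate rouge-score pip package as an example scorer; its rougeLsum metric splits reference and prediction on '\n':

from rouge_score import rouge_scorer

# One highlight sentence per line, matching the 2.0.0+ target format.
reference = 'Highlight one .\nHighlight two .'
prediction = 'Highlight one .\nSomething else .'

scorer = rouge_scorer.RougeScorer(['rougeLsum'], use_stemmer=True)
print(scorer.score(reference, prediction)['rougeLsum'])  # summary-level ROUGE-L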

tensorflow_datasets/summarization/cnn_dailymail_test.py

Lines changed: 5 additions & 5 deletions
@@ -56,18 +56,18 @@ def test_get_art_abs(self):
     with tempfile.NamedTemporaryFile(delete=True) as f:
       f.write(_STORY_FILE)
       f.flush()
-      article, abstract = cnn_dailymail._get_art_abs(
-          f.name, tfds.core.Version('1.0.0'))
-      self.assertEqual('some article. this is some article text.', article)
+      article, abstract = cnn_dailymail._get_art_abs(f.name,
+                                                     tfds.core.Version('1.0.0'))
+      self.assertEqual('Some article. This is some article text.', article)
       # This is a bit weird, but the original code at
       # https://github.com/abisee/cnn-dailymail/ adds space before period
       # for abstracts and we retain this behavior.
-      self.assertEqual('highlight text . highlight two . highlight three .',
+      self.assertEqual('highlight text . Highlight two . highlight Three .',
                        abstract)
 
       article, abstract = cnn_dailymail._get_art_abs(f.name,
                                                      tfds.core.Version('2.0.0'))
-      self.assertEqual('highlight text .\nhighlight two .\nhighlight three .',
+      self.assertEqual('highlight text .\nHighlight two .\nhighlight Three .',
                        abstract)
 
 