Commit b8bc927

yaozhaogoogle authored and copybara-github committed
update CNN/DailyMail to cased version.
PiperOrigin-RevId: 291217261
1 parent 6fc3078 commit b8bc927
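
With this change the default version becomes 3.0.0, so newly prepared data keeps the original casing of articles and highlights. A minimal sketch of how that surfaces to users (not part of this commit), assuming the standard tfds.load entry point and the default plain_text config; 2.0.0 remains listed under the supported versions in the diff below, so the older lowercased data can still be pinned explicitly:

import tensorflow_datasets as tfds

# Default now resolves to 3.0.0: article and highlight text keep their casing.
cased = tfds.load('cnn_dailymail/plain_text', split='train')

# Explicitly pin the previous release to get the lowercased text
# (assumes the 'name/config:version' syntax available in tfds.load).
lowercased = tfds.load('cnn_dailymail/plain_text:2.0.0', split='train')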

File tree

2 files changed (+47, -36 lines)

tensorflow_datasets/summarization/cnn_dailymail.py

Lines changed: 42 additions & 31 deletions
@@ -23,7 +23,6 @@
 import tensorflow.compat.v2 as tf
 import tensorflow_datasets.public_api as tfds
 
-
 _DESCRIPTION = """\
 CNN/DailyMail non-anonymized summarization dataset.
 
@@ -63,11 +62,16 @@
 
 _DL_URLS = {
     # pylint: disable=line-too-long
-    'cnn_stories': 'https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfTHk4NFg2SndKcjQ',
-    'dm_stories': 'https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfM1BxdkxVaTY2bWs',
-    'test_urls': 'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_test.txt',
-    'train_urls': 'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_train.txt',
-    'val_urls': 'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_val.txt',
+    'cnn_stories':
+        'https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfTHk4NFg2SndKcjQ',
+    'dm_stories':
+        'https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfM1BxdkxVaTY2bWs',
+    'test_urls':
+        'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_test.txt',
+    'train_urls':
+        'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_train.txt',
+    'val_urls':
+        'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_val.txt',
     # pylint: enable=line-too-long
 }
 
@@ -77,12 +81,14 @@
     tfds.core.Version('0.0.2', experiments={tfds.core.Experiment.S3: False}),
     # Same data as 0.0.2
     tfds.core.Version('1.0.0',
-                      'New split API (https://tensorflow.org/datasets/splits)')]
+                      'New split API (https://tensorflow.org/datasets/splits)'),
+    # Having the model predict newline separators makes it easier to evaluate
+    # using summary-level ROUGE.
+    tfds.core.Version('2.0.0', 'Separate target sentences with newline.')
+]
 
-# Having the model predict newline separators makes it easier to evaluate
-# using summary-level ROUGE.
-_DEFAULT_VERSION = tfds.core.Version('2.0.0',
-                                     'Separate target sentences with newline.')
+# Using cased version.
+_DEFAULT_VERSION = tfds.core.Version('3.0.0', 'Using cased version.')
 
 
 class CnnDailymailConfig(tfds.core.BuilderConfig):
@@ -109,6 +115,7 @@ def __init__(self, text_encoder_config=None, **kwargs):
 def _get_url_hashes(path):
   """Get hashes of urls in file."""
   urls = _read_text_file(path)
+
   def url_hash(u):
     h = hashlib.sha1()
     try:
@@ -117,6 +124,7 @@ def url_hash(u):
       logging.error('Cannot hash url: %s', u)
     h.update(u)
     return h.hexdigest()
+
   return {url_hash(u): True for u in urls}
 
 
@@ -158,8 +166,10 @@ def _subset_filenames(dl_paths, split):
 DM_SINGLE_CLOSE_QUOTE = u'\u2019'  # unicode
 DM_DOUBLE_CLOSE_QUOTE = u'\u201d'
 # acceptable ways to end a sentence
-END_TOKENS = ['.', '!', '?', '...', "'", '`', '"',
-              DM_SINGLE_CLOSE_QUOTE, DM_DOUBLE_CLOSE_QUOTE, ')']
+END_TOKENS = [
+    '.', '!', '?', '...', "'", '`', '"', DM_SINGLE_CLOSE_QUOTE,
+    DM_DOUBLE_CLOSE_QUOTE, ')'
+]
 
 
 def _read_text_file(text_file):
@@ -177,19 +187,22 @@ def _get_art_abs(story_file, tfds_version):
 
   lines = _read_text_file(story_file)
 
-  # Lowercase everything
-  lines = [line.lower() for line in lines]
+  # The github code lowercase the text and we removed it in 3.0.0.
 
   # Put periods on the ends of lines that are missing them
   # (this is a problem in the dataset because many image captions don't end in
   # periods; consequently they end up in the body of the article as run-on
   # sentences)
   def fix_missing_period(line):
     """Adds a period to a line that is missing a period."""
-    if '@highlight' in line: return line
-    if not line: return line
-    if line[-1] in END_TOKENS: return line
+    if '@highlight' in line:
+      return line
+    if not line:
+      return line
+    if line[-1] in END_TOKENS:
+      return line
     return line + ' .'
+
   lines = [fix_missing_period(line) for line in lines]
 
   # Separate out article and abstract sentences
@@ -247,10 +260,12 @@ def _info(self):
         builder=self,
         description=_DESCRIPTION,
         features=tfds.features.FeaturesDict({
-            _ARTICLE: tfds.features.Text(
-                encoder_config=self.builder_config.text_encoder_config),
-            _HIGHLIGHTS: tfds.features.Text(
-                encoder_config=self.builder_config.text_encoder_config),
+            _ARTICLE:
+                tfds.features.Text(
+                    encoder_config=self.builder_config.text_encoder_config),
+            _HIGHLIGHTS:
+                tfds.features.Text(
+                    encoder_config=self.builder_config.text_encoder_config),
         }),
         supervised_keys=(_ARTICLE, _HIGHLIGHTS),
         homepage='https://github.com/abisee/cnn-dailymail',
@@ -278,17 +293,16 @@ def _split_generators(self, dl_manager):
             name=tfds.Split.TRAIN,
             num_shards=100,
             gen_kwargs={'files': train_files}),
-
         tfds.core.SplitGenerator(
             name=tfds.Split.VALIDATION,
             num_shards=10,
-            gen_kwargs={'files': _subset_filenames(dl_paths,
-                                                   tfds.Split.VALIDATION)}),
+            gen_kwargs={
+                'files': _subset_filenames(dl_paths, tfds.Split.VALIDATION)
+            }),
         tfds.core.SplitGenerator(
             name=tfds.Split.TEST,
             num_shards=10,
-            gen_kwargs={'files': _subset_filenames(dl_paths,
-                                                   tfds.Split.TEST)})
+            gen_kwargs={'files': _subset_filenames(dl_paths, tfds.Split.TEST)})
     ]
 
   def _generate_examples(self, files):
@@ -297,7 +311,4 @@ def _generate_examples(self, files):
       if not article or not highlights:
         continue
       fname = os.path.basename(p)
-      yield fname, {
-          _ARTICLE: article,
-          _HIGHLIGHTS: highlights
-      }
+      yield fname, {_ARTICLE: article, _HIGHLIGHTS: highlights}
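
The version note in the diff above keeps highlights newline-separated because summary-level ROUGE is computed over sentence splits. A minimal sketch of that evaluation step (not part of this commit or of TFDS), assuming the separate rouge-score pip package as an example scorer; its rougeLsum metric splits reference and prediction on '\n':

from rouge_score import rouge_scorer

# One highlight sentence per line, matching the 2.0.0+ target format.
reference = 'Highlight one .\nHighlight two .'
prediction = 'Highlight one .\nSomething else .'

scorer = rouge_scorer.RougeScorer(['rougeLsum'], use_stemmer=True)
print(scorer.score(reference, prediction)['rougeLsum'])  # summary-level ROUGE-L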

tensorflow_datasets/summarization/cnn_dailymail_test.py

Lines changed: 5 additions & 5 deletions
@@ -56,18 +56,18 @@ def test_get_art_abs(self):
     with tempfile.NamedTemporaryFile(delete=True) as f:
       f.write(_STORY_FILE)
       f.flush()
-      article, abstract = cnn_dailymail._get_art_abs(
-          f.name, tfds.core.Version('1.0.0'))
-      self.assertEqual('some article. this is some article text.', article)
+      article, abstract = cnn_dailymail._get_art_abs(f.name,
+                                                     tfds.core.Version('1.0.0'))
+      self.assertEqual('Some article. This is some article text.', article)
       # This is a bit weird, but the original code at
       # https://github.com/abisee/cnn-dailymail/ adds space before period
       # for abstracts and we retain this behavior.
-      self.assertEqual('highlight text . highlight two . highlight three .',
+      self.assertEqual('highlight text . Highlight two . highlight Three .',
                        abstract)
 
       article, abstract = cnn_dailymail._get_art_abs(f.name,
                                                      tfds.core.Version('2.0.0'))
-      self.assertEqual('highlight text .\nhighlight two .\nhighlight three .',
+      self.assertEqual('highlight text .\nHighlight two .\nhighlight Three .',
                        abstract)
 
 