import tensorflow.compat.v2 as tf
import tensorflow_datasets.public_api as tfds

-
_DESCRIPTION = """\
CNN/DailyMail non-anonymized summarization dataset.

_DL_URLS = {
    # pylint: disable=line-too-long
-     'cnn_stories': 'https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfTHk4NFg2SndKcjQ',
-     'dm_stories': 'https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfM1BxdkxVaTY2bWs',
-     'test_urls': 'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_test.txt',
-     'train_urls': 'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_train.txt',
-     'val_urls': 'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_val.txt',
+     'cnn_stories':
+         'https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfTHk4NFg2SndKcjQ',
+     'dm_stories':
+         'https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfM1BxdkxVaTY2bWs',
+     'test_urls':
+         'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_test.txt',
+     'train_urls':
+         'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_train.txt',
+     'val_urls':
+         'https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_val.txt',
    # pylint: enable=line-too-long
}
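
For orientation, a minimal sketch (not part of this commit) of how the builder consumes this mapping; `dl_manager` is the tfds download manager passed to `_split_generators`, and the helper name below is hypothetical:

def _resolve_dl_paths_sketch(dl_manager):
  # download_and_extract mirrors its input structure: a dict of URLs in,
  # a dict of local (extracted) paths out, under the same keys.
  dl_paths = dl_manager.download_and_extract(_DL_URLS)
  # e.g. dl_paths['cnn_stories'] -> extracted stories directory,
  #      dl_paths['val_urls']    -> local copy of the validation url list.
  return dl_paths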

    tfds.core.Version('0.0.2', experiments={tfds.core.Experiment.S3: False}),
    # Same data as 0.0.2
    tfds.core.Version('1.0.0',
-                       'New split API (https://tensorflow.org/datasets/splits)')]
+                       'New split API (https://tensorflow.org/datasets/splits)'),
+     # Having the model predict newline separators makes it easier to evaluate
+     # using summary-level ROUGE.
+     tfds.core.Version('2.0.0', 'Separate target sentences with newline.')
+ ]

- # Having the model predict newline separators makes it easier to evaluate
- # using summary-level ROUGE.
- _DEFAULT_VERSION = tfds.core.Version('2.0.0',
-                                      'Separate target sentences with newline.')
+ # Using cased version.
+ _DEFAULT_VERSION = tfds.core.Version('3.0.0', 'Using cased version.')

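With 3.0.0 as the new default and the older entries kept in the supported list, callers can still pin the earlier newline-separated variant. A brief usage sketch, assuming the requested version has been prepared locally:

import tensorflow_datasets as tfds

ds_cased = tfds.load('cnn_dailymail')          # resolves to the 3.0.0 default
ds_newline = tfds.load('cnn_dailymail:2.0.0')  # pins the previous version
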
class CnnDailymailConfig(tfds.core.BuilderConfig):
@@ -109,6 +115,7 @@ def __init__(self, text_encoder_config=None, **kwargs):
def _get_url_hashes(path):
  """Get hashes of urls in file."""
  urls = _read_text_file(path)
+
  def url_hash(u):
    h = hashlib.sha1()
    try:
@@ -117,6 +124,7 @@ def url_hash(u):
      logging.error('Cannot hash url: %s', u)
    h.update(u)
    return h.hexdigest()
+
  return {url_hash(u): True for u in urls}
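
These hashes matter because the story archives name each file after the SHA1 hex digest of its source URL, so deciding which split a story belongs to reduces to a set lookup. A small illustration (the URL is made up):

import hashlib

url = b'https://www.cnn.com/some/article'  # hypothetical URL
story_name = hashlib.sha1(url).hexdigest() + '.story'
# A story file with this basename belongs to whichever split's url list
# contains the URL above.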
@@ -158,8 +166,10 @@ def _subset_filenames(dl_paths, split):
DM_SINGLE_CLOSE_QUOTE = u'\u2019'  # unicode
DM_DOUBLE_CLOSE_QUOTE = u'\u201d'
# acceptable ways to end a sentence
- END_TOKENS = ['.', '!', '?', '...', "'", '`', '"',
-               DM_SINGLE_CLOSE_QUOTE, DM_DOUBLE_CLOSE_QUOTE, ')']
+ END_TOKENS = [
+     '.', '!', '?', '...', "'", '`', '"', DM_SINGLE_CLOSE_QUOTE,
+     DM_DOUBLE_CLOSE_QUOTE, ')'
+ ]

def _read_text_file(text_file):
@@ -177,19 +187,22 @@ def _get_art_abs(story_file, tfds_version):

  lines = _read_text_file(story_file)

-   # Lowercase everything
-   lines = [line.lower() for line in lines]
+   # The GitHub code lowercased the text; we removed that in 3.0.0.

  # Put periods on the ends of lines that are missing them
  # (this is a problem in the dataset because many image captions don't end in
  # periods; consequently they end up in the body of the article as run-on
  # sentences)
  def fix_missing_period(line):
    """Adds a period to a line that is missing a period."""
-     if '@highlight' in line: return line
-     if not line: return line
-     if line[-1] in END_TOKENS: return line
+     if '@highlight' in line:
+       return line
+     if not line:
+       return line
+     if line[-1] in END_TOKENS:
+       return line
    return line + ' .'
+
  lines = [fix_missing_period(line) for line in lines]

  # Separate out article and abstract sentences
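
To make the helper's behavior concrete, here is a standalone rendition for illustration only (the real helper is nested inside _get_art_abs and uses the module-level END_TOKENS; the demo names are hypothetical):

_END_TOKENS_DEMO = ['.', '!', '?', '...', "'", '`', '"', u'\u2019', u'\u201d', ')']

def _fix_missing_period_demo(line):
  if '@highlight' in line:
    return line
  if not line:
    return line
  if line[-1] in _END_TOKENS_DEMO:
    return line
  return line + ' .'

assert _fix_missing_period_demo('A dog rests on the sand') == 'A dog rests on the sand .'
assert _fix_missing_period_demo('It ended well.') == 'It ended well.'
assert _fix_missing_period_demo('@highlight') == '@highlight'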
@@ -247,10 +260,12 @@ def _info(self):
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
-             _ARTICLE: tfds.features.Text(
-                 encoder_config=self.builder_config.text_encoder_config),
-             _HIGHLIGHTS: tfds.features.Text(
-                 encoder_config=self.builder_config.text_encoder_config),
+             _ARTICLE:
+                 tfds.features.Text(
+                     encoder_config=self.builder_config.text_encoder_config),
+             _HIGHLIGHTS:
+                 tfds.features.Text(
+                     encoder_config=self.builder_config.text_encoder_config),
        }),
        supervised_keys=(_ARTICLE, _HIGHLIGHTS),
        homepage='https://github.com/abisee/cnn-dailymail',
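
Because supervised_keys is declared, as_supervised loading yields (article, highlights) tuples directly. A usage sketch, assuming the dataset has already been downloaded and prepared:

import tensorflow_datasets as tfds

ds = tfds.load('cnn_dailymail', split='test', as_supervised=True)
for article, highlights in ds.take(1):
  print(article.numpy()[:80])
  print(highlights.numpy()[:80])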
@@ -278,17 +293,16 @@ def _split_generators(self, dl_manager):
            name=tfds.Split.TRAIN,
            num_shards=100,
            gen_kwargs={'files': train_files}),
-
        tfds.core.SplitGenerator(
            name=tfds.Split.VALIDATION,
            num_shards=10,
-             gen_kwargs={'files': _subset_filenames(dl_paths,
-                                                    tfds.Split.VALIDATION)}),
+             gen_kwargs={
+                 'files': _subset_filenames(dl_paths, tfds.Split.VALIDATION)
+             }),
        tfds.core.SplitGenerator(
            name=tfds.Split.TEST,
            num_shards=10,
-             gen_kwargs={'files': _subset_filenames(dl_paths,
-                                                    tfds.Split.TEST)})
+             gen_kwargs={'files': _subset_filenames(dl_paths, tfds.Split.TEST)})
    ]

  def _generate_examples(self, files):
@@ -297,7 +311,4 @@ def _generate_examples(self, files):
      if not article or not highlights:
        continue
      fname = os.path.basename(p)
-       yield fname, {
-           _ARTICLE: article,
-           _HIGHLIGHTS: highlights
-       }
+       yield fname, {_ARTICLE: article, _HIGHLIGHTS: highlights}
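
Each yielded record pairs a stable key (the story filename) with the two features declared in _info; tfds uses the key for deterministic shuffling. The values below are illustrative only:

example_key = '0a1b2c3d.story'  # sha1-based story filename (made up)
example_value = {
    'article': 'full article text ...',
    'highlights': 'summary sentence 1\nsummary sentence 2',  # newline-separated since 2.0.0
}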