73
73
74
74
_HIGHLIGHTS = 'highlights'
75
75
_ARTICLE = 'article'
76
+ _SUPPORTED_VERSIONS = [
77
+ tfds .core .Version ('0.0.2' , experiments = {tfds .core .Experiment .S3 : False }),
78
+ # 1.0.0: same data as 0.0.2, exposed through the new split API.
79
+ tfds .core .Version ('1.0.0' ,
80
+ 'New split API (https://tensorflow.org/datasets/splits)' )]
81
+
82
+ # 2.0.0 separates target sentences with newlines; having the model predict
83
+ # newline separators makes it easier to evaluate using summary-level ROUGE.
84
+ _DEFAULT_VERSION = tfds .core .Version ('2.0.0' ,
85
+ 'Separate target sentences with newline.' )
76
86
77
87
78
88
class CnnDailymailConfig (tfds .core .BuilderConfig ):
@@ -92,13 +102,8 @@ def __init__(self, text_encoder_config=None, **kwargs):
92
102
# 2.0.0: Separate target sentences with newline.
# 1.0.0: S3 (new shuffling, sharding and slicing mechanism).
93
103
# 0.0.2: Initial version.
94
104
super (CnnDailymailConfig , self ).__init__ (
95
- version = tfds .core .Version (
96
- '0.0.2' , experiments = {tfds .core .Experiment .S3 : False }),
97
- supported_versions = [
98
- tfds .core .Version (
99
- '1.0.0' ,
100
- 'New split API (https://tensorflow.org/datasets/splits)' ),
101
- ],
105
+ version = _DEFAULT_VERSION ,
106
+ supported_versions = _SUPPORTED_VERSIONS ,
102
107
** kwargs )
103
108
self .text_encoder_config = (
104
109
text_encoder_config or tfds .features .text .TextEncoderConfig ())
@@ -168,7 +173,7 @@ def _read_text_file(text_file):
168
173
return lines
169
174
170
175
171
- def _get_art_abs (story_file ):
176
+ def _get_art_abs (story_file , tfds_version ):
172
177
"""Get abstract (highlights) and article from a story file path."""
173
178
# Based on https://github.com/abisee/cnn-dailymail/blob/master/
174
179
# make_datafiles.py
@@ -207,16 +212,16 @@ def fix_missing_period(line):
207
212
# Make article into a single string
208
213
article = ' ' .join (article_lines )
209
214
210
- # Make abstract into a single string, putting <s> and </s> tags around
211
- # the sentences.
212
- abstract = ' ' .join (highlights )
215
+ if tfds_version >= '2.0.0' :
216
+ abstract = '\n ' .join (highlights )
217
+ else :
218
+ abstract = ' ' .join (highlights )
213
219
214
220
return article , abstract
215
221
216
222
217
223
class CnnDailymail (tfds .core .GeneratorBasedBuilder ):
218
224
"""CNN/DailyMail non-anonymized summarization dataset."""
219
- # 0.0.2 is like 0.0.1 but without special tokens <s> and </s>.
220
225
BUILDER_CONFIGS = [
221
226
CnnDailymailConfig (
222
227
name = 'plain_text' ,
@@ -291,7 +296,7 @@ def _split_generators(self, dl_manager):
291
296
292
297
def _generate_examples (self , files ):
293
298
for p in files :
294
- article , highlights = _get_art_abs (p )
299
+ article , highlights = _get_art_abs (p , self . version )
295
300
if not article or not highlights :
296
301
continue
297
302
fname = os .path .basename (p )
0 commit comments