@@ -69,7 +69,8 @@ class Gigaword(tfds.core.GeneratorBasedBuilder):
69
69
70
70
# 1.0.0 contains a bug that uses validation data as training data.
71
71
# 1.1.0 Update to the correct train, validation and test data.
72
- VERSION = tfds .core .Version ("1.1.0" )
72
+ # 1.2.0 Replace <unk> with UNK in train/val to be consistent with test.
73
+ VERSION = tfds .core .Version ("1.2.0" )
73
74
74
75
def _info (self ):
75
76
return tfds .core .DatasetInfo (
@@ -93,27 +94,36 @@ def _split_generators(self, dl_manager):
93
94
name = tfds .Split .TRAIN ,
94
95
gen_kwargs = {
95
96
"src_path" : pattern % ("train" , "src" ),
96
- "tgt_path" : pattern % ("train" , "tgt" )
97
+ "tgt_path" : pattern % ("train" , "tgt" ),
98
+ "replace_unk" : True ,
97
99
},
98
100
),
99
101
tfds .core .SplitGenerator (
100
102
name = tfds .Split .VALIDATION ,
101
103
gen_kwargs = {
102
104
"src_path" : pattern % ("dev" , "src" ),
103
- "tgt_path" : pattern % ("dev" , "tgt" )
105
+ "tgt_path" : pattern % ("dev" , "tgt" ),
106
+ "replace_unk" : True ,
104
107
},
105
108
),
106
109
tfds .core .SplitGenerator (
107
110
name = tfds .Split .TEST ,
108
111
gen_kwargs = {
109
112
"src_path" : pattern % ("test" , "src" ),
110
- "tgt_path" : pattern % ("test" , "tgt" )
113
+ "tgt_path" : pattern % ("test" , "tgt" ),
114
+ "replace_unk" : False ,
111
115
},
112
116
),
113
117
]
114
118
115
def _generate_examples(self, src_path=None, tgt_path=None, replace_unk=None):
    """Yield (index, example) pairs read from two line-aligned text files.

    Args:
      src_path: Path of the file providing one document per line.
      tgt_path: Path of the file providing one summary per line.
      replace_unk: When truthy, every "<unk>" occurrence in both fields is
        rewritten to "UNK"; otherwise the stripped text is emitted as-is.

    Yields:
      Tuples of the zero-based line index and a dict holding the stripped
      document line under _DOCUMENT and the stripped summary line under
      _SUMMARY.
    """

    def _clean(line):
        # Strip surrounding whitespace first, then optionally normalise the
        # unknown-word token.
        text = line.strip()
        return text.replace("<unk>", "UNK") if replace_unk else text

    with tf.io.gfile.GFile(src_path) as doc_file:
        with tf.io.gfile.GFile(tgt_path) as sum_file:
            # zip() pairs lines positionally and stops at the shorter file.
            paired_lines = zip(doc_file, sum_file)
            for idx, (document, summary) in enumerate(paired_lines):
                yield idx, {
                    _DOCUMENT: _clean(document),
                    _SUMMARY: _clean(summary),
                }
0 commit comments