@@ -96,7 +96,7 @@ class Glue(tfds.core.GeneratorBasedBuilder):
   BUILDER_CONFIGS = [
       GlueConfig(
           name="cola",
-          version="0.0.1",
+          version="0.0.2",
           description="""\
 The Corpus of Linguistic Acceptability consists of English
 acceptability judgments drawn from books and journal articles on
@@ -117,7 +117,7 @@ class Glue(tfds.core.GeneratorBasedBuilder):
           url="https://nyu-mll.github.io/CoLA/"),
       GlueConfig(
           name="sst2",
-          version="0.0.1",
+          version="0.0.2",
           description="""\
 The Stanford Sentiment Treebank consists of sentences from movie reviews and
 human annotations of their sentiment. The task is to predict the sentiment of a
@@ -139,7 +139,7 @@ class Glue(tfds.core.GeneratorBasedBuilder):
           url="https://nlp.stanford.edu/sentiment/index.html"),
       GlueConfig(
           name="mrpc",
-          version="0.0.1",
+          version="0.0.2",
           description="""\
 The Microsoft Research Paraphrase Corpus (Dolan & Brockett, 2005) is a corpus of
 sentence pairs automatically extracted from online news sources, with human annotations
@@ -163,7 +163,7 @@ class Glue(tfds.core.GeneratorBasedBuilder):
       ),
       GlueConfig(
           name="qqp",
-          version="0.0.1",
+          version="0.0.2",
           description="""\
 The Quora Question Pairs2 dataset is a collection of question pairs from the
 community question-answering website Quora. The task is to determine whether a
@@ -188,7 +188,7 @@ class Glue(tfds.core.GeneratorBasedBuilder):
       ),
       GlueConfig(
           name="stsb",
-          version="0.0.1",
+          version="0.0.2",
           description="""\
 The Semantic Textual Similarity Benchmark (Cer et al., 2017) is a collection of
 sentence pairs drawn from news headlines, video and image captions, and natural
@@ -212,7 +212,7 @@ class Glue(tfds.core.GeneratorBasedBuilder):
           process_label=np.float32),
       GlueConfig(
           name="mnli",
-          version="0.0.1",
+          version="0.0.2",
           description="""\
 The Multi-Genre Natural Language Inference Corpusn is a crowdsourced
 collection of sentence pairs with textual entailment annotations. Given a premise sentence
@@ -258,7 +258,7 @@ class Glue(tfds.core.GeneratorBasedBuilder):
          train_shards=2),
       GlueConfig(
           name="qnli",
-          version="0.0.1",
+          version="0.0.2",
           description="""\
 The Stanford Question Answering Dataset is a question-answering
 dataset consisting of question-paragraph pairs, where one of the sentences in the paragraph (drawn
@@ -287,7 +287,7 @@ class Glue(tfds.core.GeneratorBasedBuilder):
           url="https://rajpurkar.github.io/SQuAD-explorer/"),
       GlueConfig(
           name="rte",
-          version="0.0.1",
+          version="0.0.2",
           description="""\
 The Recognizing Textual Entailment (RTE) datasets come from a series of annual textual
 entailment challenges. We combine the data from RTE1 (Dagan et al., 2006), RTE2 (Bar Haim
@@ -339,7 +339,7 @@ class Glue(tfds.core.GeneratorBasedBuilder):
       ),
       GlueConfig(
           name="wnli",
-          version="0.0.1",
+          version="0.0.2",
           description="""\
 The Winograd Schema Challenge (Levesque et al., 2011) is a reading comprehension task
 in which a system must read a sentence with a pronoun and select the referent of that pronoun from
@@ -385,6 +385,7 @@ def _info(self):
           names=self.builder_config.label_classes)
     else:
       features["label"] = tf.float32
+    features["idx"] = tf.int32
     return tfds.core.DatasetInfo(
         builder=self,
         description=self.builder_config.description,
@@ -493,7 +494,7 @@ def _generate_examples(self, data_file, split, mrpc_files):
       if is_cola_non_test:
         reader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
 
-      for row in reader:
+      for n, row in enumerate(reader):
         if is_cola_non_test:
           row = {
               "sentence": row[3],
@@ -504,6 +505,7 @@ def _generate_examples(self, data_file, split, mrpc_files):
             feat: row[col]
             for feat, col in six.iteritems(self.builder_config.text_features)
         }
+        example["idx"] = n
 
         if self.builder_config.label_column in row:
           label = row[self.builder_config.label_column]
@@ -526,11 +528,12 @@ def _generate_example_mrpc_files(self, mrpc_files, split):
     if split == "test":
       with tf.io.gfile.GFile(mrpc_files["test"]) as f:
         reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
-        for row in reader:
+        for n, row in enumerate(reader):
           yield {
               "sentence1": row["#1 String"],
               "sentence2": row["#2 String"],
               "label": -1,
+              "idx": n,
           }
     else:
       with tf.io.gfile.GFile(mrpc_files["dev_ids"]) as f:
@@ -541,11 +544,12 @@ def _generate_example_mrpc_files(self, mrpc_files, split):
         # the Quality key.
         f.seek(3)
         reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
-        for row in reader:
+        for n, row in enumerate(reader):
          is_row_in_dev = [row["#1 ID"], row["#2 ID"]] in dev_ids
           if is_row_in_dev == (split == "dev"):
             yield {
                 "sentence1": row["#1 String"],
                 "sentence2": row["#2 String"],
                 "label": int(row["Quality"]),
+                "idx": n,
             }
0 commit comments