Skip to content

Commit 43176bb

Browse files
craffel authored and copybara-github committed
Add idx feature to all GLUE tasks.
PiperOrigin-RevId: 249291066
1 parent 7c9219d commit 43176bb

File tree

1 file changed

+16
-12
lines changed

1 file changed

+16
-12
lines changed

tensorflow_datasets/text/glue.py

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ class Glue(tfds.core.GeneratorBasedBuilder):
9696
BUILDER_CONFIGS = [
9797
GlueConfig(
9898
name="cola",
99-
version="0.0.1",
99+
version="0.0.2",
100100
description="""\
101101
The Corpus of Linguistic Acceptability consists of English
102102
acceptability judgments drawn from books and journal articles on
@@ -117,7 +117,7 @@ class Glue(tfds.core.GeneratorBasedBuilder):
117117
url="https://nyu-mll.github.io/CoLA/"),
118118
GlueConfig(
119119
name="sst2",
120-
version="0.0.1",
120+
version="0.0.2",
121121
description="""\
122122
The Stanford Sentiment Treebank consists of sentences from movie reviews and
123123
human annotations of their sentiment. The task is to predict the sentiment of a
@@ -139,7 +139,7 @@ class Glue(tfds.core.GeneratorBasedBuilder):
139139
url="https://nlp.stanford.edu/sentiment/index.html"),
140140
GlueConfig(
141141
name="mrpc",
142-
version="0.0.1",
142+
version="0.0.2",
143143
description="""\
144144
The Microsoft Research Paraphrase Corpus (Dolan & Brockett, 2005) is a corpus of
145145
sentence pairs automatically extracted from online news sources, with human annotations
@@ -163,7 +163,7 @@ class Glue(tfds.core.GeneratorBasedBuilder):
163163
),
164164
GlueConfig(
165165
name="qqp",
166-
version="0.0.1",
166+
version="0.0.2",
167167
description="""\
168168
The Quora Question Pairs2 dataset is a collection of question pairs from the
169169
community question-answering website Quora. The task is to determine whether a
@@ -188,7 +188,7 @@ class Glue(tfds.core.GeneratorBasedBuilder):
188188
),
189189
GlueConfig(
190190
name="stsb",
191-
version="0.0.1",
191+
version="0.0.2",
192192
description="""\
193193
The Semantic Textual Similarity Benchmark (Cer et al., 2017) is a collection of
194194
sentence pairs drawn from news headlines, video and image captions, and natural
@@ -212,7 +212,7 @@ class Glue(tfds.core.GeneratorBasedBuilder):
212212
process_label=np.float32),
213213
GlueConfig(
214214
name="mnli",
215-
version="0.0.1",
215+
version="0.0.2",
216216
description="""\
217217
The Multi-Genre Natural Language Inference Corpus is a crowdsourced
218218
collection of sentence pairs with textual entailment annotations. Given a premise sentence
@@ -258,7 +258,7 @@ class Glue(tfds.core.GeneratorBasedBuilder):
258258
train_shards=2),
259259
GlueConfig(
260260
name="qnli",
261-
version="0.0.1",
261+
version="0.0.2",
262262
description="""\
263263
The Stanford Question Answering Dataset is a question-answering
264264
dataset consisting of question-paragraph pairs, where one of the sentences in the paragraph (drawn
@@ -287,7 +287,7 @@ class Glue(tfds.core.GeneratorBasedBuilder):
287287
url="https://rajpurkar.github.io/SQuAD-explorer/"),
288288
GlueConfig(
289289
name="rte",
290-
version="0.0.1",
290+
version="0.0.2",
291291
description="""\
292292
The Recognizing Textual Entailment (RTE) datasets come from a series of annual textual
293293
entailment challenges. We combine the data from RTE1 (Dagan et al., 2006), RTE2 (Bar Haim
@@ -339,7 +339,7 @@ class Glue(tfds.core.GeneratorBasedBuilder):
339339
),
340340
GlueConfig(
341341
name="wnli",
342-
version="0.0.1",
342+
version="0.0.2",
343343
description="""\
344344
The Winograd Schema Challenge (Levesque et al., 2011) is a reading comprehension task
345345
in which a system must read a sentence with a pronoun and select the referent of that pronoun from
@@ -385,6 +385,7 @@ def _info(self):
385385
names=self.builder_config.label_classes)
386386
else:
387387
features["label"] = tf.float32
388+
features["idx"] = tf.int32
388389
return tfds.core.DatasetInfo(
389390
builder=self,
390391
description=self.builder_config.description,
@@ -493,7 +494,7 @@ def _generate_examples(self, data_file, split, mrpc_files):
493494
if is_cola_non_test:
494495
reader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
495496

496-
for row in reader:
497+
for n, row in enumerate(reader):
497498
if is_cola_non_test:
498499
row = {
499500
"sentence": row[3],
@@ -504,6 +505,7 @@ def _generate_examples(self, data_file, split, mrpc_files):
504505
feat: row[col]
505506
for feat, col in six.iteritems(self.builder_config.text_features)
506507
}
508+
example["idx"] = n
507509

508510
if self.builder_config.label_column in row:
509511
label = row[self.builder_config.label_column]
@@ -526,11 +528,12 @@ def _generate_example_mrpc_files(self, mrpc_files, split):
526528
if split == "test":
527529
with tf.io.gfile.GFile(mrpc_files["test"]) as f:
528530
reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
529-
for row in reader:
531+
for n, row in enumerate(reader):
530532
yield {
531533
"sentence1": row["#1 String"],
532534
"sentence2": row["#2 String"],
533535
"label": -1,
536+
"idx": n,
534537
}
535538
else:
536539
with tf.io.gfile.GFile(mrpc_files["dev_ids"]) as f:
@@ -541,11 +544,12 @@ def _generate_example_mrpc_files(self, mrpc_files, split):
541544
# the Quality key.
542545
f.seek(3)
543546
reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
544-
for row in reader:
547+
for n, row in enumerate(reader):
545548
is_row_in_dev = [row["#1 ID"], row["#2 ID"]] in dev_ids
546549
if is_row_in_dev == (split == "dev"):
547550
yield {
548551
"sentence1": row["#1 String"],
549552
"sentence2": row["#2 String"],
550553
"label": int(row["Quality"]),
554+
"idx": n,
551555
}

0 commit comments

Comments (0)