Skip to content

Commit 43176bb

Browse files
craffel authored and copybara-github committed
Add idx feature to all GLUE tasks.
PiperOrigin-RevId: 249291066
1 parent 7c9219d commit 43176bb

File tree

1 file changed

+16
-12
lines changed

1 file changed

+16
-12
lines changed

tensorflow_datasets/text/glue.py

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ class Glue(tfds.core.GeneratorBasedBuilder):
9696
BUILDER_CONFIGS = [
9797
GlueConfig(
9898
name="cola",
99-
version="0.0.1",
99+
version="0.0.2",
100100
description="""\
101101
The Corpus of Linguistic Acceptability consists of English
102102
acceptability judgments drawn from books and journal articles on
@@ -117,7 +117,7 @@ class Glue(tfds.core.GeneratorBasedBuilder):
117117
url="https://nyu-mll.github.io/CoLA/"),
118118
GlueConfig(
119119
name="sst2",
120-
version="0.0.1",
120+
version="0.0.2",
121121
description="""\
122122
The Stanford Sentiment Treebank consists of sentences from movie reviews and
123123
human annotations of their sentiment. The task is to predict the sentiment of a
@@ -139,7 +139,7 @@ class Glue(tfds.core.GeneratorBasedBuilder):
139139
url="https://nlp.stanford.edu/sentiment/index.html"),
140140
GlueConfig(
141141
name="mrpc",
142-
version="0.0.1",
142+
version="0.0.2",
143143
description="""\
144144
The Microsoft Research Paraphrase Corpus (Dolan & Brockett, 2005) is a corpus of
145145
sentence pairs automatically extracted from online news sources, with human annotations
@@ -163,7 +163,7 @@ class Glue(tfds.core.GeneratorBasedBuilder):
163163
),
164164
GlueConfig(
165165
name="qqp",
166-
version="0.0.1",
166+
version="0.0.2",
167167
description="""\
168168
The Quora Question Pairs2 dataset is a collection of question pairs from the
169169
community question-answering website Quora. The task is to determine whether a
@@ -188,7 +188,7 @@ class Glue(tfds.core.GeneratorBasedBuilder):
188188
),
189189
GlueConfig(
190190
name="stsb",
191-
version="0.0.1",
191+
version="0.0.2",
192192
description="""\
193193
The Semantic Textual Similarity Benchmark (Cer et al., 2017) is a collection of
194194
sentence pairs drawn from news headlines, video and image captions, and natural
@@ -212,7 +212,7 @@ class Glue(tfds.core.GeneratorBasedBuilder):
212212
process_label=np.float32),
213213
GlueConfig(
214214
name="mnli",
215-
version="0.0.1",
215+
version="0.0.2",
216216
description="""\
217217
The Multi-Genre Natural Language Inference Corpus is a crowdsourced
218218
collection of sentence pairs with textual entailment annotations. Given a premise sentence
@@ -258,7 +258,7 @@ class Glue(tfds.core.GeneratorBasedBuilder):
258258
train_shards=2),
259259
GlueConfig(
260260
name="qnli",
261-
version="0.0.1",
261+
version="0.0.2",
262262
description="""\
263263
The Stanford Question Answering Dataset is a question-answering
264264
dataset consisting of question-paragraph pairs, where one of the sentences in the paragraph (drawn
@@ -287,7 +287,7 @@ class Glue(tfds.core.GeneratorBasedBuilder):
287287
url="https://rajpurkar.github.io/SQuAD-explorer/"),
288288
GlueConfig(
289289
name="rte",
290-
version="0.0.1",
290+
version="0.0.2",
291291
description="""\
292292
The Recognizing Textual Entailment (RTE) datasets come from a series of annual textual
293293
entailment challenges. We combine the data from RTE1 (Dagan et al., 2006), RTE2 (Bar Haim
@@ -339,7 +339,7 @@ class Glue(tfds.core.GeneratorBasedBuilder):
339339
),
340340
GlueConfig(
341341
name="wnli",
342-
version="0.0.1",
342+
version="0.0.2",
343343
description="""\
344344
The Winograd Schema Challenge (Levesque et al., 2011) is a reading comprehension task
345345
in which a system must read a sentence with a pronoun and select the referent of that pronoun from
@@ -385,6 +385,7 @@ def _info(self):
385385
names=self.builder_config.label_classes)
386386
else:
387387
features["label"] = tf.float32
388+
features["idx"] = tf.int32
388389
return tfds.core.DatasetInfo(
389390
builder=self,
390391
description=self.builder_config.description,
@@ -493,7 +494,7 @@ def _generate_examples(self, data_file, split, mrpc_files):
493494
if is_cola_non_test:
494495
reader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
495496

496-
for row in reader:
497+
for n, row in enumerate(reader):
497498
if is_cola_non_test:
498499
row = {
499500
"sentence": row[3],
@@ -504,6 +505,7 @@ def _generate_examples(self, data_file, split, mrpc_files):
504505
feat: row[col]
505506
for feat, col in six.iteritems(self.builder_config.text_features)
506507
}
508+
example["idx"] = n
507509

508510
if self.builder_config.label_column in row:
509511
label = row[self.builder_config.label_column]
@@ -526,11 +528,12 @@ def _generate_example_mrpc_files(self, mrpc_files, split):
526528
if split == "test":
527529
with tf.io.gfile.GFile(mrpc_files["test"]) as f:
528530
reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
529-
for row in reader:
531+
for n, row in enumerate(reader):
530532
yield {
531533
"sentence1": row["#1 String"],
532534
"sentence2": row["#2 String"],
533535
"label": -1,
536+
"idx": n,
534537
}
535538
else:
536539
with tf.io.gfile.GFile(mrpc_files["dev_ids"]) as f:
@@ -541,11 +544,12 @@ def _generate_example_mrpc_files(self, mrpc_files, split):
541544
# the Quality key.
542545
f.seek(3)
543546
reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
544-
for row in reader:
547+
for n, row in enumerate(reader):
545548
is_row_in_dev = [row["#1 ID"], row["#2 ID"]] in dev_ids
546549
if is_row_in_dev == (split == "dev"):
547550
yield {
548551
"sentence1": row["#1 String"],
549552
"sentence2": row["#2 String"],
550553
"label": int(row["Quality"]),
554+
"idx": n,
551555
}

0 commit comments

Comments (0)