Add e-SNLI dataset to TFDS.

sharannarang · copybara-github · commit 00927c8eaa7f · 2019-11-15T13:56:24.000-08:00
PiperOrigin-RevId: 280722926
diff --git a/docs/release_notes.md b/docs/release_notes.md
@@ -15,3 +15,5 @@
     from the [MimickNet paper](https://arxiv.org/abs/1908.05782)
 *   Add Dmlab dataset from the
     [VTAB benchmark](https://arxiv.org/abs/1910.04867).
+*   Add e-SNLI dataset from the paper
+    [e-SNLI](http://papers.nips.cc/paper/8163-e-snli-natural-language-inference-with-natural-language-explanations.pdf).
diff --git a/tensorflow_datasets/testing/test_data/fake_examples/esnli/esnli_dev.csv b/tensorflow_datasets/testing/test_data/fake_examples/esnli/esnli_dev.csv
@@ -0,0 +1,2 @@
+pairID,gold_label,Sentence1,Sentence2,Explanation_1,WorkerId,Sentence1_marked_1,Sentence2_marked_1,Sentence1_Highlighted_1,Sentence2_Highlighted_1
+1d,entailment,A woman smiles at the child.,A woman is present.,A woman must be present to smile.,xyz,A woman *smiles* at the child.,A woman is *present*,{},"1,2,3"
diff --git a/tensorflow_datasets/testing/test_data/fake_examples/esnli/esnli_test.csv b/tensorflow_datasets/testing/test_data/fake_examples/esnli/esnli_test.csv
@@ -0,0 +1,2 @@
+pairID,gold_label,Sentence1,Sentence2,Explanation_1,WorkerId,Sentence1_marked_1,Sentence2_marked_1,Sentence1_Highlighted_1,Sentence2_Highlighted_1
+1t,neutral,A woman looks at the child.,A woman is smiling.,Smiling and looking at the child are independent.,xyz,A woman *smiles* at the child.,A woman is *present*,{},"1,2,3"
diff --git a/tensorflow_datasets/testing/test_data/fake_examples/esnli/esnli_train_1.csv b/tensorflow_datasets/testing/test_data/fake_examples/esnli/esnli_train_1.csv
@@ -0,0 +1,2 @@
+pairID,gold_label,Sentence1,Sentence2,Explanation_1,WorkerId,Sentence1_marked_1,Sentence2_marked_1,Sentence1_Highlighted_1,Sentence2_Highlighted_1
+1,neutral,A man runs down a mountain.,A man runs really fast,Running fast does not imply running down a mountain,abc,A man runs down a mountain.,A man runs *really fast*.,{},"1,2,3"
diff --git a/tensorflow_datasets/testing/test_data/fake_examples/esnli/esnli_train_2.csv b/tensorflow_datasets/testing/test_data/fake_examples/esnli/esnli_train_2.csv
@@ -0,0 +1,2 @@
+pairID,gold_label,Sentence1,Sentence2,Explanation_1,WorkerId,Sentence1_marked_1,Sentence2_marked_1,Sentence1_Highlighted_1,Sentence2_Highlighted_1
+2,contradiction,A woman drinks a coffee.,A woman drinks a beer.,A woman cannot drink both a coffee and a beer,xyz,A woman drinks a *coffee*,A woman drinks a *beer*,{},"1,2,3"
diff --git a/tensorflow_datasets/text/__init__.py b/tensorflow_datasets/text/__init__.py
@@ -17,6 +17,7 @@
 
 from tensorflow_datasets.text.c4 import C4
 from tensorflow_datasets.text.definite_pronoun_resolution import DefinitePronounResolution
+from tensorflow_datasets.text.esnli import Esnli
 from tensorflow_datasets.text.gap import Gap
 from tensorflow_datasets.text.glue import Glue
 from tensorflow_datasets.text.imdb import IMDBReviews
diff --git a/tensorflow_datasets/text/esnli.py b/tensorflow_datasets/text/esnli.py
@@ -0,0 +1,116 @@
+# coding=utf-8
+# Copyright 2019 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""e-SNLI: Natural Language Inference with Natural Language Explanations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import csv
+import os
+
+import tensorflow as tf
+import tensorflow_datasets.public_api as tfds
+
+_CITATION = """
+@incollection{NIPS2018_8163,
+title = {e-SNLI: Natural Language Inference with Natural Language Explanations},
+author = {Camburu, Oana-Maria and Rockt\"{a}schel, Tim and Lukasiewicz, Thomas and Blunsom, Phil},
+booktitle = {Advances in Neural Information Processing Systems 31},
+editor = {S. Bengio and H. Wallach and H. Larochelle and K. Grauman and N. Cesa-Bianchi and R. Garnett},
+pages = {9539--9549},
+year = {2018},
+publisher = {Curran Associates, Inc.},
+url = {http://papers.nips.cc/paper/8163-e-snli-natural-language-inference-with-natural-language-explanations.pdf}
+}
+"""  # BibTeX entry, passed as `citation` to DatasetInfo in `Esnli._info`.
+
+_DESCRIPTION = """
+The e-SNLI dataset extends the Stanford Natural Language Inference Dataset to
+include human-annotated natural language explanations of the entailment
+relations.
+"""  # Passed as `description` to DatasetInfo in `Esnli._info`.
+_URL = 'https://raw.githubusercontent.com/OanaMariaCamburu/e-SNLI/master/dataset/'
+
+
+class Esnli(tfds.core.GeneratorBasedBuilder):
+  """e-SNLI: Natural Language Inference with Natural Language Explanations corpus."""
+
+  BUILDER_CONFIGS = [
+      tfds.core.BuilderConfig(
+          name='plain_text',
+          version=tfds.core.Version('0.0.1'),
+          description='Plain text import of e-SNLI',
+      )
+  ]
+
+  def _info(self):
+    """Returns the DatasetInfo describing the four exported features."""
+    return tfds.core.DatasetInfo(
+        builder=self,
+        description=_DESCRIPTION,
+        features=tfds.features.FeaturesDict({
+            'premise': tfds.features.Text(),
+            'hypothesis': tfds.features.Text(),
+            'label': tfds.features.ClassLabel(
+                names=['entailment', 'neutral', 'contradiction']),
+            'explanation': tfds.features.Text(),
+        }),
+        supervised_keys=None,
+        homepage='https://github.com/OanaMariaCamburu/e-SNLI',
+        citation=_CITATION,
+    )
+
+  def _split_generators(self, dl_manager):
+    """Returns SplitGenerators for the train, validation and test splits.
+
+    Args:
+      dl_manager: object whose `download_and_extract` fetches the CSV files.
+
+    Returns:
+      A list of three `tfds.core.SplitGenerator`s.
+    """
+    files = dl_manager.download_and_extract({
+        'train': [os.path.join(_URL, 'esnli_train_1.csv'),
+                  os.path.join(_URL, 'esnli_train_2.csv')],
+        'validation': [os.path.join(_URL, 'esnli_dev.csv')],
+        'test': [os.path.join(_URL, 'esnli_test.csv')]
+    })
+    split_names = (
+        (tfds.Split.TRAIN, 'train'),
+        (tfds.Split.VALIDATION, 'validation'),
+        (tfds.Split.TEST, 'test'),
+    )
+    return [
+        tfds.core.SplitGenerator(name=split, gen_kwargs={'files': files[key]})
+        for split, key in split_names
+    ]
+
+  def _generate_examples(self, files):
+    """Yields one `(key, example)` pair per CSV row, keyed by `pairID`.
+
+    Args:
+      files: paths of the CSV files making up the split.
+    """
+    for filepath in files:
+      with tf.io.gfile.GFile(filepath) as f:
+        for row in csv.DictReader(f):
+          yield row['pairID'], {
+              'premise': row['Sentence1'],
+              'hypothesis': row['Sentence2'],
+              'label': row['gold_label'],
+              'explanation': row['Explanation_1'],
+          }
diff --git a/tensorflow_datasets/text/esnli_test.py b/tensorflow_datasets/text/esnli_test.py
@@ -0,0 +1,42 @@
+# coding=utf-8
+# Copyright 2019 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Test for e-SNLI dataset module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow_datasets import testing
+from tensorflow_datasets.text import esnli
+
+
+class EsnliTest(testing.DatasetBuilderTestCase):
+  """Dataset builder test for e-SNLI, driven by the fake CSV examples."""
+
+  DATASET_CLASS = esnli.Esnli
+  # Number of fake examples expected in each split.
+  SPLITS = {"train": 2, "test": 1, "validation": 1}
+
+  # Fake files, keyed like the dict `Esnli` gives to `download_and_extract`.
+  DL_EXTRACT_RESULT = {
+      "train": ["esnli_train_1.csv", "esnli_train_2.csv"],
+      "test": ["esnli_test.csv"],
+      "validation": ["esnli_dev.csv"],
+  }
+
+
+if __name__ == "__main__":
+  testing.test_main()  # Runs the tests when this file is executed directly.
diff --git a/tensorflow_datasets/url_checksums/esnli.txt b/tensorflow_datasets/url_checksums/esnli.txt
@@ -0,0 +1,4 @@
+https://raw.githubusercontent.com/OanaMariaCamburu/e-SNLI/master/dataset/esnli_dev.csv 7501310 d7a656d196eea18fd827d6f5486b00f5d4cf469dbf13a252cc211495a186bcb4
+https://raw.githubusercontent.com/OanaMariaCamburu/e-SNLI/master/dataset/esnli_test.csv 7438107 9269aa8075dbfed0fa8a5012eb2ac9c18a92be139671f05df897f69092592927
+https://raw.githubusercontent.com/OanaMariaCamburu/e-SNLI/master/dataset/esnli_train_1.csv 90169741 7311c7bc16ad9f6a9adcd116a62ef991e1803dc0d71e253ce66b975c2aba8ee5
+https://raw.githubusercontent.com/OanaMariaCamburu/e-SNLI/master/dataset/esnli_train_2.csv 99406852 c8827fa2ba1ef5891f077d6fafbfc2fdb6cab1271968b74257e08d3b2b3cbacc

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+pairID,gold_label,Sentence1,Sentence2,Explanation_1,WorkerId,Sentence1_marked_1,Sentence2_marked_1,Sentence1_Highlighted_1,Sentence2_Highlighted_1`
	`2`	`+1d,entailment,A woman smiles at the child.,A woman is present.,A woman must be present to smile.,xyz,A woman smiles at the child.,A woman is present,{},"1,2,3"`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+pairID,gold_label,Sentence1,Sentence2,Explanation_1,WorkerId,Sentence1_marked_1,Sentence2_marked_1,Sentence1_Highlighted_1,Sentence2_Highlighted_1`
	`2`	`+1t,neutral,A woman looks at the child.,A woman is smiling.,Smiling and looking at the child are independent.,xyz,A woman smiles at the child.,A woman is present,{},"1,2,3"`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+pairID,gold_label,Sentence1,Sentence2,Explanation_1,WorkerId,Sentence1_marked_1,Sentence2_marked_1,Sentence1_Highlighted_1,Sentence2_Highlighted_1`
	`2`	`+1,neutral,A man runs down a mountain.,A man runs really fast,Running fast does not imply running down a mountain,abc,A man runs down a mountain.,A man runs really fast.,{},"1,2,3"`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+pairID,gold_label,Sentence1,Sentence2,Explanation_1,WorkerId,Sentence1_marked_1,Sentence2_marked_1,Sentence1_Highlighted_1,Sentence2_Highlighted_1`
	`2`	`+2,contradiction,A woman drinks a coffee.,A woman drinks a beer.,A woman cannot drink both a coffee and a beer,xyz,A woman drinks a coffee,A woman drinks a beer,{},"1,2,3"`