Skip to content

Commit 00927c8

Browse files
sharannarang authored and
copybara-github committed
Add e-SNLI dataset to TFDS.
PiperOrigin-RevId: 280722926
1 parent 805e861 commit 00927c8

File tree

9 files changed

+173
-0
lines changed

9 files changed

+173
-0
lines changed

docs/release_notes.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,5 @@
1515
from the [MimickNet paper](https://arxiv.org/abs/1908.05782)
1616
* Add Dmlab dataset from the
1717
[VTAB benchmark](https://arxiv.org/abs/1910.04867).
18+
* Add e-SNLI dataset from the paper
19+
[e-SNLI](http://papers.nips.cc/paper/8163-e-snli-natural-language-inference-with-natural-language-explanations.pdf).
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
pairID,gold_label,Sentence1,Sentence2,Explanation_1,WorkerId,Sentence1_marked_1,Sentence2_marked_1,Sentence1_Highlighted_1,Sentence2_Highlighted_1
2+
1d,entailment,A woman smiles at the child.,A woman is present.,A woman must be present to smile.,xyz,A woman *smiles* at the child.,A woman is *present*,{},"1,2,3"
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
pairID,gold_label,Sentence1,Sentence2,Explanation_1,WorkerId,Sentence1_marked_1,Sentence2_marked_1,Sentence1_Highlighted_1,Sentence2_Highlighted_1
2+
1t,neutral,A woman looks at the child.,A woman is smiling.,Smiling and looking at the child are independent.,xyz,A woman *smiles* at the child.,A woman is *present*,{},"1,2,3"
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
pairID,gold_label,Sentence1,Sentence2,Explanation_1,WorkerId,Sentence1_marked_1,Sentence2_marked_1,Sentence1_Highlighted_1,Sentence2_Highlighted_1
2+
1,neutral,A man runs down a mountain.,A man runs really fast,Running fast does not imply running down a mountain,abc,A man runs down a mountain.,A man runs *really fast*.,{},"1,2,3"
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
pairID,gold_label,Sentence1,Sentence2,Explanation_1,WorkerId,Sentence1_marked_1,Sentence2_marked_1,Sentence1_Highlighted_1,Sentence2_Highlighted_1
2+
2,contradiction,A woman drinks a coffee.,A woman drinks a beer.,A woman cannot drink both a coffee and a beer,xyz,A woman drinks a *coffee*,A woman drinks a *beer*,{},"1,2,3"

tensorflow_datasets/text/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
from tensorflow_datasets.text.c4 import C4
1919
from tensorflow_datasets.text.definite_pronoun_resolution import DefinitePronounResolution
20+
from tensorflow_datasets.text.esnli import Esnli
2021
from tensorflow_datasets.text.gap import Gap
2122
from tensorflow_datasets.text.glue import Glue
2223
from tensorflow_datasets.text.imdb import IMDBReviews

tensorflow_datasets/text/esnli.py

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
# coding=utf-8
2+
# Copyright 2019 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""e-SNLI: Natural Language Inference with Natural Language Explanations."""
17+
18+
from __future__ import absolute_import
19+
from __future__ import division
20+
from __future__ import print_function
21+
22+
import csv
23+
import os
24+
25+
import tensorflow as tf
26+
import tensorflow_datasets.public_api as tfds
27+
28+
_CITATION = """
29+
@incollection{NIPS2018_8163,
30+
title = {e-SNLI: Natural Language Inference with Natural Language Explanations},
31+
author = {Camburu, Oana-Maria and Rockt\"{a}schel, Tim and Lukasiewicz, Thomas and Blunsom, Phil},
32+
booktitle = {Advances in Neural Information Processing Systems 31},
33+
editor = {S. Bengio and H. Wallach and H. Larochelle and K. Grauman and N. Cesa-Bianchi and R. Garnett},
34+
pages = {9539--9549},
35+
year = {2018},
36+
publisher = {Curran Associates, Inc.},
37+
url = {http://papers.nips.cc/paper/8163-e-snli-natural-language-inference-with-natural-language-explanations.pdf}
38+
}
39+
"""
40+
41+
_DESCRIPTION = """
42+
The e-SNLI dataset extends the Stanford Natural Language Inference Dataset to
43+
include human-annotated natural language explanations of the entailment
44+
relations.
45+
"""
46+
_URL = 'https://raw.githubusercontent.com/OanaMariaCamburu/e-SNLI/master/dataset/'
47+
48+
49+
class Esnli(tfds.core.GeneratorBasedBuilder):
  """e-SNLI: Natural Language Inference with Natural Language Explanations corpus."""

  BUILDER_CONFIGS = [
      tfds.core.BuilderConfig(
          name='plain_text',
          version=tfds.core.Version('0.0.1'),
          description='Plain text import of e-SNLI',
      )
  ]

  def _info(self):
    """Returns the `DatasetInfo` describing features, homepage and citation."""
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            'premise':
                tfds.features.Text(),
            'hypothesis':
                tfds.features.Text(),
            'label':
                tfds.features.ClassLabel(
                    names=['entailment', 'neutral', 'contradiction']),
            'explanation':
                tfds.features.Text(),
        }),
        supervised_keys=None,
        homepage='https://github.com/OanaMariaCamburu/e-SNLI',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager):
    """Returns SplitGenerators.

    Args:
      dl_manager: download manager; its `download_and_extract` is called with
        a dict mapping split name to a list of CSV URLs.

    Returns:
      A list with one `SplitGenerator` each for the train, validation and test
      splits. Each passes its downloaded paths to `_generate_examples` as
      `files`.
    """
    # URLs are built by plain concatenation rather than `os.path.join`:
    # `os.path.join` uses the platform path separator, which would yield
    # backslashes (and thus invalid URLs) on Windows. `_URL` ends with '/'.
    urls = {
        'train': [_URL + 'esnli_train_1.csv', _URL + 'esnli_train_2.csv'],
        'validation': [_URL + 'esnli_dev.csv'],
        'test': [_URL + 'esnli_test.csv'],
    }
    files = dl_manager.download_and_extract(urls)

    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.TRAIN,
            gen_kwargs={'files': files['train']},
        ),
        tfds.core.SplitGenerator(
            name=tfds.Split.VALIDATION,
            gen_kwargs={'files': files['validation']},
        ),
        tfds.core.SplitGenerator(
            name=tfds.Split.TEST,
            gen_kwargs={'files': files['test']},
        ),
    ]

  def _generate_examples(self, files):
    """Yields examples.

    Args:
      files: iterable of paths to CSV files with a header row containing at
        least the columns `pairID`, `gold_label`, `Sentence1`, `Sentence2` and
        `Explanation_1`.

    Yields:
      `(key, example)` tuples where the key is the row's `pairID` and the
      example is a dict with `premise`, `hypothesis`, `label` and
      `explanation`.
    """
    for filepath in files:
      with tf.io.gfile.GFile(filepath) as f:
        # Only the first explanation column is read; other columns in the row
        # are ignored.
        for row in csv.DictReader(f):
          yield row['pairID'], {
              'premise': row['Sentence1'],
              'hypothesis': row['Sentence2'],
              'label': row['gold_label'],
              'explanation': row['Explanation_1'],
          }
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# coding=utf-8
2+
# Copyright 2019 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Test for e-SNLI dataset module."""
17+
18+
from __future__ import absolute_import
19+
from __future__ import division
20+
from __future__ import print_function
21+
22+
from tensorflow_datasets import testing
23+
from tensorflow_datasets.text import esnli
24+
25+
26+
class EsnliTest(testing.DatasetBuilderTestCase):
  """Builder test case for the e-SNLI dataset using fake CSV data."""

  DATASET_CLASS = esnli.Esnli

  # Number of fake examples expected in each split.
  SPLITS = dict(
      train=2,
      test=1,
      validation=1,
  )

  # Fake file names per split, mirroring the dict keys that the builder passes
  # to `download_and_extract` (presumably substituted for its result by the
  # test harness -- confirm against `DatasetBuilderTestCase`).
  DL_EXTRACT_RESULT = dict(
      train=["esnli_train_1.csv", "esnli_train_2.csv"],
      test=["esnli_test.csv"],
      validation=["esnli_dev.csv"],
  )
39+
40+
41+
if __name__ == "__main__":
  # Run the test case(s) above when this module is executed as a script.
  testing.test_main()
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
https://raw.githubusercontent.com/OanaMariaCamburu/e-SNLI/master/dataset/esnli_dev.csv 7501310 d7a656d196eea18fd827d6f5486b00f5d4cf469dbf13a252cc211495a186bcb4
2+
https://raw.githubusercontent.com/OanaMariaCamburu/e-SNLI/master/dataset/esnli_test.csv 7438107 9269aa8075dbfed0fa8a5012eb2ac9c18a92be139671f05df897f69092592927
3+
https://raw.githubusercontent.com/OanaMariaCamburu/e-SNLI/master/dataset/esnli_train_1.csv 90169741 7311c7bc16ad9f6a9adcd116a62ef991e1803dc0d71e253ce66b975c2aba8ee5
4+
https://raw.githubusercontent.com/OanaMariaCamburu/e-SNLI/master/dataset/esnli_train_2.csv 99406852 c8827fa2ba1ef5891f077d6fafbfc2fdb6cab1271968b74257e08d3b2b3cbacc

0 commit comments

Comments
 (0)