
Commit 6db487c

sharannarang authored and copybara-github committed
Add movie rationales dataset
PiperOrigin-RevId: 289948223
1 parent 079979f commit 6db487c

File tree

12 files changed: +167 -0 lines changed

movies/docs/test_1.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Excellent acting, overall great movie.

movies/docs/train_1.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Boring movie

movies/docs/train_2.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Lazy acting

movies/docs/train_3.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Excellent direction. I had a great time during the movie.

movies/docs/val_1.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Fun movie

movies/test.jsonl

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+{"annotation_id": "test_1.txt", "classification": "POS", "evidences":[[{"start_pos": 0, "end_pos": 7, "text": "Excellent acting"}]]}

movies/train.jsonl

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+{"annotation_id": "train_1.txt", "classification": "NEG", "evidences":[[{"start_pos": 0, "end_pos": 7, "text": "Boring movie"}]]}
+{"annotation_id": "train_2.txt", "classification": "NEG", "evidences":[[{"start_pos": 0, "end_pos": 7, "text": "Lazy acting"}]]}
+{"annotation_id": "train_3.txt", "classification": "POS", "evidences":[[{"start_pos": 0, "end_pos": 7, "text": "Excellent direction"}]]}

movies/val.jsonl

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+{"annotation_id": "val_1.txt", "classification": "POS", "evidences":[[{"start_pos": 0, "end_pos": 7, "text": "Fun movie"}]]}

tensorflow_datasets/text/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -26,6 +26,7 @@
 from tensorflow_datasets.text.lm1b import Lm1b
 from tensorflow_datasets.text.lm1b import Lm1bConfig
 from tensorflow_datasets.text.math_dataset import MathDataset
+from tensorflow_datasets.text.movie_rationales import MovieRationales
 from tensorflow_datasets.text.multi_nli import MultiNLI
 from tensorflow_datasets.text.multi_nli_mismatch import MultiNLIMismatch
 from tensorflow_datasets.text.scan import Scan
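
Importing the builder class in __init__.py is what exposes the new dataset to TFDS: DatasetBuilder subclasses register themselves under the snake_case form of the class name. A quick sanity check, assuming an installed tensorflow_datasets that includes this commit:

import tensorflow_datasets as tfds

# MovieRationales is registered under its snake_case name.
assert 'movie_rationales' in tfds.list_builders()
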
tensorflow_datasets/text/movie_rationales.py

Lines changed: 118 additions & 0 deletions
@@ -0,0 +1,118 @@
# coding=utf-8
# Copyright 2019 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Movie reviews with human annotated rationales."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import json
import os

import tensorflow as tf
import tensorflow_datasets.public_api as tfds

_CITATION = """
@unpublished{eraser2019,
  title = {ERASER: A Benchmark to Evaluate Rationalized NLP Models},
  author = {Jay DeYoung and Sarthak Jain and Nazneen Fatema Rajani and Eric Lehman and Caiming Xiong and Richard Socher and Byron C. Wallace}
}
@InProceedings{zaidan-eisner-piatko-2008:nips,
  author = {Omar F. Zaidan and Jason Eisner and Christine Piatko},
  title = {Machine Learning with Annotator Rationales to Reduce Annotation Cost},
  booktitle = {Proceedings of the NIPS*2008 Workshop on Cost Sensitive Learning},
  month = {December},
  year = {2008}
}
"""

_DESCRIPTION = """
The movie rationale dataset contains human annotated rationales for movie
reviews.
"""

_DOWNLOAD_URL = 'http://www.eraserbenchmark.com/zipped/movies.tar.gz'


class MovieRationales(tfds.core.GeneratorBasedBuilder):
  """Movie reviews with human annotated rationales."""

  VERSION = tfds.core.Version('0.1.0')

  def _info(self):
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            'review': tfds.features.Text(),
            'label': tfds.features.ClassLabel(names=['NEG', 'POS']),
            'evidences': tfds.features.Sequence(tfds.features.Text()),
        }),
        supervised_keys=None,
        homepage='http://www.cs.jhu.edu/~ozaidan/rationales/',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager):
    """Returns SplitGenerators."""
    dl_dir = dl_manager.download_and_extract(_DOWNLOAD_URL)
    data_dir = os.path.join(dl_dir, 'movies')

    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.TRAIN,
            gen_kwargs={
                'data_dir': data_dir,
                'filepath': os.path.join(data_dir, 'train.jsonl'),
            },
        ),
        tfds.core.SplitGenerator(
            name=tfds.Split.VALIDATION,
            gen_kwargs={
                'data_dir': data_dir,
                'filepath': os.path.join(data_dir, 'val.jsonl'),
            },
        ),
        tfds.core.SplitGenerator(
            name=tfds.Split.TEST,
            gen_kwargs={
                'data_dir': data_dir,
                'filepath': os.path.join(data_dir, 'test.jsonl'),
            },
        ),
    ]

  def _generate_examples(self, data_dir, filepath):
    """Yields examples."""
    reviews_dir = os.path.join(data_dir, 'docs')

    with tf.io.gfile.GFile(filepath) as f:
      for line in f:
        row = json.loads(line)
        # Each annotation names the review document it rationalizes.
        doc_id = row['annotation_id']
        review_file = os.path.join(reviews_dir, doc_id)
        with tf.io.gfile.GFile(review_file) as f1:
          review_text = f1.read()

        # Flatten the nested evidence groups into a flat list of span texts.
        evidences = []
        for evidence in row['evidences']:
          for e in evidence:
            evidences.append(e['text'])

        yield doc_id, {
            'review': review_text,
            'label': row['classification'],
            'evidences': evidences,
        }
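
Once registered, the dataset loads like any other TFDS dataset. A minimal usage sketch (tfds.load and tfds.as_numpy are standard TFDS API; the split names come from _split_generators above):

import tensorflow_datasets as tfds

# Downloads the ERASER archive, prepares all three splits, and loads train.
ds = tfds.load('movie_rationales', split='train')

for example in tfds.as_numpy(ds.take(1)):
  # 'review' is the full review text, 'label' the NEG/POS class index,
  # and 'evidences' the flattened list of rationale spans.
  print(example['review'][:80], example['label'], example['evidences'])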
