
Commit 6db487c

sharannarang authored and copybara-github committed
Add movie rationales dataset
PiperOrigin-RevId: 289948223
1 parent 079979f commit 6db487c

File tree

12 files changed: +167 -0 lines changed

movies/docs/test_1.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Excellent acting, overall great movie.

movies/docs/train_1.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Boring movie

movies/docs/train_2.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Lazy acting

movies/docs/train_3.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Excellent direction. I had a great time during the movie.

movies/docs/val_1.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Fun movie

movies/test.jsonl

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+{"annotation_id": "test_1.txt", "classification": "POS", "evidences":[[{"start_pos": 0, "end_pos": 7, "text": "Excellent acting"}]]}

movies/train.jsonl

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+{"annotation_id": "train_1.txt", "classification": "NEG", "evidences":[[{"start_pos": 0, "end_pos": 7, "text": "Boring movie"}]]}
+{"annotation_id": "train_2.txt", "classification": "NEG", "evidences":[[{"start_pos": 0, "end_pos": 7, "text": "Lazy acting"}]]}
+{"annotation_id": "train_3.txt", "classification": "POS", "evidences":[[{"start_pos": 0, "end_pos": 7, "text": "Excellent direction"}]]}

movies/val.jsonl

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+{"annotation_id": "val_1.txt", "classification": "POS", "evidences":[[{"start_pos": 0, "end_pos": 7, "text": "Fun movie"}]]}

tensorflow_datasets/text/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -26,6 +26,7 @@
 from tensorflow_datasets.text.lm1b import Lm1b
 from tensorflow_datasets.text.lm1b import Lm1bConfig
 from tensorflow_datasets.text.math_dataset import MathDataset
+from tensorflow_datasets.text.movie_rationales import MovieRationales
 from tensorflow_datasets.text.multi_nli import MultiNLI
 from tensorflow_datasets.text.multi_nli_mismatch import MultiNLIMismatch
 from tensorflow_datasets.text.scan import Scan
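
Importing the builder class in __init__.py is what exposes the new dataset to TFDS: DatasetBuilder subclasses register themselves under the snake_case form of the class name. A quick sanity check, assuming an installed tensorflow_datasets that includes this commit:

import tensorflow_datasets as tfds

# MovieRationales is registered under its snake_case name.
assert 'movie_rationales' in tfds.list_builders()
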
tensorflow_datasets/text/movie_rationales.py

Lines changed: 118 additions & 0 deletions
@@ -0,0 +1,118 @@
# coding=utf-8
# Copyright 2019 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Movie reviews with human annotated rationales."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import json
import os

import tensorflow as tf
import tensorflow_datasets.public_api as tfds

_CITATION = """
@unpublished{eraser2019,
  title = {ERASER: A Benchmark to Evaluate Rationalized NLP Models},
  author = {Jay DeYoung and Sarthak Jain and Nazneen Fatema Rajani and Eric Lehman and Caiming Xiong and Richard Socher and Byron C. Wallace}
}
@InProceedings{zaidan-eisner-piatko-2008:nips,
  author = {Omar F. Zaidan and Jason Eisner and Christine Piatko},
  title = {Machine Learning with Annotator Rationales to Reduce Annotation Cost},
  booktitle = {Proceedings of the NIPS*2008 Workshop on Cost Sensitive Learning},
  month = {December},
  year = {2008}
}
"""

_DESCRIPTION = """
The movie rationale dataset contains human annotated rationales for movie
reviews.
"""

_DOWNLOAD_URL = 'http://www.eraserbenchmark.com/zipped/movies.tar.gz'


class MovieRationales(tfds.core.GeneratorBasedBuilder):
  """Movie reviews with human annotated rationales."""

  VERSION = tfds.core.Version('0.1.0')

  def _info(self):
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            'review': tfds.features.Text(),
            'label': tfds.features.ClassLabel(names=['NEG', 'POS']),
            'evidences': tfds.features.Sequence(tfds.features.Text()),
        }),
        supervised_keys=None,
        homepage='http://www.cs.jhu.edu/~ozaidan/rationales/',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager):
    """Returns SplitGenerators."""
    dl_dir = dl_manager.download_and_extract(_DOWNLOAD_URL)
    data_dir = os.path.join(dl_dir, 'movies')

    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.TRAIN,
            gen_kwargs={
                'data_dir': data_dir,
                'filepath': os.path.join(data_dir, 'train.jsonl'),
            },
        ),
        tfds.core.SplitGenerator(
            name=tfds.Split.VALIDATION,
            gen_kwargs={
                'data_dir': data_dir,
                'filepath': os.path.join(data_dir, 'val.jsonl'),
            },
        ),
        tfds.core.SplitGenerator(
            name=tfds.Split.TEST,
            gen_kwargs={
                'data_dir': data_dir,
                'filepath': os.path.join(data_dir, 'test.jsonl'),
            },
        ),
    ]

  def _generate_examples(self, data_dir, filepath):
    """Yields examples."""
    reviews_dir = os.path.join(data_dir, 'docs')

    with tf.io.gfile.GFile(filepath) as f:
      for line in f:
        row = json.loads(line)
        # Each annotation names the review document it rationalizes.
        doc_id = row['annotation_id']
        review_file = os.path.join(reviews_dir, doc_id)
        with tf.io.gfile.GFile(review_file) as f1:
          review_text = f1.read()

        # Flatten the nested evidence groups into a flat list of span texts.
        evidences = []
        for evidence in row['evidences']:
          for e in evidence:
            evidences.append(e['text'])

        yield doc_id, {
            'review': review_text,
            'label': row['classification'],
            'evidences': evidences,
        }
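
Once registered, the dataset loads like any other TFDS dataset. A minimal usage sketch (tfds.load and tfds.as_numpy are standard TFDS API; the split names come from _split_generators above):

import tensorflow_datasets as tfds

# Downloads the ERASER archive, prepares all three splits, and loads train.
ds = tfds.load('movie_rationales', split='train')

for example in tfds.as_numpy(ds.take(1)):
  # 'review' is the full review text, 'label' the NEG/POS class index,
  # and 'evidences' the flattened list of rationale spans.
  print(example['review'][:80], example['label'], example['evidences'])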
