Skip to content

Commit 65fc442

Browse files
author
Alex-Fabbri
committed
Added Opinosis data
1 parent 828684e commit 65fc442

File tree

12 files changed

+123
-0
lines changed

12 files changed

+123
-0
lines changed

docs/release_notes.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,4 @@
1515
from the [MimickNet paper](https://arxiv.org/abs/1908.05782)
1616
* Add Dmlab dataset from the
1717
[VTAB benchmark](https://arxiv.org/abs/1910.04867).
18+
* Add [Opinosis dataset](https://www.aclweb.org/anthology/C10-1039.pdf).

tensorflow_datasets/summarization/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from tensorflow_datasets.summarization.gigaword import Gigaword
2424
from tensorflow_datasets.summarization.multi_news import MultiNews
2525
from tensorflow_datasets.summarization.newsroom import Newsroom
26+
from tensorflow_datasets.summarization.opinosis import Opinosis
2627
from tensorflow_datasets.summarization.reddit_tifu import RedditTifu
2728
from tensorflow_datasets.summarization.scientific_papers import ScientificPapers
2829
from tensorflow_datasets.summarization.wikihow import Wikihow
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
"""Opinosis Opinion Dataset"""
2+
3+
from __future__ import absolute_import
4+
from __future__ import division
5+
from __future__ import print_function
6+
7+
import os
8+
9+
import tensorflow as tf
10+
import tensorflow_datasets.public_api as tfds
11+
12+
_CITATION = """
13+
@inproceedings{ganesan2010opinosis,
14+
title={Opinosis: a graph-based approach to abstractive summarization of highly redundant opinions},
15+
author={Ganesan, Kavita and Zhai, ChengXiang and Han, Jiawei},
16+
booktitle={Proceedings of the 23rd International Conference on Computational Linguistics},
17+
pages={340--348},
18+
year={2010},
19+
organization={Association for Computational Linguistics}
20+
}
21+
"""
22+
23+
_DESCRIPTION = """
24+
The Opinosis Opinion Dataset consists of sentences extracted from reviews for 51 topics.
25+
Topics and opinions are obtained from Tripadvisor, Edmunds.com and Amazon.com.
26+
"""
27+
28+
_URL = "https://github.com/kavgan/opinosis-summarization/raw/master/OpinosisDataset1.0_0.zip"
29+
30+
_REVIEW_SENTS = 'review_sents'
31+
_SUMMARIES = 'summaries'
32+
33+
class Opinosis(tfds.core.GeneratorBasedBuilder):
  """Opinosis Opinion Dataset.

  Each example pairs the review sentences of one topic with all gold
  summaries written for that topic. The summaries of a topic are stored in a
  single text feature, separated by the marker "[SEP_SUM]".
  """

  VERSION = tfds.core.Version("1.0.0")

  def _info(self):
    """Returns the DatasetInfo: two text features and their supervised pair."""
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            _REVIEW_SENTS: tfds.features.Text(),
            _SUMMARIES: tfds.features.Text(),
        }),
        supervised_keys=(_REVIEW_SENTS, _SUMMARIES),
        homepage="http://kavita-ganesan.com/opinosis/#.XeTZopNKhTY",
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager):
    """Returns SplitGenerators.

    The archive at `_URL` is downloaded and extracted, and its whole content
    is exposed as a single test split.

    Args:
      dl_manager: download manager used to fetch and extract the archive.

    Returns:
      A one-element list holding the test split generator.
    """
    extract_path = dl_manager.download_and_extract(_URL)
    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.TEST,
            gen_kwargs={"path": extract_path},
        ),
    ]

  def _generate_examples(self, path=None):
    """Yields examples.

    Args:
      path: directory of the extracted archive. It must contain a `topics`
        directory (one file per topic) and a `summaries-gold` directory with
        one sub-directory per topic holding that topic's gold summaries.

    Yields:
      `(index, example)` tuples, where `example` maps `_REVIEW_SENTS` to the
      raw content of the topic file and `_SUMMARIES` to the topic's gold
      summaries joined with "[SEP_SUM]".
    """
    topics_path = os.path.join(path, "topics")
    # Directory listings come back in arbitrary order; sorting keeps the
    # example keys and the summary order reproducible between runs.
    for i, filename in enumerate(sorted(tf.io.gfile.listdir(topics_path))):
      # The summaries directory is named after the part of the topic file
      # name that precedes ".txt".
      topic_name = filename.split(".txt")[0]
      # Read the raw bytes directly. Calling `str()` on `readlines()` would
      # store the printed form of a list of bytes instead of the text.
      with tf.io.gfile.GFile(os.path.join(topics_path, filename), "rb") as src_f:
        input_data = src_f.read()

      summaries_path = os.path.join(path, "summaries-gold", topic_name)
      summary_lst = []
      for summ_filename in sorted(tf.io.gfile.listdir(summaries_path)):
        summ_path = os.path.join(summaries_path, summ_filename)
        # Binary mode, like the topic files, so no text encoding is assumed.
        with tf.io.gfile.GFile(summ_path, "rb") as tgt_f:
          summary_lst.append(tgt_f.read())

      yield i, {
          _REVIEW_SENTS: input_data,
          _SUMMARIES: b"[SEP_SUM]".join(summary_lst),
      }
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
"""Test for Opinosis Opinion Dataset"""
2+
3+
from __future__ import absolute_import
4+
from __future__ import division
5+
from __future__ import print_function
6+
7+
from tensorflow_datasets import testing
8+
from tensorflow_datasets.summarization import opinosis
9+
10+
11+
class OpinosisTest(testing.DatasetBuilderTestCase):
  """Runs the shared dataset-builder test case against `opinosis.Opinosis`."""

  # Builder under test.
  DATASET_CLASS = opinosis.Opinosis

  # Number of fake examples expected in each split.
  SPLITS = dict(test=2)

  # NOTE(review): an empty string presumably makes the mocked
  # download-and-extract call resolve to the fake-data root; confirm against
  # DatasetBuilderTestCase.
  DL_EXTRACT_RESULT = ""
17+
18+
19+
if __name__ == "__main__":
20+
testing.test_main()
21+
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
This is a gold summary for topic 1.
2+
Sentences in gold summaries are separated by newlines.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
This is another gold summary for topic 1.
2+
Sentences in gold summaries are separated by newlines.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
This is a gold summary for topic 2.
2+
Sentences in gold summaries are separated by newlines.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
This is another gold summary for topic 2.
2+
Sentences in gold summaries are separated by newlines.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
This is another gold summary for topic 2.
2+
Sentences in gold summaries are separated by newlines.
3+
Topics have a variable number of gold summaries.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
This is a fake topic.
2+
The topics have multiple sentence inputs.

0 commit comments

Comments
 (0)