Merge pull request #1252 from Alex-Fabbri:opinosis

copybara-github · copybara-github · commit 643ee98b67d0 · 2020-02-20T09:13:03.000-08:00
PiperOrigin-RevId: 296215231
diff --git a/docs/release_notes.md b/docs/release_notes.md
@@ -17,6 +17,7 @@
     [VTAB benchmark](https://arxiv.org/abs/1910.04867).
 *   Add e-SNLI dataset from the paper
     [e-SNLI](http://papers.nips.cc/paper/8163-e-snli-natural-language-inference-with-natural-language-explanations.pdf).
+*   Add [Opinosis dataset](https://www.aclweb.org/anthology/C10-1039.pdf).
 *   Add SCAN dataset introduced [here](https://arxiv.org/pdf/1711.00350.pdf).
 *   Add [Imagewang](https://github.com/fastai/imagenette) dataset.
 *   Add DIV2K dataset from the paper
diff --git a/tensorflow_datasets/summarization/__init__.py b/tensorflow_datasets/summarization/__init__.py
@@ -23,6 +23,7 @@
 from tensorflow_datasets.summarization.gigaword import Gigaword
 from tensorflow_datasets.summarization.multi_news import MultiNews
 from tensorflow_datasets.summarization.newsroom import Newsroom
+from tensorflow_datasets.summarization.opinosis import Opinosis
 from tensorflow_datasets.summarization.reddit_tifu import RedditTifu
 from tensorflow_datasets.summarization.scientific_papers import ScientificPapers
 from tensorflow_datasets.summarization.wikihow import Wikihow
diff --git a/tensorflow_datasets/summarization/opinosis.py b/tensorflow_datasets/summarization/opinosis.py
@@ -0,0 +1,94 @@
+# coding=utf-8
+# Copyright 2020 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Opinosis Opinion Dataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import tensorflow as tf
+import tensorflow_datasets.public_api as tfds
+
+# BibTeX entry for the Opinosis paper; passed to `DatasetInfo(citation=...)`.
+_CITATION = """
+@inproceedings{ganesan2010opinosis,
+  title={Opinosis: a graph-based approach to abstractive summarization of highly redundant opinions},
+  author={Ganesan, Kavita and Zhai, ChengXiang and Han, Jiawei},
+  booktitle={Proceedings of the 23rd International Conference on Computational Linguistics},
+  pages={340--348},
+  year={2010},
+  organization={Association for Computational Linguistics}
+}
+"""
+
+# Human-readable summary; passed to `DatasetInfo(description=...)`.
+_DESCRIPTION = """
+The Opinosis Opinion Dataset consists of sentences extracted from reviews for 51 topics.
+Topics and opinions are obtained from Tripadvisor, Edmunds.com and Amazon.com.
+"""
+
+# Zip archive that `_split_generators` downloads and extracts.
+_URL = "https://github.com/kavgan/opinosis-summarization/raw/master/OpinosisDataset1.0_0.zip"
+
+# Feature-dict keys: the topic's review text and its list of gold summaries.
+_REVIEW_SENTS = "review_sents"
+_SUMMARIES = "summaries"
+
+
+class Opinosis(tfds.core.GeneratorBasedBuilder):
+  """Opinosis Opinion Dataset."""
+
+  VERSION = tfds.core.Version("1.0.0")
+
+  def _info(self):
+    return tfds.core.DatasetInfo(
+        builder=self,
+        description=_DESCRIPTION,
+        features=tfds.features.FeaturesDict({
+            _REVIEW_SENTS: tfds.features.Text(),
+            _SUMMARIES: tfds.features.Sequence(tfds.features.Text())
+        }),
+        supervised_keys=(_REVIEW_SENTS, _SUMMARIES),
+        homepage="http://kavita-ganesan.com/opinosis/",
+        citation=_CITATION,
+    )
+
+  def _split_generators(self, dl_manager):
+    """Returns SplitGenerators."""
+    extract_path = dl_manager.download_and_extract(_URL)
+    return [
+        tfds.core.SplitGenerator(
+            name=tfds.Split.TRAIN,
+            gen_kwargs={"path": extract_path},
+        ),
+    ]
+
+  def _generate_examples(self, path=None):
+    """Yields examples."""
+    topics_path = os.path.join(path, "topics")
+    filenames = tf.io.gfile.listdir(topics_path)
+    for filename in filenames:
+      file_path = os.path.join(topics_path, filename)
+      topic_name = filename.split(".txt")[0]
+      with tf.io.gfile.GFile(file_path, "rb") as src_f:
+        input_data = src_f.read()
+      summaries_path = os.path.join(path, "summaries-gold", topic_name)
+      summary_lst = []
+      for summ_filename in sorted(tf.io.gfile.listdir(summaries_path)):
+        file_path = os.path.join(summaries_path, summ_filename)
+        with tf.io.gfile.GFile(file_path, "rb") as tgt_f:
+          data = tgt_f.read().strip()
+          summary_lst.append(data)
+      summary_data = summary_lst
+      yield filename, {_REVIEW_SENTS: input_data, _SUMMARIES: summary_data}
diff --git a/tensorflow_datasets/summarization/opinosis_test.py b/tensorflow_datasets/summarization/opinosis_test.py
@@ -0,0 +1,35 @@
+# coding=utf-8
+# Copyright 2020 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Test for Opinosis Opinion Dataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow_datasets import testing
+from tensorflow_datasets.summarization import opinosis
+
+
+class OpinosisTest(testing.DatasetBuilderTestCase):
+  DATASET_CLASS = opinosis.Opinosis
+  SPLITS = {
+      "train": 2,  # Number of fake test example
+  }
+  DL_EXTRACT_RESULT = ""
+
+
+# Run the tests through the TFDS test runner when executed as a script.
+if __name__ == "__main__":
+  testing.test_main()
diff --git a/tensorflow_datasets/testing/test_data/fake_examples/opinosis/summaries-gold/topic1/topic1.1.gold b/tensorflow_datasets/testing/test_data/fake_examples/opinosis/summaries-gold/topic1/topic1.1.gold
@@ -0,0 +1,2 @@
+This is a gold summary for topic 1. 
+Sentences in gold summaries are separated by newlines.
diff --git a/tensorflow_datasets/testing/test_data/fake_examples/opinosis/summaries-gold/topic1/topic1.2.gold b/tensorflow_datasets/testing/test_data/fake_examples/opinosis/summaries-gold/topic1/topic1.2.gold
@@ -0,0 +1,2 @@
+This is another gold summary for topic 1. 
+Sentences in gold summaries are separated by newlines.
diff --git a/tensorflow_datasets/testing/test_data/fake_examples/opinosis/summaries-gold/topic2/topic2.1.gold b/tensorflow_datasets/testing/test_data/fake_examples/opinosis/summaries-gold/topic2/topic2.1.gold
@@ -0,0 +1,2 @@
+This is a gold summary for topic 2. 
+Sentences in gold summaries are separated by newlines.
diff --git a/tensorflow_datasets/testing/test_data/fake_examples/opinosis/summaries-gold/topic2/topic2.2.gold b/tensorflow_datasets/testing/test_data/fake_examples/opinosis/summaries-gold/topic2/topic2.2.gold
@@ -0,0 +1,2 @@
+This is another gold summary for topic 2. 
+Sentences in gold summaries are separated by newlines.
diff --git a/tensorflow_datasets/testing/test_data/fake_examples/opinosis/summaries-gold/topic2/topic2.3.gold b/tensorflow_datasets/testing/test_data/fake_examples/opinosis/summaries-gold/topic2/topic2.3.gold
@@ -0,0 +1,3 @@
+This is another gold summary for topic 2. 
+Sentences in gold summaries are separated by newlines.
+Topics have a variable number of gold summaries.
diff --git a/tensorflow_datasets/testing/test_data/fake_examples/opinosis/topics/topic1.txt.data b/tensorflow_datasets/testing/test_data/fake_examples/opinosis/topics/topic1.txt.data
@@ -0,0 +1,2 @@
+This is a fake topic. 
+The topics have multiple sentence inputs. 
diff --git a/tensorflow_datasets/testing/test_data/fake_examples/opinosis/topics/topic2.txt.data b/tensorflow_datasets/testing/test_data/fake_examples/opinosis/topics/topic2.txt.data
@@ -0,0 +1,2 @@
+This is the second fake topic. 
+The topics have multiple sentence inputs. 
diff --git a/tensorflow_datasets/url_checksums/opinosis.txt b/tensorflow_datasets/url_checksums/opinosis.txt
@@ -0,0 +1 @@
+https://github.com/kavgan/opinosis-summarization/raw/master/OpinosisDataset1.0_0.zip 757398 ed0be6c80fe32e9071ddd7fbb7a272fea7efa39b6b3fa77aa1460c7b81026547

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+This is a gold summary for topic 1.`
	`2`	`+Sentences in gold summaries are separated by newlines.`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+This is another gold summary for topic 1.`
	`2`	`+Sentences in gold summaries are separated by newlines.`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+This is a gold summary for topic 2.`
	`2`	`+Sentences in gold summaries are separated by newlines.`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+This is another gold summary for topic 2.`
	`2`	`+Sentences in gold summaries are separated by newlines.`
	`3`	`+Topics have a variable number of gold summaries.`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+This is a fake topic.`
	`2`	`+The topics have multiple sentence inputs.`