Add mathematical dataset

TensorFlow Datasets Team · copybara-github · commit 828684e12d69 · 2019-11-13T20:02:30.000-08:00
PiperOrigin-RevId: 280337000
diff --git a/tensorflow_datasets/testing/test_data/fake_examples/math_dataset/mathematics_dataset-v1.0/interpolate/arithmetic__div_big.txt b/tensorflow_datasets/testing/test_data/fake_examples/math_dataset/mathematics_dataset-v1.0/interpolate/arithmetic__div_big.txt
@@ -0,0 +1,12 @@
+-1022460818 divided by 2676599
+-382
+What is -1 divided by -13692346004?
+1/13692346004
+Divide 1136975704 by -142121963.
+-8
+Divide 37464710 by -1651.
+-37464710/1651
+56796887 divided by 7
+8113841
+Calculate 1691 divided by -4109399.
+-1691/4109399
diff --git a/tensorflow_datasets/testing/test_data/fake_examples/math_dataset/mathematics_dataset-v1.0/train-easy/arithmetic__div_big.txt b/tensorflow_datasets/testing/test_data/fake_examples/math_dataset/mathematics_dataset-v1.0/train-easy/arithmetic__div_big.txt
@@ -0,0 +1,2 @@
+202 divided by -50133602
+-101/25066801
diff --git a/tensorflow_datasets/testing/test_data/fake_examples/math_dataset/mathematics_dataset-v1.0/train-hard/arithmetic__div_big.txt b/tensorflow_datasets/testing/test_data/fake_examples/math_dataset/mathematics_dataset-v1.0/train-hard/arithmetic__div_big.txt
@@ -0,0 +1,8 @@
+202 divided by -50133602
+-101/25066801
+Calculate -90176 divided by -1017273.
+90176/1017273
+Calculate -717706881 divided by 3.
+-239235627
+Divide 1380457090 by 39.
+1380457090/39
diff --git a/tensorflow_datasets/testing/test_data/fake_examples/math_dataset/mathematics_dataset-v1.0/train-medium/arithmetic__div_big.txt b/tensorflow_datasets/testing/test_data/fake_examples/math_dataset/mathematics_dataset-v1.0/train-medium/arithmetic__div_big.txt
@@ -0,0 +1,2 @@
+Divide 1380457090 by 39.
+1380457090/39
diff --git a/tensorflow_datasets/text/__init__.py b/tensorflow_datasets/text/__init__.py
@@ -23,6 +23,7 @@
 from tensorflow_datasets.text.imdb import IMDBReviewsConfig
 from tensorflow_datasets.text.lm1b import Lm1b
 from tensorflow_datasets.text.lm1b import Lm1bConfig
+from tensorflow_datasets.text.math_dataset import MathDataset
 from tensorflow_datasets.text.multi_nli import MultiNLI
 from tensorflow_datasets.text.multi_nli_mismatch import MultiNLIMismatch
 from tensorflow_datasets.text.snli import Snli
diff --git a/tensorflow_datasets/text/math_dataset.py b/tensorflow_datasets/text/math_dataset.py
@@ -0,0 +1,293 @@
+# coding=utf-8
+# Copyright 2019 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Mathematics database."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+from absl import logging
+import tensorflow as tf
+import tensorflow_datasets.public_api as tfds
+
+_CITATION = """
+@article{2019arXiv,
+  author = {Saxton, Grefenstette, Hill, Kohli},
+  title = {Analysing Mathematical Reasoning Abilities of Neural Models},
+  year = {2019},
+  journal = {arXiv:1904.01557}
+}
+"""
+
+_DESCRIPTION = """
+Mathematics database.
+
+This dataset code generates mathematical question and answer pairs,
+from a range of question types at roughly school-level difficulty.
+This is designed to test the mathematical learning and algebraic
+reasoning skills of learning models.
+
+Original paper: Analysing Mathematical Reasoning Abilities of Neural Models
+(Saxton, Grefenstette, Hill, Kohli).
+
+Example usage:
+train_examples, test_examples = tfds.load(
+    'math_dataset/arithmetic__mul',
+    split=['train', 'test'],
+    as_supervised=True)
+"""
+
+_DATA_URL = "https://storage.googleapis.com/mathematics-dataset/mathematics_dataset-v1.0.tar.gz"
+
+_TRAIN_CATEGORY = [
+    "train-easy",
+    "train-medium",
+    "train-hard",
+]
+
+_INTERPOLATE_CATEGORY = [
+    "interpolate",
+]
+
+_MODULES = [
+    # extrapolate (NOTE(review): no split reads an extrapolate/ directory)
+    "algebra__polynomial_roots_big",
+    "arithmetic__add_or_sub_big",
+    "arithmetic__add_sub_multiple_longer",
+    "arithmetic__div_big",
+    "arithmetic__mixed_longer",
+    "arithmetic__mul_big",
+    "arithmetic__mul_div_multiple_longer",
+    "comparison__closest_more",
+    "comparison__kth_biggest_more",
+    "comparison__sort_more",
+    "measurement__conversion",
+    "numbers__place_value_big",
+    "numbers__round_number_big",
+    "probability__swr_p_level_set_more_samples",
+    "probability__swr_p_sequence_more_samples",
+
+    # interpolate
+    "algebra__linear_1d",
+    "algebra__linear_1d_composed",
+    "algebra__linear_2d",
+    "algebra__linear_2d_composed",
+    "algebra__polynomial_roots",
+    "algebra__polynomial_roots_composed",
+    "algebra__sequence_next_term",
+    "algebra__sequence_nth_term",
+    "arithmetic__add_or_sub",
+    "arithmetic__add_or_sub_in_base",
+    "arithmetic__add_sub_multiple",
+    "arithmetic__div",
+    "arithmetic__mixed",
+    "arithmetic__mul",
+    "arithmetic__mul_div_multiple",
+    "arithmetic__nearest_integer_root",
+    "arithmetic__simplify_surd",
+    "calculus__differentiate",
+    "calculus__differentiate_composed",
+    "comparison__closest",
+    "comparison__closest_composed",
+    "comparison__kth_biggest",
+    "comparison__kth_biggest_composed",
+    "comparison__pair",
+    "comparison__pair_composed",
+    "comparison__sort",
+    "comparison__sort_composed",
+    "measurement__conversion",
+    "measurement__time",
+    "numbers__base_conversion",
+    "numbers__div_remainder",
+    "numbers__div_remainder_composed",
+    "numbers__gcd",
+    "numbers__gcd_composed",
+    "numbers__is_factor",
+    "numbers__is_factor_composed",
+    "numbers__is_prime",
+    "numbers__is_prime_composed",
+    "numbers__lcm",
+    "numbers__lcm_composed",
+    "numbers__list_prime_factors",
+    "numbers__list_prime_factors_composed",
+    "numbers__place_value",
+    "numbers__place_value_composed",
+    "numbers__round_number",
+    "numbers__round_number_composed",
+    "polynomials__add",
+    "polynomials__coefficient_named",
+    "polynomials__collect",
+    "polynomials__compose",
+    "polynomials__evaluate",
+    "polynomials__evaluate_composed",
+    "polynomials__expand",
+    "polynomials__simplify_power",
+    "probability__swr_p_level_set",
+    "probability__swr_p_sequence",
+
+    # train-easy, train-medium, train-hard (repeats the interpolate names)
+    "algebra__linear_1d",
+    "algebra__linear_1d_composed",
+    "algebra__linear_2d",
+    "algebra__linear_2d_composed",
+    "algebra__polynomial_roots",
+    "algebra__polynomial_roots_composed",
+    "algebra__sequence_next_term",
+    "algebra__sequence_nth_term",
+    "arithmetic__add_or_sub",
+    "arithmetic__add_or_sub_in_base",
+    "arithmetic__add_sub_multiple",
+    "arithmetic__div",
+    "arithmetic__mixed",
+    "arithmetic__mul",
+    "arithmetic__mul_div_multiple",
+    "arithmetic__nearest_integer_root",
+    "arithmetic__simplify_surd",
+    "calculus__differentiate",
+    "calculus__differentiate_composed",
+    "comparison__closest",
+    "comparison__closest_composed",
+    "comparison__kth_biggest",
+    "comparison__kth_biggest_composed",
+    "comparison__pair",
+    "comparison__pair_composed",
+    "comparison__sort",
+    "comparison__sort_composed",
+    "measurement__conversion",
+    "measurement__time",
+    "numbers__base_conversion",
+    "numbers__div_remainder",
+    "numbers__div_remainder_composed",
+    "numbers__gcd",
+    "numbers__gcd_composed",
+    "numbers__is_factor",
+    "numbers__is_factor_composed",
+    "numbers__is_prime",
+    "numbers__is_prime_composed",
+    "numbers__lcm",
+    "numbers__lcm_composed",
+    "numbers__list_prime_factors",
+    "numbers__list_prime_factors_composed",
+    "numbers__place_value",
+    "numbers__place_value_composed",
+    "numbers__round_number",
+    "numbers__round_number_composed",
+    "polynomials__add",
+    "polynomials__coefficient_named",
+    "polynomials__collect",
+    "polynomials__compose",
+    "polynomials__evaluate",
+    "polynomials__evaluate_composed",
+    "polynomials__expand",
+    "polynomials__simplify_power",
+    "probability__swr_p_level_set",
+    "probability__swr_p_sequence",
+]
+
+_QUESTION = "question"
+_ANSWER = "answer"
+
+_DATASET_VERSION = "mathematics_dataset-v1.0"
+
+
+def _generate_builder_configs():
+  """Returns one BuilderConfig per unique module name, sorted by name.
+
+  _MODULES lists some modules more than once, so it is de-duplicated; sorting
+  keeps the order of BUILDER_CONFIGS identical from one run to the next.
+  """
+  return [
+      tfds.core.BuilderConfig(
+          name=module,
+          version=tfds.core.Version("1.0.0"),
+          description=_DESCRIPTION,
+      ) for module in sorted(set(_MODULES))]
+
+
+class MathDataset(tfds.core.GeneratorBasedBuilder):
+  """Question/answer pairs from the DeepMind mathematics dataset (v1.0)."""
+
+  BUILDER_CONFIGS = _generate_builder_configs()
+
+  def _info(self):
+    """Returns DatasetInfo: two Text features, question -> answer."""
+    return tfds.core.DatasetInfo(
+        builder=self,
+        description=_DESCRIPTION,
+        features=tfds.features.FeaturesDict({
+            _QUESTION: tfds.features.Text(),
+            _ANSWER: tfds.features.Text(),
+        }),
+        supervised_keys=(_QUESTION, _ANSWER),
+        homepage="https://github.com/deepmind/mathematics_dataset",
+        citation=_CITATION,
+    )
+
+  def _read_data_from_all_categories(self, directory, config, categories):
+    """Returns the non-empty lines of `config` from every category that has it.
+
+    Categories without the file are skipped; order follows `categories`.
+    """
+    lines = []
+    for category in categories:
+      data_file = os.path.join(directory, _DATASET_VERSION, category, config)
+      if tf.io.gfile.exists(data_file):
+        with tf.io.gfile.GFile(data_file) as f:
+          # Drop blank lines (e.g. the one produced by a trailing newline).
+          lines.extend(line for line in f.read().split("\n") if line)
+    return lines
+
+  def _split_generators(self, dl_manager):
+    """Returns the train (easy+medium+hard) and test (interpolate) splits."""
+    directory = dl_manager.download_and_extract(_DATA_URL)
+    # Each builder config maps to one "<module>.txt" file per category.
+    config = self.builder_config.name + ".txt"
+    return [
+        tfds.core.SplitGenerator(
+            name=tfds.Split.TRAIN,
+            gen_kwargs={
+                "directory": directory,
+                "config": config,
+                "categories": _TRAIN_CATEGORY,
+            }),
+        tfds.core.SplitGenerator(
+            name=tfds.Split.TEST,
+            gen_kwargs={
+                "directory": directory,
+                "config": config,
+                "categories": _INTERPOLATE_CATEGORY,
+            }),
+    ]
+
+  def _generate_examples(self, directory, config, categories):
+    """Yields (index, {question, answer}) for each consecutive line pair.
+
+    Raises ValueError if the total number of non-empty lines is odd.
+    """
+    lines = self._read_data_from_all_categories(directory, config, categories)
+    logging.info("%s: %s contains total: %d", categories, config, len(lines))
+    questions = lines[::2]
+    answers = lines[1::2]
+    # An explicit raise (unlike `assert`) still runs under `python -O`.
+    if len(answers) != len(questions):
+      raise ValueError("answers: %d do not match questions: %d" %
+                       (len(answers), len(questions)))
+
+    for idx, (q, a) in enumerate(zip(questions, answers)):
+      result = {_QUESTION: q, _ANSWER: a}
+      if all(result.values()):
+        yield idx, result
diff --git a/tensorflow_datasets/text/math_dataset_test.py b/tensorflow_datasets/text/math_dataset_test.py
@@ -0,0 +1,36 @@
+# coding=utf-8
+# Copyright 2019 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for Mathematical dataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow_datasets import testing
+from tensorflow_datasets.text import math_dataset
+
+
+class MathDatasetTest(testing.DatasetBuilderTestCase):
+  DATASET_CLASS = math_dataset.MathDataset
+  BUILDER_CONFIG_NAMES_TO_TEST = ["arithmetic__div_big"]
+  SPLITS = {
+      "train": 6,  # Fake pairs: 1 train-easy + 1 train-medium + 4 train-hard
+      "test": 6,  # Fake pairs in interpolate/arithmetic__div_big.txt
+  }
+
+
+if __name__ == "__main__":
+  testing.test_main()
diff --git a/tensorflow_datasets/url_checksums/math_dataset.txt b/tensorflow_datasets/url_checksums/math_dataset.txt
@@ -0,0 +1 @@
+https://storage.googleapis.com/mathematics-dataset/mathematics_dataset-v1.0.tar.gz 2333082954 def638343403cb9ed60437d6b684c859dd23b72779f5cc5661b0a31e67c58576

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+202 divided by -50133602`
	`2`	`+-101/25066801`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+Divide 1380457090 by 39.`
	`2`	`+1380457090/39`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+https://storage.googleapis.com/mathematics-dataset/mathematics_dataset-v1.0.tar.gz 2333082954 def638343403cb9ed60437d6b684c859dd23b72779f5cc5661b0a31e67c58576`