Skip to content

Commit 828684e

Browse files
TensorFlow Datasets Team, copybara-github
authored and committed
Add mathematical dataset
PiperOrigin-RevId: 280337000
1 parent ac745a3 commit 828684e

File tree

8 files changed

+355
-0
lines changed

8 files changed

+355
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
-1022460818 divided by 2676599
2+
-382
3+
What is -1 divided by -13692346004?
4+
1/13692346004
5+
Divide 1136975704 by -142121963.
6+
-8
7+
Divide 37464710 by -1651.
8+
-37464710/1651
9+
56796887 divided by 7
10+
8113841
11+
Calculate 1691 divided by -4109399.
12+
-1691/4109399
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
202 divided by -50133602
2+
-101/25066801
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
202 divided by -50133602
2+
-101/25066801
3+
Calculate -90176 divided by -1017273.
4+
90176/1017273
5+
Calculate -717706881 divided by 3.
6+
-239235627
7+
Divide 1380457090 by 39.
8+
1380457090/39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Divide 1380457090 by 39.
2+
1380457090/39

tensorflow_datasets/text/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from tensorflow_datasets.text.imdb import IMDBReviewsConfig
2424
from tensorflow_datasets.text.lm1b import Lm1b
2525
from tensorflow_datasets.text.lm1b import Lm1bConfig
26+
from tensorflow_datasets.text.math_dataset import MathDataset
2627
from tensorflow_datasets.text.multi_nli import MultiNLI
2728
from tensorflow_datasets.text.multi_nli_mismatch import MultiNLIMismatch
2829
from tensorflow_datasets.text.snli import Snli
Lines changed: 293 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,293 @@
1+
# coding=utf-8
2+
# Copyright 2019 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Mathematics database."""
17+
18+
from __future__ import absolute_import
19+
from __future__ import division
20+
from __future__ import print_function
21+
22+
import os
23+
from absl import logging
24+
import tensorflow as tf
25+
import tensorflow_datasets.public_api as tfds
26+
27+
# BibTeX entry for the paper that introduced the dataset; passed to
# `tfds.core.DatasetInfo(citation=...)` by the builder in this module.
_CITATION = """
@article{2019arXiv,
author = {Saxton, Grefenstette, Hill, Kohli},
title = {Analysing Mathematical Reasoning Abilities of Neural Models},
year = {2019},
journal = {arXiv:1904.01557}
}
"""
35+
36+
# Human-readable description of the dataset. It is used both as the
# `DatasetInfo` description and as the description of every builder config
# created in this module.
_DESCRIPTION = """
Mathematics database.

This dataset code generates mathematical question and answer pairs,
from a range of question types at roughly school-level difficulty.
This is designed to test the mathematical learning and algebraic
reasoning skills of learning models.

Original paper: Analysing Mathematical Reasoning Abilities of Neural Models
(Saxton, Grefenstette, Hill, Kohli).

Example usage:
train_examples, val_examples = tfds.load(
'math_dataset/arithmetic__mul',
split=['train', 'test'],
as_supervised=True)
"""
53+
54+
# Archive holding the pre-generated dataset (v1.0); handed to
# `dl_manager.download_and_extract` when the splits are built.
_DATA_URL = "https://storage.googleapis.com/mathematics-dataset/mathematics_dataset-v1.0.tar.gz"

# Sub-directories of the extracted archive whose files are concatenated to
# form the TRAIN split.
_TRAIN_CATEGORY = [
    "train-easy",
    "train-medium",
    "train-hard",
]

# Sub-directories of the extracted archive whose files form the TEST split.
_INTERPOLATE_CATEGORY = [
    "interpolate",
]
65+
66+
# Names of the question modules; each name becomes one builder config and is
# also the stem of the `<name>.txt` data file read for that config.
#
# The list is grouped by the archive category the module appears in, so a name
# can occur more than once (e.g. every train module is also an interpolate
# module). Consumers are expected to deduplicate, as the config generator in
# this module does with `set(_MODULES)`.
_MODULES = [
    # extrapolate
    "algebra__polynomial_roots_big",
    "arithmetic__add_or_sub_big",
    "arithmetic__add_sub_multiple_longer",
    "arithmetic__div_big",
    "arithmetic__mixed_longer",
    "arithmetic__mul_big",
    "arithmetic__mul_div_multiple_longer",
    "comparison__closest_more",
    "comparison__kth_biggest_more",
    "comparison__sort_more",
    "measurement__conversion",
    "numbers__place_value_big",
    "numbers__round_number_big",
    "probability__swr_p_level_set_more_samples",
    "probability__swr_p_sequence_more_samples",

    # interpolate
    "algebra__linear_1d",
    "algebra__linear_1d_composed",
    "algebra__linear_2d",
    "algebra__linear_2d_composed",
    "algebra__polynomial_roots",
    "algebra__polynomial_roots_composed",
    "algebra__sequence_next_term",
    "algebra__sequence_nth_term",
    "arithmetic__add_or_sub",
    "arithmetic__add_or_sub_in_base",
    "arithmetic__add_sub_multiple",
    "arithmetic__div",
    "arithmetic__mixed",
    "arithmetic__mul",
    "arithmetic__mul_div_multiple",
    "arithmetic__nearest_integer_root",
    "arithmetic__simplify_surd",
    "calculus__differentiate",
    "calculus__differentiate_composed",
    "comparison__closest",
    "comparison__closest_composed",
    "comparison__kth_biggest",
    "comparison__kth_biggest_composed",
    "comparison__pair",
    "comparison__pair_composed",
    "comparison__sort",
    "comparison__sort_composed",
    "measurement__conversion",
    "measurement__time",
    "numbers__base_conversion",
    "numbers__div_remainder",
    "numbers__div_remainder_composed",
    "numbers__gcd",
    "numbers__gcd_composed",
    "numbers__is_factor",
    "numbers__is_factor_composed",
    "numbers__is_prime",
    "numbers__is_prime_composed",
    "numbers__lcm",
    "numbers__lcm_composed",
    "numbers__list_prime_factors",
    "numbers__list_prime_factors_composed",
    "numbers__place_value",
    "numbers__place_value_composed",
    "numbers__round_number",
    "numbers__round_number_composed",
    "polynomials__add",
    "polynomials__coefficient_named",
    "polynomials__collect",
    "polynomials__compose",
    "polynomials__evaluate",
    "polynomials__evaluate_composed",
    "polynomials__expand",
    "polynomials__simplify_power",
    "probability__swr_p_level_set",
    "probability__swr_p_sequence",

    # train-easy train-medium train-hard
    "algebra__linear_1d",
    "algebra__linear_1d_composed",
    "algebra__linear_2d",
    "algebra__linear_2d_composed",
    "algebra__polynomial_roots",
    "algebra__polynomial_roots_composed",
    "algebra__sequence_next_term",
    "algebra__sequence_nth_term",
    "arithmetic__add_or_sub",
    "arithmetic__add_or_sub_in_base",
    "arithmetic__add_sub_multiple",
    "arithmetic__div",
    "arithmetic__mixed",
    "arithmetic__mul",
    "arithmetic__mul_div_multiple",
    "arithmetic__nearest_integer_root",
    "arithmetic__simplify_surd",
    "calculus__differentiate",
    "calculus__differentiate_composed",
    "comparison__closest",
    "comparison__closest_composed",
    "comparison__kth_biggest",
    "comparison__kth_biggest_composed",
    "comparison__pair",
    "comparison__pair_composed",
    "comparison__sort",
    "comparison__sort_composed",
    "measurement__conversion",
    "measurement__time",
    "numbers__base_conversion",
    "numbers__div_remainder",
    "numbers__div_remainder_composed",
    "numbers__gcd",
    "numbers__gcd_composed",
    "numbers__is_factor",
    "numbers__is_factor_composed",
    "numbers__is_prime",
    "numbers__is_prime_composed",
    "numbers__lcm",
    "numbers__lcm_composed",
    "numbers__list_prime_factors",
    "numbers__list_prime_factors_composed",
    "numbers__place_value",
    "numbers__place_value_composed",
    "numbers__round_number",
    "numbers__round_number_composed",
    "polynomials__add",
    "polynomials__coefficient_named",
    "polynomials__collect",
    "polynomials__compose",
    "polynomials__evaluate",
    "polynomials__evaluate_composed",
    "polynomials__expand",
    "polynomials__simplify_power",
    "probability__swr_p_level_set",
    "probability__swr_p_sequence",
]
200+
201+
# Feature keys of each example; also used as the supervised (input, target)
# pair in the dataset info.
_QUESTION = "question"
_ANSWER = "answer"

# Name of the top-level directory inside the extracted archive.
_DATASET_VERSION = "mathematics_dataset-v1.0"
205+
206+
207+
def _generate_builder_configs():
  """Generate configs with different subsets of mathematics dataset.

  Returns:
    A list holding one `tfds.core.BuilderConfig` per distinct module name in
    `_MODULES`, ordered alphabetically by name.
  """
  # `_MODULES` repeats some names, hence the `set`. The set is sorted because
  # iteration order of a set of strings can differ between interpreter runs
  # (string hashing is randomized), which would make the order of the
  # resulting configs -- and anything derived from "the first config" --
  # unstable from one process to the next.
  return [
      tfds.core.BuilderConfig(
          name=module,
          version=tfds.core.Version("1.0.0"),
          description=_DESCRIPTION,
      )
      for module in sorted(set(_MODULES))
  ]
219+
220+
221+
class MathDataset(tfds.core.GeneratorBasedBuilder):
  """Math Dataset.

  Builder for the pre-generated mathematics question/answer corpus. Each
  builder config selects one module file (`<config name>.txt`); the TRAIN
  split concatenates that file across the train categories and the TEST split
  reads it from the interpolate category.
  """

  BUILDER_CONFIGS = _generate_builder_configs()

  def _info(self):
    """Returns the `DatasetInfo`: two text features, question and answer."""
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            _QUESTION: tfds.features.Text(),
            _ANSWER: tfds.features.Text(),
        }),
        supervised_keys=(_QUESTION, _ANSWER),
        homepage="https://github.com/deepmind/mathematics_dataset",
        citation=_CITATION,
    )

  def _read_data_from_all_categories(self, directory, config, categories):
    """Collects the non-empty lines of one module file across categories.

    Args:
      directory: root of the extracted archive.
      config: file name of the module to read (e.g. `arithmetic__mul.txt`).
      categories: category sub-directory names to look in, in order.

    Returns:
      A list of the non-empty lines of every existing
      `<directory>/<_DATASET_VERSION>/<category>/<config>` file, in category
      order. Categories that do not contain the file are skipped silently.
    """
    lines = []
    for category in categories:
      data_file = os.path.join(directory, _DATASET_VERSION, category, config)
      if tf.io.gfile.exists(data_file):
        with tf.io.gfile.GFile(data_file) as f:
          # Single-pass filter of blank lines (e.g. the one produced by a
          # trailing newline); repeatedly calling `list.remove` would rescan
          # the list for every blank line.
          lines.extend(line for line in f.read().split("\n") if line)

    return lines

  def _split_generators(self, dl_manager):
    """Returns SplitGenerators."""

    directory = dl_manager.download_and_extract(_DATA_URL)
    config = self.builder_config.name + ".txt"

    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.TRAIN,
            gen_kwargs={
                "directory": directory,
                "config": config,
                "categories": _TRAIN_CATEGORY,
            }),
        tfds.core.SplitGenerator(
            name=tfds.Split.TEST,
            gen_kwargs={
                "directory": directory,
                "config": config,
                "categories": _INTERPOLATE_CATEGORY,
            }),
    ]

  def _generate_examples(self, directory, config, categories):
    """Yields examples based on directory, module file.

    Args:
      directory: root of the extracted archive.
      config: file name of the module to read.
      categories: category sub-directory names to read the file from.

    Yields:
      `(index, {"question": ..., "answer": ...})` pairs, where `index` is the
      position of the pair among all pairs read.

    Raises:
      AssertionError: if the number of question lines and answer lines differ.
    """

    lines = self._read_data_from_all_categories(directory, config, categories)
    logging.info("%s: %s contains total: %d", categories, config, len(lines))
    # The data files alternate question line / answer line.
    questions = lines[::2]
    answers = lines[1::2]

    assert len(answers) == len(
        questions), "answers: %d do not match questions: %d" % (len(answers),
                                                                len(questions))

    for idx, (q, a) in enumerate(zip(questions, answers)):
      result = {_QUESTION: q, _ANSWER: a}
      # Blank lines are already dropped by the reader; this guard is a
      # defensive check against empty fields.
      if all(result.values()):
        yield idx, result
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# coding=utf-8
2+
# Copyright 2019 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Tests for Mathematical dataset."""
17+
18+
from __future__ import absolute_import
19+
from __future__ import division
20+
from __future__ import print_function
21+
22+
from tensorflow_datasets import testing
23+
from tensorflow_datasets.text import math_dataset
24+
25+
26+
class MathDatasetTest(testing.DatasetBuilderTestCase):
  """Runs the shared dataset-builder test suite against `MathDataset`."""

  DATASET_CLASS = math_dataset.MathDataset
  # Only this config is exercised against the fake data.
  BUILDER_CONFIG_NAMES_TO_TEST = ["arithmetic__div_big"]
  SPLITS = {
      "train": 6,  # Number of fake train example pairs
      "test": 6,  # Number of fake test example pairs
  }
33+
34+
35+
# Run the test suite when this file is executed as a script.
if __name__ == "__main__":
  testing.test_main()
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
https://storage.googleapis.com/mathematics-dataset/mathematics_dataset-v1.0.tar.gz 2333082954 def638343403cb9ed60437d6b684c859dd23b72779f5cc5661b0a31e67c58576

0 commit comments

Comments
 (0)