Skip to content

Commit 580a7f5

Browse files
TensorFlow Datasets Team authored and copybara-github committed
Adding SCAN dataset to TFDS
PiperOrigin-RevId: 289736607
1 parent 1e820d0 commit 580a7f5

File tree

7 files changed

+189
-0
lines changed

7 files changed

+189
-0
lines changed

docs/release_notes.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,5 @@
1717
[VTAB benchmark](https://arxiv.org/abs/1910.04867).
1818
* Add e-SNLI dataset from the paper
1919
[e-SNLI](http://papers.nips.cc/paper/8163-e-snli-natural-language-inference-with-natural-language-explanations.pdf).
20+
* Add SCAN dataset introduced
21+
[here](https://arxiv.org/pdf/1711.00350.pdf).
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
IN: walk left OUT: I_TURN_LEFT I_WALK
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
IN: jump left OUT: I_TURN_LEFT I_JUMP
2+
IN: walk twice OUT: I_WALK I_WALK
3+
IN: jump twice OUT: I_JUMP I_JUMP

tensorflow_datasets/text/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from tensorflow_datasets.text.math_dataset import MathDataset
2929
from tensorflow_datasets.text.multi_nli import MultiNLI
3030
from tensorflow_datasets.text.multi_nli_mismatch import MultiNLIMismatch
31+
from tensorflow_datasets.text.scan import Scan
3132
from tensorflow_datasets.text.scicite import Scicite
3233
from tensorflow_datasets.text.snli import Snli
3334
from tensorflow_datasets.text.squad import Squad

tensorflow_datasets/text/scan.py

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
# coding=utf-8
2+
# Copyright 2019 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""SCAN tasks with various different splits."""
17+
18+
from __future__ import absolute_import
19+
from __future__ import division
20+
from __future__ import print_function
21+
22+
import os
23+
import tensorflow as tf
24+
import tensorflow_datasets.public_api as tfds
25+
26+
_CITATION = """
27+
@inproceedings{Lake2018GeneralizationWS,
28+
title={Generalization without Systematicity: On the Compositional Skills of
29+
Sequence-to-Sequence Recurrent Networks},
30+
author={Brenden M. Lake and Marco Baroni},
31+
booktitle={ICML},
32+
year={2018},
33+
url={https://arxiv.org/pdf/1711.00350.pdf},
34+
}
35+
"""
36+
37+
_DESCRIPTION = """SCAN tasks with various splits.
38+
39+
SCAN is a set of simple language-driven navigation tasks for studying
40+
compositional learning and zero-shot generalization.
41+
42+
See https://github.com/brendenlake/SCAN for a description of the splits.
43+
44+
Example usage:
45+
data = tfds.load('scan/length')
46+
"""
47+
48+
# Zip archive of the SCAN GitHub repository's master branch; its contents are
# read from a top-level `SCAN-master/` directory after extraction.
# NOTE(review): this URL follows a moving branch, while the registered
# checksum pins a single archive size/SHA-256 -- an upstream push would
# presumably invalidate it; consider pinning a commit. TODO confirm.
_DATA_URL = 'https://github.com/brendenlake/SCAN/archive/master.zip'
49+
50+
51+
class ScanConfig(tfds.core.BuilderConfig):
  """Configuration describing one SCAN split variant.

  Attributes:
    directory: Name of the sub-directory of the extracted SCAN archive that
      holds the data files of this split.
  """

  @tfds.core.disallow_positional_args
  def __init__(self, name, directory=None, **kwargs):
    """Creates the config for a single SCAN split.

    Args:
      name: Unique name of the split.
      directory: Sub-directory the split is read from. When omitted, it is
        derived from the name as `<name>_split`.
      **kwargs: Extra keyword arguments passed through to the parent
        constructor.
    """
    # Version history:
    # 1.0.0: Initial version.
    super(ScanConfig, self).__init__(
        name=name,
        version=tfds.core.Version('1.0.0'),
        description=_DESCRIPTION,
        **kwargs)
    # Fall back to the `<name>_split` naming scheme unless the caller points
    # at an explicit directory.
    self.directory = (name + '_split') if directory is None else directory
73+
74+
75+
# Feature key for the input side of an example: the text that follows the
# 'IN: ' prefix of a data line, up to the ' OUT: ' separator.
_COMMANDS = 'commands'
76+
# Feature key for the output side of an example: the text that follows the
# ' OUT: ' separator of a data line.
_ACTIONS = 'actions'
77+
78+
79+
class Scan(tfds.core.GeneratorBasedBuilder):
  """Builder for the SCAN tasks and splits of Brenden M. Lake and Marco Baroni.

  Every builder config selects one split variant; each variant provides a
  train and a test file inside its sub-directory of the extracted archive.
  """

  BUILDER_CONFIGS = [
      # Configs without an explicit directory read from `<name>_split`.
      ScanConfig(name='simple'),
      ScanConfig(name='addprim_jump', directory='add_prim_split'),
      ScanConfig(name='addprim_turn_left', directory='add_prim_split'),
      ScanConfig(name='filler_num0', directory='filler_split'),
      ScanConfig(name='filler_num1', directory='filler_split'),
      ScanConfig(name='filler_num2', directory='filler_split'),
      ScanConfig(name='filler_num3', directory='filler_split'),
      ScanConfig(name='length'),
      ScanConfig(name='template_around_right', directory='template_split'),
      ScanConfig(name='template_jump_around_right', directory='template_split'),
      ScanConfig(name='template_opposite_right', directory='template_split'),
      ScanConfig(name='template_right', directory='template_split'),
  ]

  def _info(self):
    """Returns the DatasetInfo: two text features used as a supervised pair."""
    text_features = tfds.features.FeaturesDict({
        _COMMANDS: tfds.features.Text(),
        _ACTIONS: tfds.features.Text(),
    })
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=text_features,
        supervised_keys=(_COMMANDS, _ACTIONS),
        homepage='https://github.com/brendenlake/SCAN',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager):
    """Downloads the SCAN archive and returns the train/test SplitGenerators.

    Args:
      dl_manager: Download manager used to fetch and extract the archive.

    Returns:
      A list with the TRAIN and the TEST SplitGenerator, each carrying the
      path of its data file as the `filepath` generator argument.
    """
    # The file name reported by github.com lacks the .zip extension, which
    # defeats auto-detection, so the extraction method is given explicitly.
    archive = tfds.download.Resource(
        url=_DATA_URL,
        extract_method=tfds.download.ExtractMethod.ZIP)
    extracted_dir = dl_manager.download_and_extract(archive)
    split_dir = os.path.join(extracted_dir, 'SCAN-master',
                             self.builder_config.directory)

    split_name = self.builder_config.name
    train_path = os.path.join(split_dir, 'tasks_train_' + split_name + '.txt')
    test_path = os.path.join(split_dir, 'tasks_test_' + split_name + '.txt')
    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.TRAIN, gen_kwargs={'filepath': train_path}),
        tfds.core.SplitGenerator(
            name=tfds.Split.TEST, gen_kwargs={'filepath': test_path}),
    ]

  def _generate_examples(self, filepath):
    """Yields `(key, example)` pairs parsed from one SCAN task file.

    Lines that do not start with 'IN: ' are ignored.

    Args:
      filepath: Path of the text file to read; an example line has the form
        `IN: <commands> OUT: <actions>`.

    Yields:
      Tuples of the line index within the file and a dict holding the
      commands and the actions of that line.
    """
    prefix_length = len('IN: ')
    with tf.io.gfile.GFile(filepath) as task_file:
      for line_index, raw_line in enumerate(task_file):
        if raw_line.startswith('IN: '):
          # Drop the prefix, then cut the rest at the first ' OUT: ' marker.
          body = raw_line[prefix_length:].strip()
          commands, actions = body.split(' OUT: ', 1)
          yield line_index, {_COMMANDS: commands, _ACTIONS: actions}

tensorflow_datasets/text/scan_test.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# coding=utf-8
2+
# Copyright 2019 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Tests for SCAN dataset module."""
17+
18+
from __future__ import absolute_import
19+
from __future__ import division
20+
from __future__ import print_function
21+
22+
from tensorflow_datasets import testing
23+
from tensorflow_datasets.text import scan
24+
25+
26+
class ScanTest(testing.DatasetBuilderTestCase):
  """Runs the shared dataset builder test case against the SCAN builder."""

  DATASET_CLASS = scan.Scan
  # Only the `simple` builder config is exercised.
  BUILDER_CONFIG_NAMES_TO_TEST = ["simple"]
  # Number of fake examples expected in each split.
  SPLITS = {
      "train": 3,
      "test": 1,
  }
33+
34+
35+
if __name__ == "__main__":
  # Hand control to the TFDS test runner when executed as a script.
  testing.test_main()
37+
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
https://github.com/brendenlake/SCAN/archive/master.zip 18680912 ae9038488f880aedf9a5ed9e9693f22f0a3e33d6a4e3aa4d1e8006d9c038594a

0 commit comments

Comments
 (0)