Skip to content

Commit 1375a51

Browse files
dfurrer and copybara-github
authored and committed
Adding CFQ dataset to TFDS
PiperOrigin-RevId: 292090525
1 parent fca831c commit 1375a51

File tree

7 files changed

+192
-0
lines changed

7 files changed

+192
-0
lines changed

docs/release_notes.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,7 @@
2020
* Add SCAN dataset introduced [here](https://arxiv.org/pdf/1711.00350.pdf).
2121
* Add DIV2K dataset from the paper
2222
[DIV2K](http://www.vision.ee.ethz.ch/~timofter/publications/Agustsson-CVPRW-2017.pdf)
23+
* Add CFQ (Compositional Freebase Questions) dataset from
24+
[this paper](https://openreview.net/pdf?id=SygcCnNKwr).
25+
26+
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
[
2+
{"questionPatternModEntities": "Who directed and produced M0?",
3+
"sparqlPatternModEntities": "SELECT /director M0 . /producer M0"},
4+
{"questionPatternModEntities": "Who directed and edited M0?",
5+
"sparqlPatternModEntities": "SELECT /director M0 . /editor M0"},
6+
{"questionPatternModEntities": "Who edited and directed M0?",
7+
"sparqlPatternModEntities": "SELECT /editor M0 . /director M0"},
8+
{"questionPatternModEntities": "Who produced and directed M0?",
9+
"sparqlPatternModEntities": "SELECT /producer M0 . /director M0 . "}
10+
]
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
{"trainIdxs": [0, 2, 3],
2+
"testIdxs": [1]}

tensorflow_datasets/text/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
"""Text datasets."""
1717

1818
from tensorflow_datasets.text.c4 import C4
19+
from tensorflow_datasets.text.cfq import CFQ
1920
from tensorflow_datasets.text.civil_comments import CivilComments
2021
from tensorflow_datasets.text.cos_e import CosE
2122
from tensorflow_datasets.text.definite_pronoun_resolution import DefinitePronounResolution

tensorflow_datasets/text/cfq.py

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
# coding=utf-8
2+
# Copyright 2020 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""CFQ (Compositional Freebase Questions) dataset."""
17+
18+
from __future__ import absolute_import
19+
from __future__ import division
20+
from __future__ import print_function
21+
22+
import json
23+
import os
24+
from absl import logging
25+
import tensorflow as tf
26+
import tensorflow_datasets.public_api as tfds
27+
28+
# BibTeX entry for the paper that introduced CFQ (Keysers et al., ICLR 2020).
# "and others" is the BibTeX spelling of "et al.".
_CITATION = """
@inproceedings{Keysers2020,
  title={Measuring Compositional Generalization: A Comprehensive Method on
         Realistic Data},
  author={Daniel Keysers and others},
  booktitle={ICLR},
  year={2020},
  url={https://arxiv.org/abs/1912.09713},
}
"""

# Description attached to the DatasetInfo and to every builder config.
_DESCRIPTION = """
The CFQ dataset (and its splits) for measuring compositional generalization.

See https://arxiv.org/abs/1912.09713 for background.

Example usage:
data = tfds.load('cfq/mcd1')
"""

# Archive that is downloaded and extracted to obtain the dataset files.
_DATA_URL = 'https://storage.googleapis.com/cfq_dataset/cfq.tar.gz'
49+
50+
51+
class CFQConfig(tfds.core.BuilderConfig):
  """Builder configuration describing one CFQ split."""

  @tfds.core.disallow_positional_args
  def __init__(self, name, directory='splits', **kwargs):
    """Creates the configuration for a single named split.

    Args:
      name: Unique name of the split; also the base name of its JSON file.
      directory: Subdirectory in which the split's JSON file is located.
      **kwargs: Additional keyword arguments passed on to the parent class.
    """
    # Every config is currently published as version 1.0.0.
    super(CFQConfig, self).__init__(
        name=name,
        version=tfds.core.Version('1.0.0'),
        description=_DESCRIPTION,
        **kwargs)
    # Path of the split index file, e.g. 'splits/mcd1.json'.
    split_filename = name + '.json'
    self.split_file = os.path.join(directory, split_filename)
70+
71+
72+
# Feature keys shared by the dataset info and the generated examples.
_QUESTION = 'question'
_QUERY = 'query'


class CFQ(tfds.core.GeneratorBasedBuilder):
  """CFQ task / splits."""

  # One builder config per split definition.
  BUILDER_CONFIGS = [
      CFQConfig(name='mcd1'),
      CFQConfig(name='mcd2'),
      CFQConfig(name='mcd3'),
      CFQConfig(name='question_complexity_split'),
      CFQConfig(name='question_pattern_split'),
      CFQConfig(name='query_complexity_split'),
      CFQConfig(name='query_pattern_split'),
      CFQConfig(name='random_split'),
  ]

  def _info(self):
    """Returns the DatasetInfo.

    Each example has two text features, the question and the query, and the
    pair (question, query) is declared as the supervised keys.
    """
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        features=tfds.features.FeaturesDict({
            _QUESTION: tfds.features.Text(),
            _QUERY: tfds.features.Text(),
        }),
        supervised_keys=(_QUESTION, _QUERY),
        homepage='https://github.com/google-research/google-research/tree/master/cfq',
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager):
    """Returns SplitGenerators.

    Downloads and extracts the archive at `_DATA_URL`, then defines a train
    and a test split. Both read the split file of the active builder config
    and differ only in the index list they select from it.

    Args:
      dl_manager: Download manager used to fetch and extract the archive.

    Returns:
      A list with the train and the test `SplitGenerator`.
    """
    data_dir = dl_manager.download_and_extract(_DATA_URL)
    data_dir = os.path.join(data_dir, 'cfq')
    split_file = self.builder_config.split_file
    # (TFDS split name, key of the index list inside the split file).
    split_keys = [
        (tfds.Split.TRAIN, 'trainIdxs'),
        (tfds.Split.TEST, 'testIdxs'),
    ]
    return [
        tfds.core.SplitGenerator(  # pylint: disable=g-complex-comprehension
            name=split_name,
            gen_kwargs={
                'base_directory': data_dir,
                'splits_file': split_file,
                'split_id': split_id,
            })
        for split_name, split_id in split_keys
    ]

  def _generate_examples(self, base_directory, splits_file, split_id):
    """Yields examples.

    Args:
      base_directory: Directory that contains `dataset.json`.
      splits_file: Path of the split index JSON file, relative to
        `base_directory`.
      split_id: Key in the split index JSON whose value is the list of sample
        indices to emit (`'trainIdxs'` or `'testIdxs'`).

    Yields:
      `(idx, example)` pairs, where `idx` is the index of the sample in
      `dataset.json` and `example` maps the question and query feature keys
      to the sample's `questionPatternModEntities` and
      `sparqlPatternModEntities` fields.
    """
    samples_path = os.path.join(base_directory, 'dataset.json')
    splits_path = os.path.join(base_directory, splits_file)

    # Both files are parsed completely and closed before the first example is
    # yielded, so no file handle stays open while the generator is suspended.
    # The small split file is read first so that a missing file or key is
    # reported before the large sample file is loaded.
    with tf.io.gfile.GFile(splits_path) as splits_handle:
      indices = json.load(splits_handle)[split_id]

    logging.info('Reading json from %s into memory...', samples_path)
    with tf.io.gfile.GFile(samples_path) as samples_handle:
      samples = json.load(samples_handle)
    logging.info('Loaded json data from %s.', samples_path)

    for idx in indices:
      sample = samples[idx]
      yield idx, {_QUESTION: sample['questionPatternModEntities'],
                  _QUERY: sample['sparqlPatternModEntities']}

tensorflow_datasets/text/cfq_test.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# coding=utf-8
2+
# Copyright 2020 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Tests for CFQ dataset module."""
17+
18+
from __future__ import absolute_import
19+
from __future__ import division
20+
from __future__ import print_function
21+
22+
from tensorflow_datasets import testing
23+
from tensorflow_datasets.text import cfq
24+
25+
26+
class CFQTest(testing.DatasetBuilderTestCase):
  """Dataset builder test case for CFQ, run against the fake example data."""

  DATASET_CLASS = cfq.CFQ
  # Only the 'mcd1' config is exercised.
  BUILDER_CONFIG_NAMES_TO_TEST = ["mcd1"]
  # Expected number of fake examples per split.
  SPLITS = {"train": 3, "test": 1}


if __name__ == "__main__":
  testing.test_main()
37+
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
https://storage.googleapis.com/cfq_dataset/cfq.tar.gz 267599061 979d719271eae12611643b89151f639d94092800e7e71f2d23a754c43f3eb1ba

0 commit comments

Comments
 (0)