Skip to content

Commit 399036f

Browse files
adarob and copybara-github
authored and committed
Add Natural Questions dataset.
PiperOrigin-RevId: 291762921
1 parent cf61ed6 commit 399036f

File tree

7 files changed

+283
-0
lines changed

7 files changed

+283
-0
lines changed

tensorflow_datasets/text/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from tensorflow_datasets.text.movie_rationales import MovieRationales
3232
from tensorflow_datasets.text.multi_nli import MultiNLI
3333
from tensorflow_datasets.text.multi_nli_mismatch import MultiNLIMismatch
34+
from tensorflow_datasets.text.natural_questions import NaturalQuestions
3435
from tensorflow_datasets.text.scan import Scan
3536
from tensorflow_datasets.text.scicite import Scicite
3637
from tensorflow_datasets.text.snli import Snli
Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
# coding=utf-8
2+
# Copyright 2020 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Natural Questions: A Benchmark for Question Answering Research."""
17+
18+
from __future__ import absolute_import
19+
from __future__ import division
20+
from __future__ import print_function
21+
22+
import json
23+
import re
24+
25+
import six
26+
import tensorflow.compat.v2 as tf
27+
import tensorflow_datasets.public_api as tfds
28+
29+
if six.PY2:
30+
import HTMLParser as html_parser # pylint:disable=g-import-not-at-top
31+
html_unescape = html_parser.HTMLParser().unescape
32+
else:
33+
import html # pylint:disable=g-import-not-at-top
34+
html_unescape = html.unescape
35+
36+
# BibTeX entry for the Natural Questions paper.
_CITATION = """
@article{47761,
title = {Natural Questions: a Benchmark for Question Answering Research},
author = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee and Kristina N. Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le and Slav Petrov},
year = {2019},
journal = {Transactions of the Association of Computational Linguistics}
}
"""

# Human-readable summary of the corpus.
_DESCRIPTION = """
The NQ corpus contains questions from real users, and it requires QA systems to
read and comprehend an entire Wikipedia article that may or may not contain the
answer to the question. The inclusion of real user questions, and the
requirement that solutions should read an entire page to find the answer, cause
NQ to be a more realistic and challenging task than prior QA datasets.
"""

_URL = 'https://ai.google.com/research/NaturalQuestions/dataset'

_BASE_DOWNLOAD_URL = 'https://storage.googleapis.com/natural_questions/v1.0'

# Shard URLs per split: 50 zero-padded train shards and 5 dev shards.
_DOWNLOAD_URLS = {
    'train': [
        '{}/train/nq-train-{:02d}.jsonl.gz'.format(_BASE_DOWNLOAD_URL, shard)
        for shard in range(50)
    ],
    'validation': [
        '{}/dev/nq-dev-{:02d}.jsonl.gz'.format(_BASE_DOWNLOAD_URL, shard)
        for shard in range(5)
    ],
}
66+
67+
68+
class NaturalQuestions(tfds.core.BeamBasedBuilder):
  """Natural Questions: A Benchmark for Question Answering Research.

  Examples are produced with Apache Beam: every line of the downloaded
  JSON-lines shards is parsed into one example keyed by its `example_id`.
  """

  VERSION = tfds.core.Version('0.0.1')

  def _info(self):
    """Returns the `DatasetInfo` holding the feature spec of the dataset."""
    return tfds.core.DatasetInfo(
        builder=self,
        description=_DESCRIPTION,
        # TODO(adarob): Pull in NQ token features if needed.
        features=tfds.features.FeaturesDict({
            'id': tf.string,
            'document': {
                'title': tfds.features.Text(),
                'url': tfds.features.Text(),
                'html': tfds.features.Text(),
            },
            'question': {
                'text': tfds.features.Text(),
            },
            'annotations': tfds.features.Sequence({
                'id': tf.string,
                'long_answer': {
                    'start_byte': tf.int64,
                    'end_byte': tf.int64,
                },
                'short_answers': tfds.features.Sequence({
                    'start_byte': tf.int64,
                    'end_byte': tf.int64,
                    'text': tfds.features.Text(),
                }),
                'yes_no_answer': tfds.features.ClassLabel(
                    names=['NO', 'YES'])  # Can also be -1 for NONE.
            }),
        }),
        supervised_keys=None,
        homepage=_URL,
        citation=_CITATION,
    )

  def _split_generators(self, dl_manager):
    """Returns SplitGenerators for the train and validation splits.

    Args:
      dl_manager: Download manager used to fetch every URL in
        `_DOWNLOAD_URLS`.

    Returns:
      A list with one `SplitGenerator` per split; each one forwards the
      downloaded paths of its split as the `filepaths` keyword argument.
    """
    files = dl_manager.download(_DOWNLOAD_URLS)

    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.TRAIN,
            gen_kwargs={'filepaths': files['train']},
        ),
        tfds.core.SplitGenerator(
            name=tfds.Split.VALIDATION,
            num_shards=8,
            gen_kwargs={'filepaths': files['validation']},
        ),
    ]

  def _build_pcollection(self, pipeline, filepaths):
    """Builds the PCollection of `(key, example)` pairs.

    Args:
      pipeline: Beam pipeline object the transforms are attached to.
      filepaths: Paths of the files to read; each text line is expected to be
        one JSON-encoded example.

    Returns:
      A PCollection of `(example_id, example_dict)` tuples.
    """
    beam = tfds.core.lazy_imports.apache_beam

    # The helpers below are defined once per pipeline build instead of once
    # per parsed line; the document bytes are passed in explicitly.

    def _parse_short_answer(html_bytes, short_ans):
      """Extracts the plain text of a single short answer span.

      Args:
        html_bytes: UTF-8 encoded HTML of the document.
        short_ans: JSON dict with `start_byte` / `end_byte` offsets into
          `html_bytes`.

      Returns:
        Dict with the two byte offsets and the extracted answer `text`.
      """
      start_byte = short_ans['start_byte']
      end_byte = short_ans['end_byte']
      # Replace UTF-8 encoded non-breaking spaces with plain spaces.
      ans_bytes = html_bytes[start_byte:end_byte].replace(b'\xc2\xa0', b' ')
      text = ans_bytes.decode('utf-8')
      # Remove HTML markup *before* unescaping entities. Unescaping first
      # turns escaped text such as `&lt;b&gt;` (or a `&gt;` inside an
      # attribute value) into real angle brackets, which the tag regex would
      # then strip or truncate as if it were markup.
      text = html_unescape(re.sub(r'<[^>]*>', '', text))
      # Replace \xa0 characters (e.g. produced by `&nbsp;`) with spaces.
      text = text.replace(u'\xa0', u' ')
      return {
          'start_byte': start_byte,
          'end_byte': end_byte,
          'text': text,
      }

    def _parse_annotation(html_bytes, an_json):
      """Converts one JSON annotation into the feature dict layout."""
      yes_no = an_json['yes_no_answer']
      return {
          # Convert to str since some IDs cannot be represented by tf.int64.
          'id': str(an_json['annotation_id']),
          'long_answer': {
              'start_byte': an_json['long_answer']['start_byte'],
              'end_byte': an_json['long_answer']['end_byte'],
          },
          'short_answers': [
              _parse_short_answer(html_bytes, ans)
              for ans in an_json['short_answers']
          ],
          # 'NONE' has no class name; it is stored as -1.
          'yes_no_answer': -1 if yes_no == 'NONE' else yes_no,
      }

    def _parse_example(line):
      """Parse a single json line and emit an example dict."""
      ex_json = json.loads(line)
      # Answer offsets are byte offsets, so work on the encoded document.
      html_bytes = ex_json['document_html'].encode('utf-8')

      beam.metrics.Metrics.counter('nq', 'examples').inc()
      # Convert to str since some IDs cannot be represented by tf.int64.
      id_ = str(ex_json['example_id'])
      return id_, {
          'id': id_,
          'document': {
              'title': ex_json['document_title'],
              'url': ex_json['document_url'],
              'html': html_bytes,
          },
          'question': {
              'text': ex_json['question_text']
          },
          'annotations': [
              _parse_annotation(html_bytes, an_json)
              for an_json in ex_json['annotations']
          ]
      }

    return (
        pipeline
        | beam.Create(filepaths)
        | beam.io.ReadAllFromText()
        | beam.Map(_parse_example))
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# coding=utf-8
2+
# Copyright 2020 The TensorFlow Datasets Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Tests for natural_questions dataset module."""
17+
18+
from __future__ import absolute_import
19+
from __future__ import division
20+
from __future__ import print_function
21+
22+
from tensorflow_datasets import testing
23+
from tensorflow_datasets.text import natural_questions
24+
25+
26+
class NaturalQuestionsTest(testing.DatasetBuilderTestCase):
  """Dataset-builder test case for the `NaturalQuestions` builder."""

  DATASET_CLASS = natural_questions.NaturalQuestions
  # Per-split counts consumed by the base test case.
  SPLITS = dict(train=3, validation=2)

  # File names standing in for the downloaded shards of each split.
  DL_EXTRACT_RESULT = dict(
      train=["nq-train-00.jsonl.gz", "nq-train-01.jsonl.gz"],
      validation=["nq-dev-00.jsonl.gz"],
  )


if __name__ == "__main__":
  testing.test_main()
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
https://storage.googleapis.com/natural_questions/v1.0/dev/nq-dev-00.jsonl.gz 219593373 78a7f7899aa7d0bc9a29878cdb90daabbeda21a93e3730d8861f20ec736790b2
2+
https://storage.googleapis.com/natural_questions/v1.0/dev/nq-dev-01.jsonl.gz 200209706 9cebaa5eb69cf4ce067079370456b2939d4154a17da88faf73844d8c418cfb9e
3+
https://storage.googleapis.com/natural_questions/v1.0/dev/nq-dev-02.jsonl.gz 210446574 7b82aa74a35025ed91f514ad21e05c4a66cdec56ac1f6b77767a578156ff3bfc
4+
https://storage.googleapis.com/natural_questions/v1.0/dev/nq-dev-03.jsonl.gz 216859801 c7d45bb464bda3da7788c985b07def313ab5bed69bcc258acbe6f0918050bf6e
5+
https://storage.googleapis.com/natural_questions/v1.0/dev/nq-dev-04.jsonl.gz 220929521 00969275e9fb6a5dcc7e20ec9589c23ac00de61c979c8b957f4180b5b9a3043a
6+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-00.jsonl.gz 858728609 fb63ed2a5af2921898d566a4e8e514ed17bd079735f5a37f9b0c5e83ce087106
7+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-01.jsonl.gz 891498165 bbccdbc261ced6ee6351ede78c8be5af43d1024c72a60070ea658767d4c3023a
8+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-02.jsonl.gz 885374316 923afd3c645b0bd887f7b6a43c03889936226708ec7a66d83e5e5fa9cee98f4e
9+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-03.jsonl.gz 885313666 272b2fcdc37cf23ab4bcdf831a84e3b755da066ad4727cdded57a383a18f45de
10+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-04.jsonl.gz 890873425 8a9eb2dcf818ab7a44c4fa4b73112547e7f250ec85bdf83d2a3f32542fc3e8c2
11+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-05.jsonl.gz 873023109 2566560a3ad89300552385c3aba0cb51f9968083f01f04c494623542619cdaca
12+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-06.jsonl.gz 866509301 8ae5491a1d86fea5025e9ec27fed574fe5886fb36a7b3567ab0dba498603728d
13+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-07.jsonl.gz 838940867 7d1ee955d5a8dee1dc024e7b6a278314c85514f046d40d56ad5f1c2bb1fd794a
14+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-08.jsonl.gz 902610214 233ab07737289b4122d0fd2d2278dd4d7de3ef44d5b7d7e2e5abb79dbae55541
15+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-09.jsonl.gz 883494801 a1e546ee7db94117804c41c5fe80af91c78ee5b10878fc2714adb5322f56bb9b
16+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-10.jsonl.gz 876311133 0d27b7682c4ebc655e18eb9f8dcbb800ae1d5b09ef1183e29faa10168a015724
17+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-11.jsonl.gz 878127326 9b457cc0d4021da388c1322538b2b2140f0b2439c8eb056b5247c39ecb0de198
18+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-12.jsonl.gz 889257016 e3078d51686869be12343e1d02ae656577b290355d540870a370c58baeb89bc6
19+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-13.jsonl.gz 891769129 ff898b89d8423e4b5c9b35996fed80c8e1ddcc5f8a57c9af2a760d408bfa5df4
20+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-14.jsonl.gz 892523839 7f28f63e565bfa3b9013a62000da6e070c2cdd2aa6f9fc5cfb14365a1a98ab0f
21+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-15.jsonl.gz 910660095 64db3145b5021e52611f8aedf49bbd0b5f648fef43acc8b1a4481b3dfe96c248
22+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-16.jsonl.gz 878177689 c12de70e57943288511596b5ebbf5c914a5f99e8fb50d74286274021e8a18fb7
23+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-17.jsonl.gz 872805189 2beb6c9f24c650c60354b6b513634e1a209cba28c6f204df4e9e2efc8b7ca59e
24+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-18.jsonl.gz 875275428 2420b73b47cfbb04bca2b1352371dc893879634956b98446bdbde3090556556c
25+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-19.jsonl.gz 862034169 c514885fc1bff8f4e6291813debbc3a9568b538781eb17e273ac9e88b0b16f80
26+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-20.jsonl.gz 887586358 59cd4abad74a38265d8e506afd29e3ea498e2f39fe0ee70e9b733810286b3959
27+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-21.jsonl.gz 890472815 c8d0b1f4cdf78fd658185e92bf1ece16fd16cdde4d27da5221d1a37688ee935e
28+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-22.jsonl.gz 888396337 6e1ca3851f138e75cc0bab36f5cad83db2e6ae126fac7c6fdc4ce71ad8f410ca
29+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-23.jsonl.gz 900331594 d34bd25d0b7b8af8aa27b6b9fad8b7febdca6f0c4c1f5779dfc9b4ccbbec6ed2
30+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-24.jsonl.gz 871216444 40972a44f50c460bcd8fa90a9a0794a2bc169504dc04dbee2a4896c88536f51d
31+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-25.jsonl.gz 871166814 7028865d9a77d8f0b4b06a1291ff75a488578879ba87e9e679b2d68e8e1accd4
32+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-26.jsonl.gz 903385811 e4fd4bdc5c63fa1d1310c0ab573601ca87b3809ce1346fc912b398a6bed7f205
33+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-27.jsonl.gz 842966594 54b8cccea4799351259c3264d077b8df1f291332c0b93f08e66aa78f83a58d18
34+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-28.jsonl.gz 876393409 a8ee205427dcf3be03759d44de276741f855892d76338ca26a72c76bc07cd3c4
35+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-29.jsonl.gz 872982425 cb3c96df23bbb9097b61ce1a524c3eb375165404da72d9f0a51eff9744d75643
36+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-30.jsonl.gz 899739217 e64447543e83b66b725686af6c753f8b08bb6bc9adbe8db36ab31cba11bfcd5b
37+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-31.jsonl.gz 875703668 7f6195da4b45887d56563924a8741d9db64b4cca32cf50c9d07f8836a761ab09
38+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-32.jsonl.gz 895840703 5c6574f0f8a157d585bef31fb79a53b1e1b37fdf638b475c92adbb83812b64db
39+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-33.jsonl.gz 874713497 4d75fd17b0b6ee3133b405b7a90867b0b0b49a51659a5e1eb8bd1d70d0181473
40+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-34.jsonl.gz 872620262 b70c517e40b7283f10b291f44e6a61a9c9f6dacb9de89ae37e2a7e92a96eec01
41+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-35.jsonl.gz 854439473 c6e3615fb8753dd3ffe0890a99793847c99b364b50136c8e0430007023bd5506
42+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-36.jsonl.gz 866233094 dbf6f9227c3558e5195690ace9ec1ccfc84c705eecdd2557d7ead73b88e264ff
43+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-37.jsonl.gz 894411832 bcbf932a71ef07f0217a2620ec395854c2f200e18829c2f28400e52ad9799aaf
44+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-38.jsonl.gz 879967719 6518d41f6a205a4551358a154e16e795a40d4d0cd164fa6556f367a7652e3a0d
45+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-39.jsonl.gz 887056754 f82ba5c7bd19c853e34b2dfdee9c458ef7e9b55f022aed08c3753ebf93034293
46+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-40.jsonl.gz 873720601 9a6a19e4c408858935bd5456d08e155b9418aa2c1e4fe5ea81d227e57bd6517f
47+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-41.jsonl.gz 880452966 c3d3ba79c0f6bb718fa58e473dbc70b2064c8168fc59e3b8ef8df2dbea6bfa37
48+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-42.jsonl.gz 856217171 1d6921d56ff4143e3c189c95e4ab506b70dc569fa4d91f94f9cf29052d253eb6
49+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-43.jsonl.gz 908184635 595a069528f5988b4808821d1dc81bb8c6dfbd672e69f991bd4004b9e1c02736
50+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-44.jsonl.gz 891701874 9a290d4d9c9c9507aeec304e1340a3a02e969f17021f02c969aa90b30a970a0d
51+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-45.jsonl.gz 870559738 40f16e923391fca5f1a30eeacc39ca6c87fc522b9d7b86b7308683ed39c51d5d
52+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-46.jsonl.gz 883791796 0a5425ac0b9800fb492f0199f358846fd63a10a377a80b7ce784fb715a1d5f90
53+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-47.jsonl.gz 882109720 65c230069c85c8c74d1ff562c62c443e69e1e93869ecbdb0a2c673faaf4a184e
54+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-48.jsonl.gz 882241605 df613f0496b7d5f7a49d837b914d1ea80e15c925bb3cf91720ec5b2a25710245
55+
https://storage.googleapis.com/natural_questions/v1.0/train/nq-train-49.jsonl.gz 863247626 ff023c8380d2e9a8c23a1babb24ab6fe2eb5c174f35d74e025bbe0961ea706ec

0 commit comments

Comments
 (0)