@@ -53,17 +53,13 @@ class SquadConfig(tfds.core.BuilderConfig):
   """BuilderConfig for SQUAD."""
 
   @api_utils.disallow_positional_args
-  def __init__(self, text_encoder_config=None, **kwargs):
+  def __init__(self, **kwargs):
     """BuilderConfig for SQUAD.
 
     Args:
-      text_encoder_config: `tfds.features.text.TextEncoderConfig`, configuration
-        for the `tfds.features.text.TextEncoder` used for the features feature.
       **kwargs: keyword arguments forwarded to super.
     """
     super(SquadConfig, self).__init__(**kwargs)
-    self.text_encoder_config = (
-        text_encoder_config or tfds.features.text.TextEncoderConfig())
 
 
 class Squad(tfds.core.GeneratorBasedBuilder):
@@ -75,51 +71,29 @@ class Squad(tfds.core.GeneratorBasedBuilder):
   BUILDER_CONFIGS = [
       SquadConfig(
           name="plain_text",
-          version="0.0.1",
+          version="0.1.0",
           description="Plain text",
       ),
-      SquadConfig(
-          name="bytes",
-          version="0.0.1",
-          description=("Uses byte-level text encoding with "
-                       "`tfds.features.text.ByteTextEncoder`"),
-          text_encoder_config=tfds.features.text.TextEncoderConfig(
-              encoder=tfds.features.text.ByteTextEncoder()),
-      ),
-      SquadConfig(
-          name="subwords8k",
-          version="0.0.1",
-          description=("Uses `tfds.features.text.SubwordTextEncoder` with 8k "
-                       "vocab size"),
-          text_encoder_config=tfds.features.text.TextEncoderConfig(
-              encoder_cls=tfds.features.text.SubwordTextEncoder,
-              vocab_size=2**13),
-      ),
-      SquadConfig(
-          name="subwords32k",
-          version="0.0.2",
-          description=("Uses `tfds.features.text.SubwordTextEncoder` with "
-                       "32k vocab size"),
-          text_encoder_config=tfds.features.text.TextEncoderConfig(
-              encoder_cls=tfds.features.text.SubwordTextEncoder,
-              vocab_size=2**15),
-      ),
   ]
 
   def _info(self):
     return tfds.core.DatasetInfo(
         builder=self,
         description=_DESCRIPTION,
         features=tfds.features.FeaturesDict({
+            "id":
+                tf.string,
+            "title":
+                tfds.features.Text(),
             "context":
-                tfds.features.Text(
-                    encoder_config=self.builder_config.text_encoder_config),
+                tfds.features.Text(),
             "question":
-                tfds.features.Text(
-                    encoder_config=self.builder_config.text_encoder_config),
-            "first_answer":
-                tfds.features.Text(
-                    encoder_config=self.builder_config.text_encoder_config),
+                tfds.features.Text(),
+            "answers":
+                tfds.features.Sequence({
+                    "text": tfds.features.Text(),
+                    "answer_start": tf.int32,
+                }),
         }),
         # No default supervised_keys (as we have to pass both question
         # and context as input).
@@ -128,28 +102,13 @@ def _info(self):
         citation=_CITATION,
     )
 
-  def _vocab_text_gen(self, filepath):
-    for ex in self._generate_examples(filepath):
-      # "first_answer" is a substring of "context" so not need to add it here
-      yield " ".join([ex["question"], ex["context"]])
-
   def _split_generators(self, dl_manager):
     urls_to_download = {
         "train": os.path.join(self._URL, self._TRAINING_FILE),
         "dev": os.path.join(self._URL, self._DEV_FILE)
     }
     downloaded_files = dl_manager.download_and_extract(urls_to_download)
 
-    # Generate shared vocabulary
-    # maybe_build_from_corpus uses SubwordTextEncoder if that's configured
-    self.info.features["context"].maybe_build_from_corpus(
-        self._vocab_text_gen(downloaded_files["train"]))
-    encoder = self.info.features["context"].encoder
-    # Use maybe_set_encoder because the encoder may have been restored from
-    # package data.
-    self.info.features["question"].maybe_set_encoder(encoder)
-    self.info.features["first_answer"].maybe_set_encoder(encoder)
-
     return [
         tfds.core.SplitGenerator(
             name=tfds.Split.TRAIN,
@@ -167,10 +126,7 @@ def _generate_examples(self, filepath):
     with tf.io.gfile.GFile(filepath) as f:
       squad = json.load(f)
       for article in squad["data"]:
-        if "title" in article:
-          title = article["title"].strip()
-        else:
-          title = ""
+        title = article.get("title", "").strip()
         for paragraph in article["paragraphs"]:
           context = paragraph["context"].strip()
          for qa in paragraph["qas"]:
@@ -182,17 +138,13 @@ def _generate_examples(self, filepath):
 
             # Features currently used are "context", "question", and "answers".
             # Others are extracted here for the ease of future expansions.
-            example = {
+            yield {
                 "title": title,
                 "context": context,
                 "question": question,
                 "id": id_,
-                "answer_starts": answer_starts,
-                "answers": answers,
-            }
-            yield {
-                "question": example["question"],
-                # TODO(b/121176753): return all the answers.
-                "first_answer": example["answers"][0],
-                "context": example["context"]
+                "answers": {
+                    "answer_start": answer_starts,
+                    "text": answers,
+                },
             }
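
For reference, here is a minimal sketch, not part of this diff, of how the restructured features could be read back once the dataset is built. It assumes the builder above is registered under the dataset name `squad` with the `plain_text` config shown in the change; adjust both if your registration differs.

import tensorflow_datasets as tfds

# Building/loading the dataset runs _split_generators and
# _generate_examples as defined in the builder above.
ds = tfds.load("squad/plain_text", split="train")

for example in ds.take(1):
  # After this change, "question" and "context" are plain string features
  # with no TextEncoder attached.
  print(example["question"].numpy().decode("utf-8"))
  # "answers" is now a Sequence feature: parallel 1-D tensors of answer
  # texts and character start offsets.
  print(example["answers"]["text"].numpy())
  print(example["answers"]["answer_start"].numpy())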