@@ -230,7 +230,9 @@ def __init__(self,
                oov_buckets=1,
                oov_token="UNK",
                lowercase=False,
-               tokenizer=None):
+               tokenizer=None,
+               strip_vocab=True,
+               decode_token_separator=" "):
     """Constructs a TokenTextEncoder.
 
     To load from a file saved with `TokenTextEncoder.save_to_file`, use
@@ -244,8 +246,14 @@ def __init__(self,
       lowercase: `bool`, whether to make all text and tokens lowercase.
       tokenizer: `Tokenizer`, responsible for converting incoming text into a
         list of tokens.
+      strip_vocab: `bool`, whether to strip whitespace from the beginning and
+        end of elements of `vocab_list`.
+      decode_token_separator: `str`, the string used to separate tokens when
+        decoding.
     """
-    self._vocab_list = [tf.compat.as_text(el).strip() for el in vocab_list]
+    self._vocab_list = [tf.compat.as_text(el) for el in vocab_list]
+    if strip_vocab:
+      self._vocab_list = [el.strip() for el in self._vocab_list]
     self._lowercase = lowercase
     if self._lowercase:
       self._vocab_list = [t.lower() for t in self._vocab_list]
@@ -261,6 +269,8 @@ def __init__(self,
     self._tokenizer = (tokenizer or Tokenizer(reserved_tokens=reserved_tokens))
     self._user_defined_tokenizer = tokenizer
 
+    self._decode_token_separator = decode_token_separator
+
   def encode(self, s):
     s = tf.compat.as_text(s)
     if self.lowercase:
@@ -286,7 +296,7 @@ def decode(self, ids):
         tokens.append(self._vocab_list[int_id])
       else:
         tokens.append(self._oov_token)
-    return " ".join(tokens)
+    return self._decode_token_separator.join(tokens)
 
   @property
   def vocab_size(self):
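A minimal usage sketch of the two new arguments follows. The `tfds.features.text.TokenTextEncoder` import path and the surrounding `encode`/`decode` API are assumptions based on the TensorFlow Datasets version this patch appears to target:

    import tensorflow_datasets as tfds

    # decode_token_separator controls how decoded tokens are re-joined.
    # " " (the previously hard-coded behavior) remains the default.
    encoder = tfds.features.text.TokenTextEncoder(
        vocab_list=["hello", "world"],
        decode_token_separator=" ")
    text = encoder.decode(encoder.encode("hello world"))

    # strip_vocab=False preserves vocab_list entries exactly as given, so
    # whitespace-significant tokens survive, e.g. a character-level vocab
    # where " " is itself a token and decoded characters should be joined
    # with no separator at all.
    char_encoder = tfds.features.text.TokenTextEncoder(
        vocab_list=["h", "e", "l", "o", " "],
        strip_vocab=False,
        decode_token_separator="")

Together the two options let character-level vocabularies round-trip: the previously unconditional `.strip()` turned a `" "` vocab entry into an empty string, and `decode` always inserted spaces between tokens.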