Skip to content

Commit 2b9c9f5

Browse files
CISCMinh141120
authored and committed
llama : improve sep token handling (ggml-org#14272)
1 parent d3c5e6d commit 2b9c9f5

File tree

3 files changed

+64
-74
lines changed

3 files changed

+64
-74
lines changed

common/common.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -359,6 +359,7 @@ struct common_params {
359359
int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
360360
std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
361361
std::string embd_sep = "\n"; // separator of embeddings
362+
std::string cls_sep = "\t"; // separator of classification sequences
362363

363364
// server params
364365
int32_t port = 8080; // server listens on this network port

convert_hf_to_gguf.py

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5000,8 +5000,6 @@ def set_vocab(self):
50005000
self.gguf_writer.add_token_type_count(2)
50015001
else:
50025002
raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
5003-
self.gguf_writer.add_add_bos_token(True)
5004-
self.gguf_writer.add_add_eos_token(True)
50055003

50065004

50075005
@ModelBase.register("OpenELMForCausalLM")

gguf-py/gguf/vocab.py

Lines changed: 63 additions & 72 deletions
Original file line number | Diff line number | Diff line change
@@ -154,6 +154,7 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool:
154154
else:
155155
added_tokens = {}
156156
tokenizer_config = None
157+
tokenizer_config = None
157158
tokenizer_config_file = path / 'tokenizer_config.json'
158159
if tokenizer_config_file.is_file():
159160
with open(tokenizer_config_file, encoding = 'utf-8') as f:
@@ -167,81 +168,71 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool:
167168
tokenizer_config['bos_token'] = special_bos = special_cls
168169
if not special_eos and special_sep and tokenizer_config:
169170
tokenizer_config['eos_token'] = special_eos = special_sep
170-
if post_processor := tokenizer.get('post_processor'):
171-
for processor in post_processor.get('processors', [post_processor]):
172-
if processor.get('type') == 'RobertaProcessing':
173-
self.add_special_token['bos'] = True
174-
self.add_special_token['eos'] = True
175-
self.add_special_token['sep'] = True
176-
if not special_cls and tokenizer_config:
177-
special_cls = processor.get('cls', [special_bos])[0]
178-
tokenizer_config['cls_token'] = special_cls
179-
if not special_sep and tokenizer_config:
180-
special_sep = processor.get('sep', [special_eos])[0]
181-
tokenizer_config['sep_token'] = special_sep
182-
continue
183-
# Crude parsing of TemplateProcessing to determine if BOS/SEP/EOS should be added
184-
# Only works with simple templates, **will** get it wrong on unusual sequences
185-
if processor.get('type') == 'TemplateProcessing':
186-
tmpl_single = processor.get('single', [])
187-
tmpl_pair = processor.get('pair', [])
188-
special_first = None
189-
special_last = None
190-
if len(tmpl_single) > 1:
191-
if special_first := tmpl_single[0].get('SpecialToken', {}).get('id'):
192-
if not tokenizer_config:
193-
special_bos = special_first
194-
self.add_special_token['bos'] = True if special_first in (special_bos, special_cls) else False
195-
if special_first not in (special_bos, special_cls):
196-
logger.warning(f'Unknown leading special token {special_first!r} in TemplateProcessing<single>')
197-
if special_last := tmpl_single[-1].get('SpecialToken', {}).get('id'):
198-
if not tokenizer_config:
199-
special_eos = special_last
200-
elif special_last != special_eos:
201-
if 'eot' not in self.special_token_types:
202-
self.special_token_types = tuple(self.special_token_types) + ('eot', )
203-
tokenizer_config['eot_token'] = special_eos
204-
elif 'eom' not in self.special_token_types:
205-
self.special_token_types = tuple(self.special_token_types) + ('eom', )
206-
tokenizer_config['eom_token'] = special_eos
207-
else:
208-
logger.warning(f'Overriding EOS token {special_eos!r} with {special_last!r} without EOT/EOM fallback!')
209-
tokenizer_config['eos_token'] = special_eos = special_last
210-
self.add_special_token['eos'] = True if special_last == special_eos else False
211-
if special_last != special_eos:
212-
logger.warning(f'Unknown trailing special token {special_last!r} in TemplateProcessing<single>')
213-
if tmpl_pair:
214-
seq_start = 1 if special_first and tmpl_pair[0].get('SpecialToken', {}).get('id') == special_first else 0
215-
seq_stop = -1 if special_last and tmpl_pair[-1].get('SpecialToken', {}).get('id') == special_last else None
216-
if (special_first and seq_start == 0) or (special_last and seq_stop is None):
217-
logger.warning('TemplateProcessing<single> leading/trailing special tokens do not match TemplateProcessing<pair>')
218-
if tmpl_pair := tmpl_pair[slice(seq_start, seq_stop)]:
219-
tmpl_a = tmpl_pair[0].get('Sequence', {}).get('id')
220-
tmpl_b = tmpl_pair[-1].get('Sequence', {}).get('id')
221-
if tmpl_a != 'A' or tmpl_b != 'B':
222-
logger.warning(f'Unknown sequence {tmpl_a}...{tmpl_b} in TemplateProcessing<pair>')
223-
# A [sep] [eos] B
224-
if tmpl_a == 'A' and tmpl_b == 'B' and (tmpl_pair := tmpl_pair[1:-1]):
225-
add_sep = False
226-
if special_entry := tmpl_pair[0].get('SpecialToken', {}).get('id'):
227-
if special_entry in (special_sep, special_eos) and not special_last:
171+
post_processor = tokenizer.get('post_processor', {})
172+
for processor in post_processor.get('processors', [post_processor]):
173+
if processor.get('type') == 'RobertaProcessing':
174+
self.add_special_token['bos'] = True
175+
self.add_special_token['eos'] = True
176+
self.add_special_token['sep'] = True
177+
if not special_cls and tokenizer_config:
178+
special_cls = processor.get('cls', [special_bos])[0]
179+
tokenizer_config['cls_token'] = special_cls
180+
if not special_sep and tokenizer_config:
181+
special_sep = processor.get('sep', [special_eos])[0]
182+
tokenizer_config['sep_token'] = special_sep
183+
continue
184+
# Crude parsing of TemplateProcessing to determine if BOS/SEP/EOS should be added
185+
# Only works with simple templates, **will** get it wrong on unusual sequences
186+
if processor.get('type') == 'TemplateProcessing':
187+
tmpl_single = processor.get('single', [])
188+
tmpl_pair = processor.get('pair', [])
189+
special_first = None
190+
special_last = None
191+
if len(tmpl_single) > 1:
192+
if special_first := tmpl_single[0].get('SpecialToken', {}).get('id'):
193+
if not tokenizer_config:
194+
special_bos = special_first
195+
self.add_special_token['bos'] = True if special_first in (special_bos, special_cls) else False
196+
if special_first not in (special_bos, special_cls):
197+
logger.warning(f'Unknown leading special token {special_first!r} in TemplateProcessing<single>')
198+
if special_last := tmpl_single[-1].get('SpecialToken', {}).get('id'):
199+
if not tokenizer_config:
200+
special_eos = special_last
201+
self.add_special_token['eos'] = True if special_last == special_eos else False
202+
if special_last != special_eos:
203+
logger.warning(f'Unknown trailing special token {special_last!r} in TemplateProcessing<single>')
204+
if tmpl_pair:
205+
seq_start = 1 if tmpl_pair[0].get('SpecialToken', {}).get('id') == special_first else 0
206+
seq_stop = -1 if tmpl_pair[-1].get('SpecialToken', {}).get('id') == special_last else None
207+
if seq_start == 0 or seq_stop is None:
208+
logger.warning('TemplateProcessing<single> leading/trailing special tokens do not match TemplateProcessing<pair>')
209+
if tmpl_pair := tmpl_pair[slice(seq_start, seq_stop)]:
210+
tmpl_a = tmpl_pair[0].get('Sequence', {}).get('id')
211+
tmpl_b = tmpl_pair[-1].get('Sequence', {}).get('id')
212+
if tmpl_a != 'A' or tmpl_b != 'B':
213+
logger.warning(f'Unknown sequence {tmpl_a}...{tmpl_b} in TemplateProcessing<pair>')
214+
# A [sep] [eos] B
215+
if tmpl_a == 'A' and tmpl_b == 'B' and (tmpl_pair := tmpl_pair[1:-1]):
216+
add_sep = False
217+
if special_entry := tmpl_pair[0].get('SpecialToken', {}).get('id'):
218+
if special_entry in (special_sep, special_eos) and not special_last:
219+
add_sep = True
220+
if special_entry not in (special_sep, special_eos):
221+
logger.warning(f'Unknown separator token {special_entry!r} in TemplateProcessing<pair>')
222+
else:
223+
logger.warning(f'Unknown middle sequence {tmpl_pair[0]!r} in TemplateProcessing<pair>')
224+
if len(tmpl_pair) == 2:
225+
if special_entry := tmpl_pair[1].get('SpecialToken', {}).get('id'):
226+
if special_entry in (special_sep, special_eos):
228227
add_sep = True
229228
if special_entry not in (special_sep, special_eos):
230-
logger.warning(f'Unknown separator token {special_entry!r} in TemplateProcessing<pair>')
229+
logger.warning(f'Unknown second separator token {special_entry!r} in TemplateProcessing<pair>')
231230
else:
232-
logger.warning(f'Unknown middle sequence {tmpl_pair[0]!r} in TemplateProcessing<pair>')
233-
if len(tmpl_pair) == 2:
234-
if special_entry := tmpl_pair[1].get('SpecialToken', {}).get('id'):
235-
if special_entry in (special_sep, special_eos):
236-
add_sep = True
237-
if special_entry not in (special_sep, special_eos):
238-
logger.warning(f'Unknown second separator token {special_entry!r} in TemplateProcessing<pair>')
239-
else:
240-
logger.warning(f'Unknown second middle sequence {tmpl_pair[1]!r} in TemplateProcessing<pair>')
241-
self.add_special_token['sep'] = add_sep
242-
if add_sep and not special_sep and tokenizer_config:
243-
tokenizer_config['sep_token'] = special_eos
244-
continue
231+
logger.warning(f'Unknown second middle sequence {tmpl_pair[1]!r} in TemplateProcessing<pair>')
232+
self.add_special_token['sep'] = add_sep
233+
if add_sep and not special_sep and tokenizer_config:
234+
tokenizer_config['sep_token'] = special_eos
235+
continue
245236
if not tokenizer_config:
246237
return True
247238
chat_template_alt = None

0 commit comments

Comments
 (0)