
Commit 21ccd64

llama : use vectors and avoid has_cache
ggml-ci
1 parent: 9964cd0

File tree

1 file changed: +26 −17 lines changed


llama.cpp

Lines changed: 26 additions & 17 deletions
@@ -2163,11 +2163,9 @@ struct llama_vocab {
     std::unordered_map<token, id> token_to_id;
     std::vector<token_data>       id_to_token;
 
-    bool has_cache = false;
-
-    std::vector<id>               cache_special_tokens;
-    std::unordered_map<id, token> cache_token_to_piece;         // llama_token_to_piece(special = false);
-    std::unordered_map<id, token> cache_token_to_piece_special; // llama_token_to_piece(special = true);
+    std::vector<id>    cache_special_tokens;
+    std::vector<token> cache_token_to_piece;         // llama_token_to_piece(special = false);
+    std::vector<token> cache_token_to_piece_special; // llama_token_to_piece(special = true);
 
     std::map<std::pair<std::string, std::string>, int> bpe_ranks;
 
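The struct change above replaces the two std::unordered_map<id, token> caches with std::vector<token> and drops the has_cache flag. A minimal standalone sketch of the idea, using toy types rather than the actual llama.cpp declarations (token ids are dense in [0, n_vocab), so a vector indexed by id works as a direct-lookup cache):

#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

// Toy types for illustration only (not the llama.cpp definitions).
using id    = int32_t;
using token = std::string;

struct vocab_before { std::unordered_map<id, token> cache_token_to_piece; };
struct vocab_after  { std::vector<token>            cache_token_to_piece; };

// Both lookups are bounds-checked via at(); the vector version avoids hashing
// and stores the pieces contiguously, indexed directly by token id.
static const token & piece_before(const vocab_before & v, id t) { return v.cache_token_to_piece.at(t); }
static const token & piece_after (const vocab_after  & v, id t) { return v.cache_token_to_piece.at(t); }
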
@@ -4852,12 +4850,18 @@ static void llm_load_vocab(
     }
 
     // build token to piece caches
-    for (llama_token id = 0; id < (llama_token) n_vocab; ++id) {
-        vocab.cache_token_to_piece[id]         = llama_token_to_piece(&model, id, false);
-        vocab.cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);
-    }
+    {
+        std::vector<llama_vocab::token> cache_token_to_piece        (n_vocab);
+        std::vector<llama_vocab::token> cache_token_to_piece_special(n_vocab);
 
-    vocab.has_cache = true;
+        for (uint32_t id = 0; id < n_vocab; ++id) {
+            cache_token_to_piece[id]         = llama_token_to_piece(&model, id, false);
+            cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);
+        }
+
+        std::swap(vocab.cache_token_to_piece,         cache_token_to_piece);
+        std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
+    }
 }
 
 static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
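The loading path now fills local vectors sized to n_vocab and swaps them into the vocab once fully built, instead of inserting into the member maps one token at a time. A hedged sketch of that build-then-swap pattern, with a hypothetical token_to_piece_stub standing in for llama_token_to_piece:

#include <cstdint>
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-in for llama_token_to_piece(&model, id, special).
static std::string token_to_piece_stub(uint32_t id, bool special) {
    return (special ? std::string("<special:") : std::string("<piece:")) + std::to_string(id) + ">";
}

struct toy_vocab {
    std::vector<std::string> cache_token_to_piece;
    std::vector<std::string> cache_token_to_piece_special;
};

// Build both caches into local vectors, then swap them into place in one step,
// so the vocab members go from empty to fully populated at once.
static void build_caches(toy_vocab & vocab, uint32_t n_vocab) {
    std::vector<std::string> cache_token_to_piece        (n_vocab);
    std::vector<std::string> cache_token_to_piece_special(n_vocab);

    for (uint32_t id = 0; id < n_vocab; ++id) {
        cache_token_to_piece[id]         = token_to_piece_stub(id, false);
        cache_token_to_piece_special[id] = token_to_piece_stub(id, true);
    }

    std::swap(vocab.cache_token_to_piece,         cache_token_to_piece);
    std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
}
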
@@ -14417,7 +14421,8 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
 
     std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
     candidates_decoded.reserve(candidates->size);
-    std::vector<llama_grammar_candidate>                              candidates_grammar;
+
+    std::vector<llama_grammar_candidate> candidates_grammar;
     candidates_grammar.reserve(candidates->size);
 
     for (size_t i = 0; i < candidates->size; ++i) {
@@ -18305,14 +18310,18 @@ static std::string llama_decode_text(const std::string & text) {
 
 // does not write null-terminator to buf
 int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
-    if (model->vocab.has_cache) {
+    // if we have a cache - use it
+    {
         const auto & cache = special ? model->vocab.cache_token_to_piece_special : model->vocab.cache_token_to_piece;
-        const auto & res = cache.at(token);
-        if (length < (int) res.size()) {
-            return -(int) res.size();
+
+        if (!cache.empty()) {
+            const auto & res = cache.at(token);
+            if (length < (int) res.size()) {
+                return -(int) res.size();
+            }
+            memcpy(buf, res.c_str(), res.size());
+            return res.size();
         }
-        memcpy(buf, res.c_str(), res.size());
-        return res.size();
     }
 
     if (0 <= token && token < llama_n_vocab(model)) {
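With has_cache gone, an empty cache vector is what now signals "no cache built yet", so the fast path in llama_token_to_piece is only taken when the vector is populated. A minimal sketch of that lookup logic under the same assumptions (toy signature, not the real API):

#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

// Hypothetical helper: copy the cached piece for `token` into `buf`, or return 0
// to tell the caller to fall back to decoding the token without the cache.
static int32_t cached_token_to_piece(const std::vector<std::string> & cache,
                                     int32_t token, char * buf, int32_t length) {
    if (!cache.empty()) {                 // an empty vector replaces the old has_cache flag
        const std::string & res = cache.at(token);
        if (length < (int32_t) res.size()) {
            return -(int32_t) res.size(); // buffer too small: report the needed size as negative
        }
        std::memcpy(buf, res.c_str(), res.size());
        return (int32_t) res.size();
    }
    return 0;                             // no cache yet: caller decodes the token the slow way
}
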
