Commit afdb669

Merge branch 'master' into compilade/mamba2
2 parents: 830e554 + bf2a99e

99 files changed: +5182 −4240 lines changed


.github/workflows/build.yml

Lines changed: 2 additions & 2 deletions
@@ -683,7 +683,7 @@ jobs:
     env:
       OPENBLAS_VERSION: 0.3.23
       SDE_VERSION: 9.33.0-2024-01-07
-      VULKAN_VERSION: 1.4.309.0
+      VULKAN_VERSION: 1.4.313.2
 
     strategy:
       matrix:
@@ -736,7 +736,7 @@ jobs:
         id: get_vulkan
        if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
         run: |
-          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
+          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
           & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
           Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
           Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"

.github/workflows/release.yml

Lines changed: 2 additions & 2 deletions
@@ -302,7 +302,7 @@ jobs:
 
     env:
       OPENBLAS_VERSION: 0.3.23
-      VULKAN_VERSION: 1.4.309.0
+      VULKAN_VERSION: 1.4.313.2
 
     strategy:
       matrix:
@@ -332,7 +332,7 @@ jobs:
         id: get_vulkan
        if: ${{ matrix.backend == 'vulkan' }}
         run: |
-          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
+          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
           & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
           Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
           Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"

ci/run.sh

Lines changed: 1 addition & 1 deletion
@@ -779,7 +779,7 @@ function gg_run_rerank_tiny {
     model_f16="${path_models}/ggml-model-f16.gguf"
 
     # for this model, the SEP token is "</s>"
-    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
+    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
 
     # sample output
     # rerank score 0: 0.029

common/arg.cpp

Lines changed: 33 additions & 0 deletions
@@ -2706,6 +2706,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.embd_sep = value;
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(common_arg(
+        {"--cls-separator"}, "STRING",
+        "separator of classification sequences (default \\t) for example \"<#seq#>\"",
+        [](common_params & params, const std::string & value) {
+            params.cls_sep = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
     add_opt(common_arg(
         {"--host"}, "HOST",
         string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
@@ -3210,6 +3217,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.model.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+    add_opt(common_arg(
+        {"-ctkd", "--cache-type-k-draft"}, "TYPE",
+        string_format(
+            "KV cache data type for K for the draft model\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.speculative.cache_type_k)
+        ),
+        [](common_params & params, const std::string & value) {
+            params.speculative.cache_type_k = kv_cache_type_from_str(value);
+        }
+    ).set_env("LLAMA_ARG_CACHE_TYPE_K_DRAFT"));
+    add_opt(common_arg(
+        {"-ctvd", "--cache-type-v-draft"}, "TYPE",
+        string_format(
+            "KV cache data type for V for the draft model\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.speculative.cache_type_v)
+        ),
+        [](common_params & params, const std::string & value) {
+            params.speculative.cache_type_v = kv_cache_type_from_str(value);
+        }
+    ).set_env("LLAMA_ARG_CACHE_TYPE_V_DRAFT"));
 
     add_opt(common_arg(
         {"-mv", "--model-vocoder"}, "FNAME",

common/common.cpp

Lines changed: 9 additions & 0 deletions
@@ -706,11 +706,17 @@ bool fs_validate_filename(const std::string & filename) {
     // disable C++17 deprecation warning for std::codecvt_utf8
 # pragma clang diagnostic push
 # pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(__GNUC__)
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif
+
     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
 
 #if defined(__clang__)
 # pragma clang diagnostic pop
+#elif defined(__GNUC__)
+# pragma GCC diagnostic pop
 #endif
 
     filename_utf32 = converter.from_bytes(filename);
@@ -1284,6 +1290,9 @@ std::vector<llama_token> common_tokenize(
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
     n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    if (n_tokens == std::numeric_limits<int32_t>::min()) {
+        throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
+    }
     if (n_tokens < 0) {
         result.resize(-n_tokens);
         int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
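
Note: common_tokenize already retries with a resized buffer when llama_tokenize returns a negative count; the new check turns the INT32_MIN sentinel into an exception instead of a bogus resize. A minimal sketch of how a caller might handle it (tokenize_or_empty is a hypothetical helper, not part of this commit):

// Sketch only: catching the new overflow error from common_tokenize.
#include "common.h"

#include <cstdio>
#include <stdexcept>
#include <string>
#include <vector>

static std::vector<llama_token> tokenize_or_empty(llama_context * ctx, const std::string & text) {
    try {
        return common_tokenize(ctx, text, /*add_special*/ true, /*parse_special*/ true);
    } catch (const std::runtime_error & err) {
        // thrown when llama_tokenize reports INT32_MIN, i.e. the token count would overflow int32_t
        fprintf(stderr, "tokenization failed: %s\n", err.what());
        return {};
    }
}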

common/common.h

Lines changed: 4 additions & 0 deletions
@@ -199,6 +199,9 @@ struct common_params_speculative {
     float p_split = 0.1f;  // speculative decoding split probability
     float p_min   = 0.75f; // minimum speculative decoding probability (greedy)
 
+    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
+    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
+
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
 
@@ -355,6 +358,7 @@ struct common_params {
     int32_t embd_normalize = 2;  // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = "";   // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
     std::string embd_sep = "\n"; // separator of embeddings
+    std::string cls_sep = "\t";  // separator of classification sequences
 
     // server params
     int32_t port = 8080; // server listens on this network port

common/json-schema-to-grammar.cpp

Lines changed: 3 additions & 46 deletions
@@ -41,49 +41,6 @@ static std::string build_repetition(const std::string & item_rule, int min_items
     return result;
 }
 
-/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
-class string_view {
-    const std::string & _str;
-    const size_t _start;
-    const size_t _end;
-public:
-    string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
-
-    size_t size() const {
-        return _end - _start;
-    }
-
-    size_t length() const {
-        return size();
-    }
-
-    operator std::string() const {
-        return str();
-    }
-
-    std::string str() const {
-        return _str.substr(_start, _end - _start);
-    }
-
-    string_view substr(size_t pos, size_t len = std::string::npos) const {
-        return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
-    }
-
-    char operator[](size_t pos) const {
-        auto index = _start + pos;
-        if (index >= _end) {
-            throw std::out_of_range("string_view index out of range");
-        }
-        return _str[_start + pos];
-    }
-
-    bool operator==(const string_view & other) const {
-        std::string this_str = *this;
-        std::string other_str = other;
-        return this_str == other_str;
-    }
-};
-
 static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
     auto has_min = min_value != std::numeric_limits<int>::min();
     auto has_max = max_value != std::numeric_limits<int>::max();
@@ -112,14 +69,14 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
         }
         out << "}";
     };
-    std::function<void(const string_view &, const string_view &)> uniform_range =
-        [&](const string_view & from, const string_view & to) {
+    std::function<void(const std::string_view &, const std::string_view &)> uniform_range =
+        [&](const std::string_view & from, const std::string_view & to) {
             size_t i = 0;
             while (i < from.length() && i < to.length() && from[i] == to[i]) {
                 i++;
             }
             if (i > 0) {
-                out << "\"" << from.substr(0, i).str() << "\"";
+                out << "\"" << from.substr(0, i) << "\"";
             }
             if (i < from.length() && i < to.length()) {
                 if (i > 0) {
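
Note: the deleted class was a pre-C++17 stand-in. With C++17, std::string_view provides the same operations and streams print it directly, which is why the .str() call disappears. A small self-contained illustration of the shared-prefix logic used by uniform_range:

// Sketch only: std::string_view covers what the removed shim provided,
// including substr() and direct streaming into an ostream.
#include <iostream>
#include <string_view>

int main() {
    std::string_view from = "alpha";
    std::string_view to   = "alto";

    size_t i = 0;
    while (i < from.length() && i < to.length() && from[i] == to[i]) {
        i++; // length of the common prefix, here 2 ("al")
    }
    std::cout << '"' << from.substr(0, i) << '"' << '\n'; // prints "al"
    return 0;
}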

convert_hf_to_gguf.py

Lines changed: 12 additions & 24 deletions
@@ -2145,7 +2145,6 @@ def __init__(self, *args, **kwargs):
 
     def set_vocab(self):
         self._set_vocab_gpt2()
-        self.gguf_writer.add_add_bos_token(True)
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
@@ -2194,7 +2193,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
                 name += ".weight"
             if "multi_modal_projector.linear_1" in name:
                 # despite the name with number postfix, this is a single fully connected layer
-                return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC], data_torch)]
+                return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC] + '.weight', data_torch)]
             return [(self.map_tensor_name(name), data_torch)]
         return []
 
@@ -3918,9 +3917,6 @@ def _xlmroberta_set_vocab(self) -> None:
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
 
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(True)
-
 
 @ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification")
 class DistilBertModel(BertModel):
@@ -3962,8 +3958,6 @@ def set_vocab(self):
         bpe_tok_path = self.dir_model / "tokenizer.json"
         if bpe_tok_path.exists():
             self._set_vocab_gpt2()
-            self.gguf_writer.add_add_bos_token(True)
-            self.gguf_writer.add_add_eos_token(True)
 
             # we need this to validate the size of the token_type embeddings
             # though currently we are passing all zeros to the token_type embeddings
@@ -4950,8 +4944,6 @@ def set_vocab(self):
             self.gguf_writer.add_token_type_count(2)
         else:
             raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(True)
 
 
 @ModelBase.register("OpenELMForCausalLM")
@@ -5553,9 +5545,6 @@ def set_vocab(self):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
 
-        self.gguf_writer.add_add_bos_token(False)
-        self.gguf_writer.add_add_eos_token(True)
-
     def set_gguf_parameters(self):
         if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
             logger.warning("Couldn't find context length in config.json, assuming default value of 512")
@@ -5693,9 +5682,6 @@ def set_vocab(self):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
 
-        self.gguf_writer.add_add_bos_token(False)
-        self.gguf_writer.add_add_eos_token(True)
-
     def set_gguf_parameters(self):
         if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
             logger.warning("Couldn't find context length in config.json, assuming default value of 512")
@@ -6491,8 +6477,8 @@ def parse_args() -> argparse.Namespace:
         help="model is executed on big endian machine",
     )
     parser.add_argument(
-        "model", type=Path,
-        help="directory containing model file",
+        "model", type=str,
+        help="directory containing model file or huggingface repository ID (if --remote)",
         nargs="?",
     )
     parser.add_argument(
@@ -6603,18 +6589,20 @@ def main() -> None:
     else:
         logging.basicConfig(level=logging.INFO)
 
-    dir_model = args.model
-
     if args.remote:
+        hf_repo_id = args.model
         from huggingface_hub import snapshot_download
         local_dir = snapshot_download(
-            repo_id=str(dir_model),
+            repo_id=hf_repo_id,
             allow_patterns=["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"])
         dir_model = Path(local_dir)
         logger.info(f"Downloaded config and tokenizer to {local_dir}")
+    else:
+        hf_repo_id = None
+        dir_model = Path(args.model)
 
     if not dir_model.is_dir():
-        logger.error(f'Error: {args.model} is not a directory')
+        logger.error(f'Error: {dir_model} is not a directory')
         sys.exit(1)
 
     ftype_map: dict[str, gguf.LlamaFileType] = {
@@ -6634,9 +6622,9 @@ def main() -> None:
 
     if args.outfile is not None:
         fname_out = args.outfile
-    elif args.remote:
+    elif hf_repo_id:
         # if remote, use the model ID as the output file name
-        fname_out = Path("./" + str(args.model).replace("/", "-") + "-{ftype}.gguf")
+        fname_out = Path("./" + hf_repo_id.replace("/", "-") + "-{ftype}.gguf")
     else:
         fname_out = dir_model
 
@@ -6665,7 +6653,7 @@ def main() -> None:
             split_max_tensors=args.split_max_tensors,
             split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
             small_first_shard=args.no_tensor_first_split,
-            remote_hf_model_id=str(args.model) if args.remote else None)
+            remote_hf_model_id=hf_repo_id)
 
     if args.vocab_only:
         logger.info("Exporting model vocab...")

docs/build.md

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 # Build llama.cpp locally
 
-The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
+The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](../include/llama.h).
 
 The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server.
 

examples/embedding/embedding.cpp

Lines changed: 30 additions & 4 deletions
@@ -133,10 +133,36 @@ int main(int argc, char ** argv) {
     // max batch size
     const uint64_t n_batch = params.n_batch;
 
+    // get added sep and eos token, if any
+    const std::string added_sep_token = llama_vocab_get_add_sep(vocab) ? llama_vocab_get_text(vocab, llama_vocab_sep(vocab)) : "";
+    const std::string added_eos_token = llama_vocab_get_add_eos(vocab) ? llama_vocab_get_text(vocab, llama_vocab_eos(vocab)) : "";
+
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
     for (const auto & prompt : prompts) {
-        auto inp = common_tokenize(ctx, prompt, true, true);
+        std::vector<llama_token> inp;
+
+        // split classification pairs and insert expected separator tokens
+        if (pooling_type == LLAMA_POOLING_TYPE_RANK && prompt.find(params.cls_sep) != std::string::npos) {
+            std::vector<std::string> pairs = split_lines(prompt, params.cls_sep);
+            std::string final_prompt;
+
+            for (size_t i = 0; i < pairs.size(); i++) {
+                final_prompt += pairs[i];
+                if (i != pairs.size() - 1) {
+                    if (!added_eos_token.empty()) {
+                        final_prompt += added_eos_token;
+                    }
+                    if (!added_sep_token.empty()) {
+                        final_prompt += added_sep_token;
+                    }
+                }
+            }
+
+            inp = common_tokenize(ctx, final_prompt, true, true);
+        } else {
+            inp = common_tokenize(ctx, prompt, true, true);
+        }
         if (inp.size() > n_batch) {
             LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                     __func__, (long long int) inp.size(), (long long int) n_batch);
@@ -145,11 +171,11 @@ int main(int argc, char ** argv) {
         inputs.push_back(inp);
     }
 
-    // check if the last token is SEP
+    // check if the last token is SEP/EOS
     // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
     for (auto & inp : inputs) {
-        if (inp.empty() || inp.back() != llama_vocab_sep(vocab)) {
-            LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
+        if (inp.empty() || (inp.back() != llama_vocab_sep(vocab) && inp.back() != llama_vocab_eos(vocab))) {
+            LOG_WRN("%s: last token in the prompt is not SEP or EOS\n", __func__);
             LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
         }
     }
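
Note: to make the pair-joining above concrete, here is a standalone sketch (not from this commit). A --pooling rank prompt of the form "query\tdocument" is rebuilt with the vocab's EOS and SEP text between the halves before tokenization; the literal "</s>"/"<s>" strings below are placeholders for whatever the loaded vocabulary actually reports.

// Sketch only: standalone imitation of the classification-pair handling above.
#include <cstdio>
#include <string>
#include <vector>

int main() {
    const std::string cls_sep = "\t";   // default of the new --cls-separator option
    const std::string eos     = "</s>"; // placeholder for the vocab's EOS text
    const std::string sep     = "<s>";  // placeholder for the vocab's SEP text

    const std::string prompt = "what is panda?\tit's a bear";

    // split on the classification separator
    std::vector<std::string> pairs;
    size_t start = 0;
    size_t pos;
    while ((pos = prompt.find(cls_sep, start)) != std::string::npos) {
        pairs.push_back(prompt.substr(start, pos - start));
        start = pos + cls_sep.size();
    }
    pairs.push_back(prompt.substr(start));

    // re-join with the tokens the reranker expects between query and document
    std::string final_prompt;
    for (size_t i = 0; i < pairs.size(); i++) {
        final_prompt += pairs[i];
        if (i != pairs.size() - 1) {
            final_prompt += eos + sep;
        }
    }
    printf("%s\n", final_prompt.c_str()); // what is panda?</s><s>it's a bear
    return 0;
}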
