
Commit 01d418b

Loading refactor that tries to satisfy everyone
Features:
- Support all three formats (ggml, ggmf, ggjt). (However, I didn't include the hack needed to support GPT4All files without conversion. Those can still be used after converting them with convert.py from my other PR.)
- Support both mmap and read (mmap is used by default, but can be disabled with `--no-mmap`, and is automatically disabled for pre-ggjt files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just relying on the per-tensor type/size fields, which improves flexibility and will make it easier to support the new GPTQ-for-LLaMa models in the future.
- Support VirtualLock on Windows (using the same `--mlock` option as on Unix).
- madvise/PrefetchVirtualMemory support (based on ggml-org#740).
- Quantization now produces a single-file output even with multi-file inputs (not really a feature as much as "it was easier this way").

Issues:
- I switched from ifstream to fopen/fread, both to avoid the need to open the same file again to mmap it, and because I thought fread would be optimized to skip the buffer for large reads. I still need to verify that fread actually is fast.
- VirtualLock does not work at all on the one Windows VM I tested it on (it complains about quota). Todo: figure out why.

Implementation notes: I tried to split the loading code across several functions to make it easier to modify/refactor in the future. Regarding code style: I tried to follow the existing style, but I'm naughty and used a few advanced C++ features repeatedly:
- Destructors, to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and I can remove them if desired, but here they make the loading code much more succinct while still properly handling a variety of errors, ranging from API calls failing to integer overflow and allocation failure.

Co-authored-by: Pavol Rusnak <pavol@rusnak.io> (for the bit I copied from ggml-org#740)
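The mmap machinery itself is not visible in the diffs excerpted below (the llama.cpp loader is among the files this page truncates), so as a reader's aid, here is a minimal, hypothetical sketch of the pattern the message describes: mmap by default, a madvise() prefetch hint standing in for PrefetchVirtualMemory, and a destructor so the mapping is released even when loading throws. The MmapFile name and every detail are assumptions, not this commit's code; a real loader would also need the read() fallback and the Windows mapping path.

// Hypothetical sketch, not this commit's code: an RAII mmap wrapper.
// POSIX-only; assumes the read() fallback and Windows path live elsewhere.
#include <cstddef>
#include <cstdio>
#include <stdexcept>
#include <string>

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

struct MmapFile {
    void * addr = MAP_FAILED;
    size_t size = 0;

    explicit MmapFile(const std::string & path) {
        int fd = open(path.c_str(), O_RDONLY);
        if (fd == -1) {
            throw std::runtime_error("open failed: " + path);
        }
        struct stat st;
        if (fstat(fd, &st) == -1) {
            close(fd);
            throw std::runtime_error("fstat failed: " + path);
        }
        size = (size_t) st.st_size;
        // Read-only shared mapping: pages are backed by the file itself,
        // so multiple processes loading the same model share one copy.
        addr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
        close(fd); // the mapping keeps its own reference to the file
        if (addr == MAP_FAILED) {
            throw std::runtime_error("mmap failed: " + path);
        }
        // Hint the kernel to start faulting pages in ahead of use
        // (the madvise counterpart of PrefetchVirtualMemory on Windows).
        if (madvise(addr, size, MADV_WILLNEED) != 0) {
            fprintf(stderr, "warning: madvise(MADV_WILLNEED) failed\n");
        }
    }

    // Destructor guarantees the mapping is released even if loading throws.
    ~MmapFile() {
        if (addr != MAP_FAILED) {
            munmap(addr, size);
        }
    }

    MmapFile(const MmapFile &) = delete;
    MmapFile & operator=(const MmapFile &) = delete;
};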
1 parent eeaa7b0 commit 01d418b

File tree

10 files changed (+1169, -814 lines)


examples/common.cpp

Lines changed: 6 additions & 3 deletions
@@ -1,7 +1,5 @@
 #include "common.h"
 
-#include "ggml.h"
-
 #include <cassert>
 #include <cstring>
 #include <fstream>

@@ -154,6 +152,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.use_color = true;
         } else if (arg == "--mlock") {
             params.use_mlock = true;
+        } else if (arg == "--no-mmap") {
+            params.use_mmap = false;
         } else if (arg == "--mtest") {
             params.mem_test = true;
         } else if (arg == "--verbose-prompt") {

@@ -233,9 +233,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
     fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
-    if (ggml_mlock_supported()) {
+    if (llama_mlock_supported()) {
         fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
     }
+    if (llama_mmap_supported()) {
+        fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+    }
     fprintf(stderr, " --mtest compute maximum memory usage\n");
     fprintf(stderr, " --verbose-prompt print prompt before generation\n");
     fprintf(stderr, " -m FNAME, --model FNAME\n");

examples/common.h

Lines changed: 1 addition & 0 deletions
@@ -47,6 +47,7 @@ struct gpt_params {
     bool instruct = false; // instruction mode (used for Alpaca models)
     bool ignore_eos = false; // do not stop generating after eos
     bool perplexity = false; // compute perplexity over the prompt
+    bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
     bool mem_test = false; // compute maximum memory usage
     bool verbose_prompt = false; // print prompt tokens before generation

examples/embedding/embedding.cpp

Lines changed: 1 addition & 0 deletions
@@ -38,6 +38,7 @@ int main(int argc, char ** argv) {
     lparams.seed = params.seed;
     lparams.f16_kv = params.memory_f16;
     lparams.logits_all = params.perplexity;
+    lparams.use_mmap = params.use_mmap;
     lparams.use_mlock = params.use_mlock;
     lparams.embedding = params.embedding;

examples/main/main.cpp

Lines changed: 1 addition & 0 deletions
@@ -97,6 +97,7 @@ int main(int argc, char ** argv) {
     lparams.n_parts = params.n_parts;
     lparams.seed = params.seed;
     lparams.f16_kv = params.memory_f16;
+    lparams.use_mmap = params.use_mmap;
     lparams.use_mlock = params.use_mlock;
 
     ctx = llama_init_from_file(params.model.c_str(), lparams);
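The example programs all thread the new flag through the context parameters the same way. For a caller outside these examples, a minimal sketch might look like the following; llama_context_default_params() and llama_free() are assumed from the llama.h of this era (only llama_init_from_file() is visible in the diff above).

// Minimal hypothetical caller; assumes llama_context_default_params()
// and llama_free() from llama.h.
#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model-path>\n", argv[0]);
        return 1;
    }

    llama_context_params lparams = llama_context_default_params();
    lparams.use_mmap  = true;   // the default; false corresponds to --no-mmap
    lparams.use_mlock = false;  // true corresponds to --mlock

    llama_context * ctx = llama_init_from_file(argv[1], lparams);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model: %s\n", argv[1]);
        return 1;
    }

    // ... tokenize and evaluate here ...

    llama_free(ctx);
    return 0;
}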

examples/perplexity/perplexity.cpp

Lines changed: 1 addition & 0 deletions
@@ -115,6 +115,7 @@ int main(int argc, char ** argv) {
     lparams.seed = params.seed;
     lparams.f16_kv = params.memory_f16;
     lparams.logits_all = params.perplexity;
+    lparams.use_mmap = params.use_mmap;
     lparams.use_mlock = params.use_mlock;
     lparams.embedding = params.embedding;

ggml.c

Lines changed: 0 additions & 78 deletions
@@ -97,17 +97,6 @@ typedef void* thread_ret_t;
 #define static_assert(cond, msg) _Static_assert(cond, msg)
 #endif
 
-#define GGML_MLOCK_SUPPORT 0
-
-#ifdef __has_include
-#if __has_include(<sys/mman.h>)
-#undef GGML_MLOCK_SUPPORT
-#define GGML_MLOCK_SUPPORT 1
-#include <sys/mman.h>
-#endif
-#endif
-
-
 /*#define GGML_PERF*/
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16

@@ -2690,21 +2679,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
 
 static_assert(GGML_OP_COUNT == 35, "GGML_OP_COUNT != 35");
 
-//
-// ggml object
-//
-
-struct ggml_object {
-    size_t offs;
-    size_t size;
-
-    struct ggml_object * next;
-
-    char padding[8];
-};
-
-static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
-
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");

@@ -2716,7 +2690,6 @@ struct ggml_context {
     size_t mem_size;
     void * mem_buffer;
     bool mem_buffer_owned;
-    bool mem_buffer_mlocked;
     bool no_alloc;
 
     int n_objects;

@@ -3003,7 +2976,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         /*.mem_size =*/ params.mem_size,
         /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
         /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
-        /*.mem_buffer_mlocked =*/ false,
         /*.no_alloc =*/ params.no_alloc,
         /*.n_objects =*/ 0,
         /*.objects_begin =*/ NULL,

@@ -3036,14 +3008,6 @@ void ggml_free(struct ggml_context * ctx) {
     GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
             __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
 
-#if GGML_MLOCK_SUPPORT
-    if (ctx->mem_buffer_mlocked) {
-        if (munlock(ctx->mem_buffer, ctx->mem_size)) {
-            fprintf(stderr, "%s: failed to munlock buffer: %s\n", __func__, strerror(errno));
-        }
-    }
-#endif
-
     if (ctx->mem_buffer_owned) {
         free(ctx->mem_buffer);
     }

@@ -3072,48 +3036,6 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
     return result;
 }
 
-#ifdef __APPLE__
-#define MLOCK_SUGGESTION \
-    "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
-    "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
-#else
-#define MLOCK_SUGGESTION \
-    "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
-#endif
-
-bool ggml_mlock_supported(void) {
-    return GGML_MLOCK_SUPPORT;
-}
-
-bool ggml_mlock(
-        struct ggml_context * ctx,
-        const void *opt_extra_addr,
-        size_t opt_extra_len,
-        char **err_p) {
-    // TODO: Use SetProcessWorkingSetSize() + VirtualLock() on WIN32
-#if GGML_MLOCK_SUPPORT
-    if (ctx->mem_buffer_mlocked) {
-        return true;
-    }
-    if (mlock(ctx->mem_buffer, ctx->mem_size) ||
-        (opt_extra_len &&
-         mlock(opt_extra_addr, opt_extra_len))) {
-        if ((*err_p = malloc(1024))) {
-            snprintf(*err_p, 1024,
-                     "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION,
-                     ctx->mem_size + opt_extra_len,
-                     strerror(errno));
-        }
-        return false;
-    }
-    ctx->mem_buffer_mlocked = true;
-    return true;
-#else // GGML_MLOCK_SUPPORT
-    *err_p = strdup("can't mlock because it's not supported on this system");
-    return false;
-#endif // GGML_MLOCK_SUPPORT
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 
 struct ggml_tensor * ggml_new_tensor_impl(
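The deleted ggml_mlock() above carried a TODO pointing at the Windows equivalent (SetProcessWorkingSetSize() + VirtualLock()), and the commit message reports VirtualLock failing with a quota complaint. VirtualLock() is limited by the process working set quota, so a plausible fix grows the working set first. The sketch below is an assumption about that approach, not the code this commit actually adds:

// Hypothetical Windows counterpart of the removed ggml_mlock (an assumption,
// not this commit's code): VirtualLock() fails with a quota error unless the
// process working set is first grown with SetProcessWorkingSetSize(), which
// is likely what the "quota" complaint in the commit message is about.
#ifdef _WIN32
#include <windows.h>
#include <cstdio>

static bool win32_mlock(void * addr, size_t len) {
    SIZE_T min_ws = 0, max_ws = 0;
    if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws, &max_ws)) {
        return false;
    }
    // Grow the working set so the locked region fits under the quota.
    if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws + len, max_ws + len)) {
        fprintf(stderr, "SetProcessWorkingSetSize failed (error %lu)\n", GetLastError());
        return false;
    }
    if (!VirtualLock(addr, len)) {
        fprintf(stderr, "VirtualLock failed (error %lu)\n", GetLastError());
        return false;
    }
    return true;
}
#endif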

ggml.h

Lines changed: 13 additions & 7 deletions
@@ -253,6 +253,19 @@ enum ggml_op {
     GGML_OP_COUNT,
 };
 
+
+// ggml object
+struct ggml_object {
+    size_t offs;
+    size_t size;
+
+    struct ggml_object * next;
+
+    char padding[8];
+};
+
+static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
+
 // n-dimensional tensor
 struct ggml_tensor {
     enum ggml_type type;

@@ -344,13 +357,6 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
 
 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
 
-bool ggml_mlock_supported(void);
-bool ggml_mlock(
-        struct ggml_context * ctx,
-        const void *opt_extra_addr,
-        size_t opt_extra_len,
-        char **err_p);
-
 struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum ggml_type type,
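With ggml_object now public, a note on what the fields appear to be for (an assumption from the struct alone, not from ggml's internals): it reads as the header of an arena allocation, with offs/size locating a payload inside the context's mem_buffer and next chaining headers into a list. The ggml_free hunk above prints objects_end->offs + objects_end->size as the memory used, which is what a walk like this computes:

// Hedged illustration: walking a ggml_object-style header list over an
// arena buffer. Field meanings are assumptions, as noted above.
#include <cstddef>

struct ggml_object {
    size_t offs;
    size_t size;
    struct ggml_object * next;
    char padding[8];
};

static size_t arena_used_mem(const struct ggml_object * obj) {
    size_t used = 0;
    for (; obj != NULL; obj = obj->next) {
        // End of the last allocation = offset of its payload plus its size
        // (the same expression ggml_free prints in the diff above).
        used = obj->offs + obj->size;
    }
    return used;
}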
