Skip to content

llama : add ability to load model from memory buffer #9125

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -2284,7 +2284,7 @@ extern "C" {

GGML_API struct gguf_context * gguf_init_empty(void);
GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
//GGML_API struct gguf_context * gguf_init_from_buffer(..);
// parse a gguf context from `size` bytes of memory starting at `buffer`; returns NULL if `buffer` is NULL
GGML_API struct gguf_context * gguf_init_from_buffer(const char * buffer, size_t size, struct gguf_init_params params);

GGML_API void gguf_free(struct gguf_context * ctx);

Expand Down
131 changes: 78 additions & 53 deletions ggml/src/ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -20801,6 +20801,13 @@ struct gguf_context {
void * data;
};

// Input source for the gguf reader: either an open file stream or a memory buffer.
struct gguf_src {
// when non-NULL, data is read from this stream and `buffer`/`size` are not consulted
FILE * file;
// for reading gguf from a buffer instead of a file
const char * buffer;
// number of bytes available at `buffer`
size_t size;
};

static size_t gguf_type_size(enum gguf_type type) {
GGML_ASSERT(0 <= type && type < GGUF_TYPE_COUNT);
return GGUF_TYPE_SIZE[type];
Expand All @@ -20820,19 +20827,25 @@ static void gguf_tensor_info_sanitize(struct gguf_tensor_info * info) {
GGML_ASSERT(INT64_MAX/info->ne[3] > info->ne[0]*info->ne[1]*info->ne[2]);
}

static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
const size_t n = fread(dst, 1, size, file);
// Read `size` bytes from `src` into `dst` and advance `*offset` by the number of bytes actually read.
//
// src    - data source; when src->file is non-NULL the bytes come from the file stream,
//          otherwise they are copied from src->buffer starting at `*offset`
// dst    - destination, must have room for `size` bytes
// size   - number of bytes requested
// offset - running position from the start of the data; updated in place
//
// returns true only if all `size` bytes were read
static bool gguf_fread_el(struct gguf_src * src, void * dst, size_t size, size_t * offset) {
    size_t n = 0;
    if (src->file) {
        n = fread(dst, 1, size, src->file);
    } else if (*offset < src->size) {
        // `*offset` may have been moved past the end of the buffer by the caller, so only
        // compute `src->size - *offset` when it is known not to wrap around
        const size_t avail = src->size - *offset;
        n = size < avail ? size : avail;
        if (n > 0) {
            memcpy(dst, src->buffer + *offset, n);
        }
    }
    *offset += n;
    return n == size;
}

static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
static bool gguf_fread_str(struct gguf_src * src, struct gguf_str * p, size_t * offset) {
p->n = 0;
p->data = NULL;

bool ok = true;

ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset);
ok = ok && gguf_fread_el(src, &p->n, sizeof(p->n), offset);

// early exit if string length is invalid, prevents from integer overflow
if (p->n == SIZE_MAX) {
Expand All @@ -20842,7 +20855,7 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {

p->data = GGML_CALLOC(p->n + 1, 1);

ok = ok && gguf_fread_el(file, p->data, p->n, offset);
ok = ok && gguf_fread_el(src, p->data, p->n, offset);

return ok;
}
Expand Down Expand Up @@ -20893,26 +20906,20 @@ struct gguf_context * gguf_init_empty(void) {
return ctx;
}

struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
FILE * file = ggml_fopen(fname, "rb");
if (!file) {
fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
return NULL;
}

static struct gguf_context * gguf_init_internal(struct gguf_src * src, struct gguf_init_params params) {
// offset from start of file
size_t offset = 0;

char magic[4];

// check the magic before making allocations
{
gguf_fread_el(file, &magic, sizeof(magic), &offset);
gguf_fread_el(src, &magic, sizeof(magic), &offset);

for (uint32_t i = 0; i < sizeof(magic); i++) {
if (magic[i] != GGUF_MAGIC[i]) {
fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
fclose(file);
if (src->file) fclose(src->file);
return NULL;
}
}
Expand All @@ -20930,13 +20937,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
ctx->infos = NULL;
ctx->data = NULL;

ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset);
ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
ok = ok && gguf_fread_el(src, &ctx->header.version, sizeof(ctx->header.version), &offset);
ok = ok && gguf_fread_el(src, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
ok = ok && gguf_fread_el(src, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);

if (ctx->header.version == 1) {
fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__);
fclose(file);
if (src->file) fclose(src->file);
gguf_free(ctx);
return NULL;
}
Expand All @@ -20949,7 +20956,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

if (!ok) {
fprintf(stderr, "%s: failed to read header\n", __func__);
fclose(file);
if (src->file) fclose(src->file);
gguf_free(ctx);
return NULL;
}
Expand All @@ -20968,28 +20975,28 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

//fprintf(stderr, "%s: reading kv %d\n", __func__, i);

ok = ok && gguf_fread_str(file, &kv->key, &offset);
ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
ok = ok && gguf_fread_str(src, &kv->key, &offset);
ok = ok && gguf_fread_el (src, &kv->type, sizeof(kv->type), &offset);

//fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);

switch (kv->type) {
case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (file, &kv->value.uint8, sizeof(kv->value.uint8), &offset); break;
case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (file, &kv->value.int8, sizeof(kv->value.int8), &offset); break;
case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (file, &kv->value.uint16, sizeof(kv->value.uint16), &offset); break;
case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (file, &kv->value.int16, sizeof(kv->value.int16), &offset); break;
case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break;
case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break;
case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
case GGUF_TYPE_UINT64: ok = ok && gguf_fread_el (file, &kv->value.uint64, sizeof(kv->value.uint64), &offset); break;
case GGUF_TYPE_INT64: ok = ok && gguf_fread_el (file, &kv->value.int64, sizeof(kv->value.int64), &offset); break;
case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break;
case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break;
case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break;
case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (src, &kv->value.uint8, sizeof(kv->value.uint8), &offset); break;
case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (src, &kv->value.int8, sizeof(kv->value.int8), &offset); break;
case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (src, &kv->value.uint16, sizeof(kv->value.uint16), &offset); break;
case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (src, &kv->value.int16, sizeof(kv->value.int16), &offset); break;
case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (src, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break;
case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (src, &kv->value.int32, sizeof(kv->value.int32), &offset); break;
case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (src, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
case GGUF_TYPE_UINT64: ok = ok && gguf_fread_el (src, &kv->value.uint64, sizeof(kv->value.uint64), &offset); break;
case GGUF_TYPE_INT64: ok = ok && gguf_fread_el (src, &kv->value.int64, sizeof(kv->value.int64), &offset); break;
case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (src, &kv->value.float64, sizeof(kv->value.float64), &offset); break;
case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (src, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break;
case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(src, &kv->value.str, &offset); break;
case GGUF_TYPE_ARRAY:
{
ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
ok = ok && gguf_fread_el(src, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
ok = ok && gguf_fread_el(src, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);

switch (kv->value.arr.type) {
case GGUF_TYPE_UINT8:
Expand All @@ -21007,29 +21014,29 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
// prevent from integer overflow in the malloc below
if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) {
fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
fclose(file);
if (src->file) fclose(src->file);
gguf_free(ctx);
return NULL;
}

kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, gguf_type_size(kv->value.arr.type));

ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
ok = ok && gguf_fread_el(src, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
} break;
case GGUF_TYPE_STRING:
{
// prevent from integer overflow in the malloc below
if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) {
fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
fclose(file);
if (src->file) fclose(src->file);
gguf_free(ctx);
return NULL;
}

kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, sizeof(struct gguf_str));

for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
ok = ok && gguf_fread_str(src, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
}
} break;
case GGUF_TYPE_ARRAY:
Expand All @@ -21048,7 +21055,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

if (!ok) {
fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
fclose(file);
if (src->file) fclose(src->file);
gguf_free(ctx);
return NULL;
}
Expand All @@ -21065,17 +21072,17 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
info->ne[j] = 1;
}

ok = ok && gguf_fread_str(file, &info->name, &offset);
ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
ok = ok && gguf_fread_str(src, &info->name, &offset);
ok = ok && gguf_fread_el (src, &info->n_dims, sizeof(info->n_dims), &offset);

ok = ok && (info->n_dims <= GGML_MAX_DIMS);

for (uint32_t j = 0; j < info->n_dims; ++j) {
ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
ok = ok && gguf_fread_el(src, &info->ne[j], sizeof(info->ne[j]), &offset);
}

ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
ok = ok && gguf_fread_el (src, &info->type, sizeof(info->type), &offset);
ok = ok && gguf_fread_el (src, &info->offset, sizeof(info->offset), &offset);

// TODO: return an error instead of crashing with GGML_ASSERT
gguf_tensor_info_sanitize(info);
Expand All @@ -21090,7 +21097,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

if (!ok) {
fprintf(stderr, "%s: failed to read tensor info\n", __func__);
fclose(file);
if (src->file) fclose(src->file);
gguf_free(ctx);
return NULL;
}
Expand All @@ -21110,7 +21117,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

if (offset_pad != 0) {
offset += ctx->alignment - offset_pad;
fseek(file, offset, SEEK_SET);
if (src->file) fseek(src->file, offset, SEEK_SET);
}
}

Expand All @@ -21132,7 +21139,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
if (ggml_blck_size(info->type) == 0 || ne % ggml_blck_size(info->type) != 0) {
fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
__func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
fclose(file);
if (src->file) fclose(src->file);
gguf_free(ctx);
return NULL;
}
Expand Down Expand Up @@ -21164,7 +21171,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
*params.ctx = ggml_init(pdata);
if (*params.ctx == NULL) {
fprintf(stderr, "%s: failed to initialize context\n", __func__);
fclose(file);
if (src->file) fclose(src->file);
gguf_free(ctx);
return NULL;
}
Expand All @@ -21179,11 +21186,11 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
ok = ok && data != NULL;

// read the binary blob with the tensor data
ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset);
ok = ok && gguf_fread_el(src, data->data, ctx->size, &offset);

if (!ok) {
fprintf(stderr, "%s: failed to read tensor data\n", __func__);
fclose(file);
if (src->file) fclose(src->file);
ggml_free(ctx_data);
gguf_free(ctx);
return NULL;
Expand Down Expand Up @@ -21222,7 +21229,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

if (!ok) {
fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
fclose(file);
if (src->file) fclose(src->file);
ggml_free(ctx_data);
gguf_free(ctx);
return NULL;
Expand All @@ -21231,11 +21238,29 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
ggml_set_no_alloc(ctx_data, params.no_alloc);
}

fclose(file);
if (src->file) fclose(src->file);

return ctx;
}

// Open the gguf file at `fname` for binary reading and parse it into a new context.
// Returns NULL (after logging to stderr) when the file cannot be opened; otherwise
// returns whatever gguf_init_internal produces for the opened stream.
struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
    FILE * fp = ggml_fopen(fname, "rb");
    if (fp == NULL) {
        fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
        return NULL;
    }

    // file-backed source: the buffer fields stay unused
    struct gguf_src src;
    src.file   = fp;
    src.buffer = NULL;
    src.size   = 0;

    return gguf_init_internal(&src, params);
}

// Parse a gguf context from `size` bytes of memory starting at `buffer`.
// Returns NULL (after logging to stderr) when `buffer` is NULL or too short to hold the
// 4-byte gguf magic; otherwise returns the result of gguf_init_internal for the buffer.
struct gguf_context * gguf_init_from_buffer(const char * buffer, size_t size, struct gguf_init_params params) {
    if (!buffer) {
        fprintf(stderr, "%s: buffer cannot be null\n", __func__);
        return NULL;
    }
    // gguf_init_internal inspects all 4 magic bytes without checking that they were read,
    // so reject inputs that cannot contain them
    if (size < 4) {
        fprintf(stderr, "%s: buffer is too small (%zu bytes)\n", __func__, size);
        return NULL;
    }
    struct gguf_src src = {NULL, buffer, size};
    return gguf_init_internal(&src, params);
}

void gguf_free(struct gguf_context * ctx) {
if (ctx == NULL) {
return;
Expand Down
13 changes: 12 additions & 1 deletion include/llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,12 @@ extern "C" {
// lora adapter
struct llama_lora_adapter;

// to be used by llama_load_model_from_buffers
struct llama_model_shard_buffer {
// NOTE(review): presumably the in-memory contents of one model shard - confirm against llama_load_model_from_buffers
const char * data;
// number of bytes at `data`
size_t size;
};

// Helpers for getting default parameters
LLAMA_API struct llama_model_params llama_model_default_params(void);
LLAMA_API struct llama_context_params llama_context_default_params(void);
Expand All @@ -433,7 +439,12 @@ extern "C" {

LLAMA_API struct llama_model * llama_load_model_from_file(
const char * path_model,
struct llama_model_params params);
struct llama_model_params params);

// Load a model from `n_shards` in-memory shard buffers passed in `shards`
// NOTE(review): the implementation is not visible here - buffer lifetime requirements and the
// value returned on failure need to be confirmed before documenting them
LLAMA_API struct llama_model * llama_load_model_from_buffers(
struct llama_model_shard_buffer * shards,
size_t n_shards,
struct llama_model_params params);

LLAMA_API void llama_free_model(struct llama_model * model);

Expand Down
Loading
Loading