
Commit b4e5549 (1 parent: 00e3e5a)

llama : try loading tensors with pre-computed hashes

Export a function for loading tensor data with a pre-computed hash from the RPC backend, and use it in the model loader when available.

File tree: 5 files changed, +86 -3 lines

    examples/gguf-hash/gguf-hash.cpp
    ggml/include/ggml-rpc.h
    ggml/src/ggml-rpc/ggml-rpc.cpp
    src/llama-model-loader.cpp
    src/llama-model-loader.h

examples/gguf-hash/gguf-hash.cpp

Lines changed: 37 additions & 2 deletions
@@ -55,6 +55,7 @@ typedef enum {
 
 struct hash_params {
     std::string input;
+    bool fnv = false;
     bool xxh64 = false;
     bool sha1 = false;
     bool sha256 = false;
@@ -103,6 +104,7 @@ static void hash_print_usage(const char * executable) {
     printf("\n");
     printf("options:\n");
     printf("  -h, --help              show this help message and exit\n");
+    printf("      --fnv               use FNV-1a hash\n");
     printf("      --xxh64             use xxh64 hash\n");
     printf("      --sha1              use sha1 hash\n");
     printf("      --sha256            use sha256 hash\n");
@@ -131,6 +133,11 @@ static void hash_params_parse_ex(int argc, const char ** argv, hash_params & par
             exit(0);
         }
 
+        if (arg == "--fnv") {
+            arg_found = true;
+            params.fnv = true;
+        }
+
         if (arg == "--xxh64") {
             arg_found = true;
             params.xxh64 = true;
@@ -283,6 +290,18 @@ static void generate_uuidv5(const unsigned char sha1_digest[20], unsigned char u
     uuid[ 8] |= (0x8 << 4);
 }
 
+// Computes FNV-1a hash of the data
+static uint64_t fnv_hash(const uint8_t * data, size_t len) {
+    const uint64_t fnv_prime = 0x100000001b3ULL;
+    uint64_t hash = 0xcbf29ce484222325ULL;
+
+    for (size_t i = 0; i < len; ++i) {
+        hash ^= data[i];
+        hash *= fnv_prime;
+    }
+    return hash;
+}
+
 static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
     const std::string & fname = hash_params.input;
     struct ggml_context * ctx_data = NULL;
@@ -326,7 +345,11 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
         SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)uuidv5_namespace, sizeof(uuidv5_namespace));
     }
 
+    struct gguf_context * ctx_out = gguf_init_empty();
     struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
+
+    gguf_set_kv(ctx_out, ctx);
+
     const int n_tensors = gguf_get_n_tensors(ctx);
     bool tensor_layer_in_manifest = false;
     bool model_in_manifest = false;
@@ -335,10 +358,19 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
     for (int i = 0; i < n_tensors; ++i) {
         const char * name = gguf_get_tensor_name(ctx, i);
         struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
+        gguf_add_tensor(ctx_out, cur);
         auto n_bytes = ggml_nbytes(cur);
         auto *raw_data = cur->data;
         const std::string tensor_layer_name = fname + ":" + name;
 
+        if (hash_params.fnv) {
+            uint64_t hash = fnv_hash((const uint8_t *)raw_data, n_bytes);
+            printf("%016lx %s\n", hash, tensor_layer_name.c_str());
+            char hash_key[128];
+            snprintf(hash_key, sizeof(hash_key), "%s_hash", name);
+            gguf_set_val_u64(ctx_out, hash_key, hash);
+        }
+
         if (hash_params.xxh64) {
 
             if (!hash_params.no_layer) {
@@ -580,6 +612,9 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
         }
     }
 
+    auto fname_out = fname + ".rpc";
+    gguf_write_to_file(ctx_out, fname_out.c_str(), false);
+    gguf_free(ctx_out);
 
     ggml_free(ctx_data);
     gguf_free(ctx);
@@ -663,7 +698,7 @@ int main(int argc, const char ** argv) {
 
     // Autoselect the highest security hash if manifest is provided but
     // the user has not specifically defined the hash they care about
-    if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
+    if (!params.fnv && !params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
         // User has not selected a specific value, pick most secure hash
         if (manifest_check.sha256) {
             params.sha256 = true;
@@ -680,7 +715,7 @@ int main(int argc, const char ** argv) {
     }
 
     // By default if no switch argument provided, assume xxh64
-    if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
+    if (!params.fnv && !params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
         params.xxh64 = true;
     }
 
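
The FNV-1a variant added above is the standard 64-bit one (offset basis 0xcbf29ce484222325, FNV prime 0x100000001b3). A minimal standalone sketch of reproducing the stored digest outside the tool; the sample input string is hypothetical:

    #include <cinttypes>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // same 64-bit FNV-1a as the fnv_hash() added in this commit
    static uint64_t fnv1a64(const uint8_t * data, size_t len) {
        uint64_t hash = 0xcbf29ce484222325ULL;  // offset basis
        for (size_t i = 0; i < len; ++i) {
            hash ^= data[i];                    // xor the byte first ...
            hash *= 0x100000001b3ULL;           // ... then multiply by the FNV prime
        }
        return hash;
    }

    int main() {
        const char * sample = "example tensor bytes";  // stand-in for a tensor's payload
        uint64_t h = fnv1a64((const uint8_t *) sample, strlen(sample));
        printf("%016" PRIx64 "\n", h);  // gguf-hash stores this value under "<tensor name>_hash"
        return 0;
    }

Run with --fnv, the tool prints one such digest per tensor and, per the last hunk above, writes the augmented metadata next to the input as <input>.rpc.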

ggml/include/ggml-rpc.h

Lines changed: 2 additions & 0 deletions
@@ -27,6 +27,8 @@ GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, cons
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
 
 GGML_BACKEND_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
+GGML_BACKEND_API bool ggml_backend_rpc_buffer_load_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor,
+                                                          size_t offset, uint64_t hash);
 
 #ifdef __cplusplus
 }
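
For code that links against the RPC backend directly, the new entry point can be used as below. This is a sketch, not part of the commit: it assumes buf is a buffer created by the RPC backend, tensor lives in it, data points at the raw tensor bytes, and hash is the FNV-1a digest produced by gguf-hash:

    // try the hash-based load first; if the server cannot satisfy it,
    // fall back to shipping the raw bytes over the wire
    static void load_tensor_with_hash(ggml_backend_buffer_t buf, ggml_tensor * tensor,
                                      const void * data, uint64_t hash) {
        if (!ggml_backend_rpc_buffer_load_tensor(buf, tensor, /*offset =*/ 0, hash)) {
            ggml_backend_tensor_set(tensor, data, 0, ggml_nbytes(tensor));
        }
    }

This mirrors the fallback the model loader adopts in src/llama-model-loader.cpp below.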

ggml/src/ggml-rpc/ggml-rpc.cpp

Lines changed: 17 additions & 0 deletions
@@ -597,6 +597,21 @@ static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
     return response.result;
 }
 
+bool ggml_backend_rpc_buffer_load_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, size_t offset, uint64_t hash) {
+    ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
+    rpc_tensor rpc_tensor = serialize_tensor(tensor);
+    // input serialization format: | rpc_tensor | offset (8 bytes) | hash (8 bytes) |
+    size_t input_size = sizeof(rpc_tensor) + sizeof(uint64_t) + sizeof(uint64_t);
+    std::vector<uint8_t> input(input_size, 0);
+    memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
+    memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
+    memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), &hash, sizeof(hash));
+    rpc_msg_set_tensor_hash_rsp response;
+    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR_HASH, input.data(), input.size(), &response, sizeof(response));
+    GGML_ASSERT(status);
+    return response.result;
+}
+
 static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
     rpc_msg_buffer_clear_req request = {ctx->remote_ptr, value};
@@ -1747,6 +1762,8 @@ static ggml_backend_dev_t ggml_backend_rpc_reg_get_device(ggml_backend_reg_t reg
 static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const char * name) {
     if (std::strcmp(name, "ggml_backend_rpc_add_device") == 0) {
         return (void *)ggml_backend_rpc_add_device;
+    } else if (std::strcmp(name, "ggml_backend_rpc_buffer_load_tensor") == 0) {
+        return (void *)ggml_backend_rpc_buffer_load_tensor;
     }
     return NULL;
 }
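
The matching server-side handler for RPC_CMD_SET_TENSOR_HASH is not part of this diff. A sketch of how a receiver could unpack the request given the wire layout above; the handler name and the cache-lookup helper set_tensor_from_cache() are hypothetical:

    // hypothetical server-side unpacking; mirrors the client-side layout:
    // | rpc_tensor | offset (8 bytes) | hash (8 bytes) |
    static bool handle_set_tensor_hash(const std::vector<uint8_t> & input,
                                       rpc_msg_set_tensor_hash_rsp & response) {
        if (input.size() != sizeof(rpc_tensor) + 2 * sizeof(uint64_t)) {
            return false; // malformed request
        }
        rpc_tensor tensor;
        uint64_t offset = 0;
        uint64_t hash   = 0;
        memcpy(&tensor, input.data(), sizeof(tensor));
        memcpy(&offset, input.data() + sizeof(tensor), sizeof(offset));
        memcpy(&hash,   input.data() + sizeof(tensor) + sizeof(offset), sizeof(hash));
        // look the hash up in a local store of tensor payloads and, if found,
        // copy the cached bytes into the target tensor (hypothetical helper)
        response.result = set_tensor_from_cache(tensor, offset, hash);
        return true;
    }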

src/llama-model-loader.cpp

Lines changed: 26 additions & 1 deletion
@@ -376,6 +376,7 @@ namespace GGUFMeta {
 template bool llama_model_loader::get_key<bool>       (enum llm_kv kid, bool & result, bool required);
 template bool llama_model_loader::get_key<float>      (enum llm_kv kid, float & result, bool required);
 template bool llama_model_loader::get_key<uint32_t>   (enum llm_kv kid, uint32_t & result, bool required);
+template bool llama_model_loader::get_key<uint64_t>   (enum llm_kv kid, uint64_t & result, bool required);
 template bool llama_model_loader::get_key<std::string>(enum llm_kv kid, std::string & result, bool required);
 
 template<>
@@ -688,6 +689,10 @@ llama_model_loader::llama_model_loader(
 
     this->use_mmap = use_mmap;
     this->check_tensors = check_tensors;
+    ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+    if (rpc_reg) {
+        ggml_backend_rpc_buffer_load_tensor_fn = (ggml_backend_rpc_buffer_load_tensor_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_buffer_load_tensor");
+    }
 }
 
 std::string llama_model_loader::get_arch_name() const {
@@ -881,6 +886,24 @@ void llama_model_loader::load_data_for(struct ggml_tensor * cur) const {
     }
 }
 
+bool llama_model_loader::rpc_load_tensor(struct ggml_tensor * cur) {
+    if (!ggml_backend_rpc_buffer_load_tensor_fn) {
+        return false;
+    }
+    char hash_key[128];
+    snprintf(hash_key, sizeof(hash_key), "%s_hash", ggml_get_name(cur));
+    uint64_t hash_val = 0;
+    if (!get_key(hash_key, hash_val, false)) {
+        return false;
+    }
+    ggml_backend_buffer_t buf = cur->view_src ? cur->view_src->buffer : cur->buffer;
+    const char * buf_name = ggml_backend_buffer_name(buf);
+    if (strncmp(buf_name, "RPC", 3) != 0) {
+        return false;
+    }
+    return ggml_backend_rpc_buffer_load_tensor_fn(buf, cur, 0, hash_val);
+}
+
 bool llama_model_loader::load_all_data(
     struct ggml_context * ctx,
     llama_buf_map & bufs,
@@ -1022,7 +1045,9 @@
                 mmap_used.first  = std::min(mmap_used.first,  weight->offs);
                 mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
             } else {
-                ggml_backend_tensor_set(cur, data, 0, n_size);
+                if (!rpc_load_tensor(cur)) {
+                    ggml_backend_tensor_set(cur, data, 0, n_size);
+                }
             }
         } else {
             const auto & file = files.at(weight->idx);
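
The KV probe in rpc_load_tensor() is built from the tensor name at runtime and matches the keys written by gguf-hash --fnv. A tiny illustration, using a typical llama.cpp tensor name as a hypothetical example:

    #include <cstdio>

    int main() {
        // key construction as in rpc_load_tensor(): "<tensor name>_hash"
        char hash_key[128];
        snprintf(hash_key, sizeof(hash_key), "%s_hash", "blk.0.attn_q.weight");
        printf("%s\n", hash_key);  // prints: blk.0.attn_q.weight_hash
        return 0;
    }

Because the get_key() lookup is non-required, models without pre-computed hashes simply fall through to the plain ggml_backend_tensor_set() path.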

src/llama-model-loader.h

Lines changed: 4 additions & 0 deletions
@@ -155,6 +155,10 @@ struct llama_model_loader {
     // for backwards compatibility, does not support ggml-backend
     void load_data_for(struct ggml_tensor * cur) const;
 
+    typedef bool (*ggml_backend_rpc_buffer_load_tensor_t)(ggml_backend_buffer_t buffer, ggml_tensor * tensor, size_t offset, uint64_t hash);
+    ggml_backend_rpc_buffer_load_tensor_t ggml_backend_rpc_buffer_load_tensor_fn = nullptr;
+    bool rpc_load_tensor(struct ggml_tensor * cur);
+
     // Returns false if cancelled by progress_callback
     bool load_all_data(
         struct ggml_context * ctx,
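
Routing the call through a function-pointer typedef resolved via ggml_backend_reg_get_proc_address keeps the loader free of a link-time dependency on the RPC backend. A condensed sketch of the pattern as wired up in the constructor change above:

    // resolve the RPC-only entry point at runtime; the pointer stays nullptr
    // when no RPC backend is registered, and rpc_load_tensor() then bails out early
    ggml_backend_rpc_buffer_load_tensor_t fn = nullptr;
    if (ggml_backend_reg_t reg = ggml_backend_reg_by_name("RPC")) {
        fn = (ggml_backend_rpc_buffer_load_tensor_t)
                 ggml_backend_reg_get_proc_address(reg, "ggml_backend_rpc_buffer_load_tensor");
    }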
