
Commit b4e5549 (1 parent: 00e3e5a)

llama : try loading tensors with pre-computed hashes

Export a function for loading tensor data with a pre-computed hash from the RPC backend, and use it in the model loader when available.

File tree: 5 files changed, +86 -3 lines

    examples/gguf-hash/gguf-hash.cpp
    ggml/include/ggml-rpc.h
    ggml/src/ggml-rpc/ggml-rpc.cpp
    src/llama-model-loader.cpp
    src/llama-model-loader.h

examples/gguf-hash/gguf-hash.cpp

Lines changed: 37 additions & 2 deletions
@@ -55,6 +55,7 @@ typedef enum {
 
 struct hash_params {
     std::string input;
+    bool fnv = false;
     bool xxh64 = false;
     bool sha1 = false;
     bool sha256 = false;
@@ -103,6 +104,7 @@ static void hash_print_usage(const char * executable) {
     printf("\n");
     printf("options:\n");
     printf("  -h, --help              show this help message and exit\n");
+    printf("      --fnv               use FNV-1a hash\n");
     printf("      --xxh64             use xxh64 hash\n");
     printf("      --sha1              use sha1 hash\n");
     printf("      --sha256            use sha256 hash\n");
@@ -131,6 +133,11 @@ static void hash_params_parse_ex(int argc, const char ** argv, hash_params & par
             exit(0);
         }
 
+        if (arg == "--fnv") {
+            arg_found = true;
+            params.fnv = true;
+        }
+
         if (arg == "--xxh64") {
             arg_found = true;
             params.xxh64 = true;
@@ -283,6 +290,18 @@ static void generate_uuidv5(const unsigned char sha1_digest[20], unsigned char u
     uuid[ 8] |= (0x8 << 4);
 }
 
+// Computes FNV-1a hash of the data
+static uint64_t fnv_hash(const uint8_t * data, size_t len) {
+    const uint64_t fnv_prime = 0x100000001b3ULL;
+    uint64_t hash = 0xcbf29ce484222325ULL;
+
+    for (size_t i = 0; i < len; ++i) {
+        hash ^= data[i];
+        hash *= fnv_prime;
+    }
+    return hash;
+}
+
 static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
     const std::string & fname = hash_params.input;
     struct ggml_context * ctx_data = NULL;
@@ -326,7 +345,11 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
         SHA1Update( &sha1_for_uuid_ctx, (unsigned char const *)uuidv5_namespace, sizeof(uuidv5_namespace));
     }
 
+    struct gguf_context * ctx_out = gguf_init_empty();
     struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
+
+    gguf_set_kv(ctx_out, ctx);
+
     const int n_tensors = gguf_get_n_tensors(ctx);
     bool tensor_layer_in_manifest = false;
     bool model_in_manifest = false;
@@ -335,10 +358,19 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
     for (int i = 0; i < n_tensors; ++i) {
         const char * name = gguf_get_tensor_name(ctx, i);
         struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
+        gguf_add_tensor(ctx_out, cur);
         auto n_bytes = ggml_nbytes(cur);
         auto *raw_data = cur->data;
         const std::string tensor_layer_name = fname + ":" + name;
 
+        if (hash_params.fnv) {
+            uint64_t hash = fnv_hash((const uint8_t *)raw_data, n_bytes);
+            printf("%016lx %s\n", hash, tensor_layer_name.c_str());
+            char hash_key[128];
+            snprintf(hash_key, sizeof(hash_key), "%s_hash", name);
+            gguf_set_val_u64(ctx_out, hash_key, hash);
+        }
+
         if (hash_params.xxh64) {
 
             if (!hash_params.no_layer) {
@@ -580,6 +612,9 @@ static hash_exit_code_t gguf_hash(const hash_params & hash_params) {
         }
     }
 
+    auto fname_out = fname + ".rpc";
+    gguf_write_to_file(ctx_out, fname_out.c_str(), false);
+    gguf_free(ctx_out);
 
     ggml_free(ctx_data);
     gguf_free(ctx);
@@ -663,7 +698,7 @@ int main(int argc, const char ** argv) {
 
     // Autoselect the highest security hash if manifest is provided but
     // the user has not specifically defined the hash they care about
-    if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
+    if (!params.fnv && !params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
         // User has not selected a specific value, pick most secure hash
         if (manifest_check.sha256) {
             params.sha256 = true;
@@ -680,7 +715,7 @@ int main(int argc, const char ** argv) {
     }
 
     // By default if no switch argument provided, assume xxh64
-    if (!params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
+    if (!params.fnv && !params.xxh64 && !params.sha1 && !params.uuid && !params.sha256) {
         params.xxh64 = true;
     }
 
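
The FNV-1a variant added above is the standard 64-bit one (offset basis 0xcbf29ce484222325, FNV prime 0x100000001b3). A minimal standalone sketch of reproducing the stored digest outside the tool; the sample input string is hypothetical:

    #include <cinttypes>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // same 64-bit FNV-1a as the fnv_hash() added in this commit
    static uint64_t fnv1a64(const uint8_t * data, size_t len) {
        uint64_t hash = 0xcbf29ce484222325ULL;  // offset basis
        for (size_t i = 0; i < len; ++i) {
            hash ^= data[i];                    // xor the byte first ...
            hash *= 0x100000001b3ULL;           // ... then multiply by the FNV prime
        }
        return hash;
    }

    int main() {
        const char * sample = "example tensor bytes";  // stand-in for a tensor's payload
        uint64_t h = fnv1a64((const uint8_t *) sample, strlen(sample));
        printf("%016" PRIx64 "\n", h);  // gguf-hash stores this value under "<tensor name>_hash"
        return 0;
    }

Run with --fnv, the tool prints one such digest per tensor and, per the last hunk above, writes the augmented metadata next to the input as <input>.rpc.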

ggml/include/ggml-rpc.h

Lines changed: 2 additions & 0 deletions
@@ -27,6 +27,8 @@ GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, cons
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
 
 GGML_BACKEND_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
+GGML_BACKEND_API bool ggml_backend_rpc_buffer_load_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor,
+                                                          size_t offset, uint64_t hash);
 
 #ifdef __cplusplus
 }
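
For code that links against the RPC backend directly, the new entry point can be used as below. This is a sketch, not part of the commit: it assumes buf is a buffer created by the RPC backend, tensor lives in it, data points at the raw tensor bytes, and hash is the FNV-1a digest produced by gguf-hash:

    // try the hash-based load first; if the server cannot satisfy it,
    // fall back to shipping the raw bytes over the wire
    static void load_tensor_with_hash(ggml_backend_buffer_t buf, ggml_tensor * tensor,
                                      const void * data, uint64_t hash) {
        if (!ggml_backend_rpc_buffer_load_tensor(buf, tensor, /*offset =*/ 0, hash)) {
            ggml_backend_tensor_set(tensor, data, 0, ggml_nbytes(tensor));
        }
    }

This mirrors the fallback the model loader adopts in src/llama-model-loader.cpp below.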

ggml/src/ggml-rpc/ggml-rpc.cpp

Lines changed: 17 additions & 0 deletions
@@ -597,6 +597,21 @@ static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
     return response.result;
 }
 
+bool ggml_backend_rpc_buffer_load_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, size_t offset, uint64_t hash) {
+    ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
+    rpc_tensor rpc_tensor = serialize_tensor(tensor);
+    // input serialization format: | rpc_tensor | offset (8 bytes) | hash (8 bytes) |
+    size_t input_size = sizeof(rpc_tensor) + sizeof(uint64_t) + sizeof(uint64_t);
+    std::vector<uint8_t> input(input_size, 0);
+    memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
+    memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
+    memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), &hash, sizeof(hash));
+    rpc_msg_set_tensor_hash_rsp response;
+    bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR_HASH, input.data(), input.size(), &response, sizeof(response));
+    GGML_ASSERT(status);
+    return response.result;
+}
+
 static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
     rpc_msg_buffer_clear_req request = {ctx->remote_ptr, value};
@@ -1747,6 +1762,8 @@ static ggml_backend_dev_t ggml_backend_rpc_reg_get_device(ggml_backend_reg_t reg
 static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const char * name) {
     if (std::strcmp(name, "ggml_backend_rpc_add_device") == 0) {
         return (void *)ggml_backend_rpc_add_device;
+    } else if (std::strcmp(name, "ggml_backend_rpc_buffer_load_tensor") == 0) {
+        return (void *)ggml_backend_rpc_buffer_load_tensor;
     }
     return NULL;
 }
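
The matching server-side handler for RPC_CMD_SET_TENSOR_HASH is not part of this diff. A sketch of how a receiver could unpack the request given the wire layout above; the handler name and the cache-lookup helper set_tensor_from_cache() are hypothetical:

    // hypothetical server-side unpacking; mirrors the client-side layout:
    // | rpc_tensor | offset (8 bytes) | hash (8 bytes) |
    static bool handle_set_tensor_hash(const std::vector<uint8_t> & input,
                                       rpc_msg_set_tensor_hash_rsp & response) {
        if (input.size() != sizeof(rpc_tensor) + 2 * sizeof(uint64_t)) {
            return false; // malformed request
        }
        rpc_tensor tensor;
        uint64_t offset = 0;
        uint64_t hash   = 0;
        memcpy(&tensor, input.data(), sizeof(tensor));
        memcpy(&offset, input.data() + sizeof(tensor), sizeof(offset));
        memcpy(&hash,   input.data() + sizeof(tensor) + sizeof(offset), sizeof(hash));
        // look the hash up in a local store of tensor payloads and, if found,
        // copy the cached bytes into the target tensor (hypothetical helper)
        response.result = set_tensor_from_cache(tensor, offset, hash);
        return true;
    }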

src/llama-model-loader.cpp

Lines changed: 26 additions & 1 deletion
@@ -376,6 +376,7 @@ namespace GGUFMeta {
 template bool llama_model_loader::get_key<bool>       (enum llm_kv kid, bool & result, bool required);
 template bool llama_model_loader::get_key<float>      (enum llm_kv kid, float & result, bool required);
 template bool llama_model_loader::get_key<uint32_t>   (enum llm_kv kid, uint32_t & result, bool required);
+template bool llama_model_loader::get_key<uint64_t>   (enum llm_kv kid, uint64_t & result, bool required);
 template bool llama_model_loader::get_key<std::string>(enum llm_kv kid, std::string & result, bool required);
 
 template<>
@@ -688,6 +689,10 @@ llama_model_loader::llama_model_loader(
 
     this->use_mmap = use_mmap;
     this->check_tensors = check_tensors;
+    ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+    if (rpc_reg) {
+        ggml_backend_rpc_buffer_load_tensor_fn = (ggml_backend_rpc_buffer_load_tensor_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_buffer_load_tensor");
+    }
 }
 
 std::string llama_model_loader::get_arch_name() const {
@@ -881,6 +886,24 @@ void llama_model_loader::load_data_for(struct ggml_tensor * cur) const {
     }
 }
 
+bool llama_model_loader::rpc_load_tensor(struct ggml_tensor * cur) {
+    if (!ggml_backend_rpc_buffer_load_tensor_fn) {
+        return false;
+    }
+    char hash_key[128];
+    snprintf(hash_key, sizeof(hash_key), "%s_hash", ggml_get_name(cur));
+    uint64_t hash_val = 0;
+    if (!get_key(hash_key, hash_val, false)) {
+        return false;
+    }
+    ggml_backend_buffer_t buf = cur->view_src ? cur->view_src->buffer : cur->buffer;
+    const char * buf_name = ggml_backend_buffer_name(buf);
+    if (strncmp(buf_name, "RPC", 3) != 0) {
+        return false;
+    }
+    return ggml_backend_rpc_buffer_load_tensor_fn(buf, cur, 0, hash_val);
+}
+
 bool llama_model_loader::load_all_data(
     struct ggml_context * ctx,
     llama_buf_map & bufs,
@@ -1022,7 +1045,9 @@
                 mmap_used.first  = std::min(mmap_used.first,  weight->offs);
                 mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
             } else {
-                ggml_backend_tensor_set(cur, data, 0, n_size);
+                if (!rpc_load_tensor(cur)) {
+                    ggml_backend_tensor_set(cur, data, 0, n_size);
+                }
             }
         } else {
             const auto & file = files.at(weight->idx);
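
The KV probe in rpc_load_tensor() is built from the tensor name at runtime and matches the keys written by gguf-hash --fnv. A tiny illustration, using a typical llama.cpp tensor name as a hypothetical example:

    #include <cstdio>

    int main() {
        // key construction as in rpc_load_tensor(): "<tensor name>_hash"
        char hash_key[128];
        snprintf(hash_key, sizeof(hash_key), "%s_hash", "blk.0.attn_q.weight");
        printf("%s\n", hash_key);  // prints: blk.0.attn_q.weight_hash
        return 0;
    }

Because the get_key() lookup is non-required, models without pre-computed hashes simply fall through to the plain ggml_backend_tensor_set() path.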

src/llama-model-loader.h

Lines changed: 4 additions & 0 deletions
@@ -155,6 +155,10 @@ struct llama_model_loader {
     // for backwards compatibility, does not support ggml-backend
     void load_data_for(struct ggml_tensor * cur) const;
 
+    typedef bool (*ggml_backend_rpc_buffer_load_tensor_t)(ggml_backend_buffer_t buffer, ggml_tensor * tensor, size_t offset, uint64_t hash);
+    ggml_backend_rpc_buffer_load_tensor_t ggml_backend_rpc_buffer_load_tensor_fn = nullptr;
+    bool rpc_load_tensor(struct ggml_tensor * cur);
+
     // Returns false if cancelled by progress_callback
     bool load_all_data(
         struct ggml_context * ctx,
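
Routing the call through a function-pointer typedef resolved via ggml_backend_reg_get_proc_address keeps the loader free of a link-time dependency on the RPC backend. A condensed sketch of the pattern as wired up in the constructor change above:

    // resolve the RPC-only entry point at runtime; the pointer stays nullptr
    // when no RPC backend is registered, and rpc_load_tensor() then bails out early
    ggml_backend_rpc_buffer_load_tensor_t fn = nullptr;
    if (ggml_backend_reg_t reg = ggml_backend_reg_by_name("RPC")) {
        fn = (ggml_backend_rpc_buffer_load_tensor_t)
                 ggml_backend_reg_get_proc_address(reg, "ggml_backend_rpc_buffer_load_tensor");
    }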
