```cpp
#include "ggml.h"

#include <cstdint>
#include <vector>

const uint32_t n_embd_k_gqa = 4096;
const uint32_t n_embd_v_gqa = 4096;
const uint32_t kv_size      = 4096;
const int      n_layers     = 32;

std::vector<struct ggml_context *> ctxs;
std::vector<struct ggml_tensor *>  k_buffer;
std::vector<struct ggml_tensor *>  v_buffer;

// create one ggml context per layer
for (int i = 0; i < n_layers; i++) {
    struct ggml_init_params params = {
        ///*.mem_size   =*/ 2u*n_layers*ggml_tensor_overhead() + 4096 * 4096 * GGML_TYPE_F16,
        /*.mem_size   =*/ 10 * ((size_t)4096 * 4096 * 2 + ggml_tensor_overhead()),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    ggml_context * ctx = ggml_init(params);
    ctxs.push_back(ctx);
}

// allocate one K and one V tensor in each per-layer context
for (int i = 0; i < layer_num; ++i) {
    struct ggml_tensor * k = ggml_new_tensor_1d(ctxs[i], GGML_TYPE_F16, n_embd_k_gqa * kv_size);
    struct ggml_tensor * v = ggml_new_tensor_1d(ctxs[i], GGML_TYPE_F16, n_embd_v_gqa * kv_size);
    k_buffer.push_back(k);
    v_buffer.push_back(v);
}
```
Answered by walker-ai on Sep 10, 2024:
These two for loops use different upper bounds: the first iterates up to `n_layers`, while the second iterates up to `layer_num`.
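For illustration, a minimal sketch of the consistent version, assuming `layer_num` was intended to be the same `n_layers` used when creating the contexts:

```cpp
// use the same bound for both loops so every created context
// receives exactly one K and one V tensor
for (int i = 0; i < n_layers; ++i) {
    struct ggml_tensor * k = ggml_new_tensor_1d(ctxs[i], GGML_TYPE_F16, n_embd_k_gqa * kv_size);
    struct ggml_tensor * v = ggml_new_tensor_1d(ctxs[i], GGML_TYPE_F16, n_embd_v_gqa * kv_size);
    k_buffer.push_back(k);
    v_buffer.push_back(v);
}
```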
I have already fixed that. The actual cause was `GGML_MAX_CONTEXTS = 64`; I raised it to 128 and it works now.
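For reference, a sketch of that change, assuming a ggml version from around this time in which contexts come from a fixed-size static pool whose size is a compile-time constant in `ggml.h` (the exact file and default may differ across versions):

```cpp
// ggml.h -- maximum number of ggml contexts that can be alive at once;
// once the pool is full, every further ggml_init() call fails
//#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_CONTEXTS 128
```

In those versions `ggml_init` returns `NULL` when no free slot is left in the pool, so checking the return value of each `ggml_init` call surfaces this limit immediately instead of failing later on a null context.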