Commit 2086691

llama/kompute: Add multi-GPU support
Signed-off-by: Feng Jiang <jiangfeng@kylinos.cn>
1 parent 20b664f commit 2086691

File tree

1 file changed: src/llama.cpp (+35 -4 lines)


src/llama.cpp

Lines changed: 35 additions & 4 deletions
@@ -2856,6 +2856,8 @@ static size_t llama_get_device_count(const llama_model & model) {
     count = ggml_backend_sycl_get_device_count();
 #elif defined(GGML_USE_VULKAN)
     count = ggml_backend_vk_get_device_count();
+#elif defined(GGML_USE_KOMPUTE)
+    count = ggml_backend_kompute_get_device_count();
 #elif defined(GGML_USE_CANN)
     return ggml_backend_cann_get_device_count();
 #endif
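
The hunk above adds Kompute to the same #elif chain the SYCL and Vulkan backends use for device counting. A minimal sketch of calling the new counter directly (not part of the commit; the prototype is an assumption inferred from the call site, taking no arguments and returning the count):

    #include <cstdio>

    // Assumed prototype, inferred from the call site in the hunk above.
    extern "C" int ggml_backend_kompute_get_device_count(void);

    int main() {
        // In a GGML_USE_KOMPUTE build, llama_get_device_count() now reports
        // every device the Kompute backend enumerates, not just one.
        printf("Kompute devices: %d\n", ggml_backend_kompute_get_device_count());
        return 0;
    }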
@@ -2952,6 +2954,11 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
     size_t free;
     ggml_backend_vk_get_device_memory(device, &free, &total);
     return free;
+#elif defined(GGML_USE_KOMPUTE)
+    size_t total;
+    size_t free;
+    ggml_backend_kompute_get_device_memory(device, &free, &total);
+    return free;
 #elif defined(GGML_USE_CANN)
     size_t total;
     size_t free;
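
Mirroring the Vulkan branch just above it, the new branch reports a device's free memory so llama.cpp can plan per-GPU allocations. A standalone sketch of the same query (again, the prototype is assumed from the call site: device index in, free/total out, in bytes):

    #include <cstddef>
    #include <cstdio>

    // Assumed prototype, inferred from the call site in the hunk above.
    extern "C" void ggml_backend_kompute_get_device_memory(int device, size_t * free, size_t * total);

    int main() {
        size_t free = 0, total = 0;
        ggml_backend_kompute_get_device_memory(/*device =*/ 0, &free, &total);
        // llama_get_device_memory() returns only `free`; `total` is discarded.
        printf("device 0: %zu of %zu bytes free\n", free, total);
        return 0;
    }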
@@ -16899,6 +16906,8 @@ size_t llama_max_devices(void) {
     return GGML_SYCL_MAX_DEVICES;
 #elif defined(GGML_USE_VULKAN)
     return GGML_VK_MAX_DEVICES;
+#elif defined(GGML_USE_KOMPUTE)
+    return GGML_KOMPUTE_MAX_DEVICES;
 #elif defined(GGML_USE_CANN)
     return GGML_CANN_MAX_DEVICES;
 #else
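
llama_max_devices() is public API (declared in llama.h), used by callers to size per-device arrays such as tensor_split. After this hunk, a Kompute build reports its backend-specific limit rather than falling through to the single-device default:

    #include <cstdio>
    #include "llama.h"

    int main() {
        // Returns GGML_KOMPUTE_MAX_DEVICES on a Kompute build with this
        // commit applied; %zu matches the size_t return type.
        printf("max devices in this build: %zu\n", llama_max_devices());
        return 0;
    }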
@@ -17234,13 +17243,35 @@ struct llama_context * llama_new_context_with_model(
     }
 #elif defined(GGML_USE_KOMPUTE)
     if (model->n_gpu_layers > 0) {
-        auto * backend = ggml_backend_kompute_init(model->main_gpu);
-        if (backend == nullptr) {
-            LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
+        if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
+            auto * backend = ggml_backend_kompute_init(model->main_gpu);
+            if (!backend) {
+                LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        } else if (model->split_mode == LLAMA_SPLIT_MODE_LAYER) {
+            size_t count = 0;
+            auto * devices = ggml_vk_available_devices(0, &count);
+            for (size_t i = 0; i < count; i++) {
+                LLAMA_LOG_INFO("Kompute: Found device #%d, %s, %s, max-alloc %ld, heap-size %lu\n",
+                        devices[i].index, devices[i].vendor, devices[i].name,
+                        devices[i].maxAlloc, devices[i].heapSize);
+                auto * backend = ggml_backend_kompute_init(devices[i].index);
+                if (!backend) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
+            free(devices);
+        } else {
+            LLAMA_LOG_ERROR("%s: Failed to init Kompute backend: split mode %d not supported\n", __func__, model->split_mode);
             llama_free(ctx);
             return nullptr;
         }
-        ctx->backends.push_back(backend);
     }
 #elif defined(GGML_USE_CANN)
     // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
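
Taken together, this hunk makes LLAMA_SPLIT_MODE_LAYER initialize one Kompute backend per device that ggml_vk_available_devices() enumerates, while LLAMA_SPLIT_MODE_ROW now falls into the final else branch and fails with a clear error instead of silently using one GPU. Assuming a llama.cpp-style command-line frontend (binary and model paths below are placeholders, and the binary name varies by version), the new path could be exercised with the standard split-mode flag:

    ./llama-cli -m ./model.gguf --n-gpu-layers 99 --split-mode layer

With --split-mode none the behavior is unchanged: only main_gpu is initialized.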
