@@ -2856,6 +2856,8 @@ static size_t llama_get_device_count(const llama_model & model) {
    count = ggml_backend_sycl_get_device_count();
#elif defined(GGML_USE_VULKAN)
    count = ggml_backend_vk_get_device_count();
+#elif defined(GGML_USE_KOMPUTE)
+    count = ggml_backend_kompute_get_device_count();
#elif defined(GGML_USE_CANN)
    return ggml_backend_cann_get_device_count();
#endif
@@ -2952,6 +2954,11 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
    size_t free;
    ggml_backend_vk_get_device_memory(device, &free, &total);
    return free;
+#elif defined(GGML_USE_KOMPUTE)
+    size_t total;
+    size_t free;
+    ggml_backend_kompute_get_device_memory(device, &free, &total);
+    return free;
#elif defined(GGML_USE_CANN)
    size_t total;
    size_t free;
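Not part of the diff: a minimal sketch of how the per-device memory query added above could be exercised on its own, assuming the ggml_backend_kompute_get_device_count() and ggml_backend_kompute_get_device_memory() functions referenced here mirror their Vulkan counterparts; the helper name print_kompute_free_memory is invented for illustration.

// Illustration only: walk every Kompute device and report free/total VRAM,
// the same query llama_get_device_memory() performs for a single device above.
#include <cstdio>
#include "ggml-kompute.h"

static void print_kompute_free_memory(void) {
    const int n_devices = (int) ggml_backend_kompute_get_device_count();
    for (int device = 0; device < n_devices; ++device) {
        size_t free  = 0;
        size_t total = 0;
        ggml_backend_kompute_get_device_memory(device, &free, &total);
        std::printf("Kompute device %d: %zu bytes free of %zu total\n", device, free, total);
    }
}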
@@ -16899,6 +16906,8 @@ size_t llama_max_devices(void) {
    return GGML_SYCL_MAX_DEVICES;
#elif defined(GGML_USE_VULKAN)
    return GGML_VK_MAX_DEVICES;
+#elif defined(GGML_USE_KOMPUTE)
+    return GGML_KOMPUTE_MAX_DEVICES;
#elif defined(GGML_USE_CANN)
    return GGML_CANN_MAX_DEVICES;
#else
@@ -17234,13 +17243,35 @@ struct llama_context * llama_new_context_with_model(
        }
#elif defined(GGML_USE_KOMPUTE)
        if (model->n_gpu_layers > 0) {
-            auto * backend = ggml_backend_kompute_init(model->main_gpu);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
+            if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
+                auto * backend = ggml_backend_kompute_init(model->main_gpu);
+                if (!backend) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            } else if (model->split_mode == LLAMA_SPLIT_MODE_LAYER) {
+                size_t count = 0;
+                auto * devices = ggml_vk_available_devices(0, &count);
+                for (size_t i = 0; i < count; i++) {
+                    LLAMA_LOG_INFO("Kompute: Found device #%d, %s, %s, max-alloc %ld, heap-size %lu\n",
+                                   devices[i].index, devices[i].vendor, devices[i].name,
+                                   devices[i].maxAlloc, devices[i].heapSize);
+                    auto * backend = ggml_backend_kompute_init(devices[i].index);
+                    if (!backend) {
+                        LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
+                        llama_free(ctx);
+                        return nullptr;
+                    }
+                    ctx->backends.push_back(backend);
+                }
+                free(devices);
+            } else {
+                LLAMA_LOG_ERROR("%s: Failed to init Kompute backend: split mode %d not supported\n", __func__, model->split_mode);
                llama_free(ctx);
                return nullptr;
            }
-            ctx->backends.push_back(backend);
        }
#elif defined(GGML_USE_CANN)
        // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
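Taken together, the change means a Kompute build now supports two split modes at context creation: LLAMA_SPLIT_MODE_NONE (a single backend on main_gpu, as before) and LLAMA_SPLIT_MODE_LAYER (one backend per device reported by ggml_vk_available_devices()); any other mode fails with an error. A hedged usage sketch through the public llama.h API follows; the model path is a placeholder and error handling is kept minimal.

// Illustration only: request the new layer-split path over all Kompute devices.
#include <stddef.h>
#include "llama.h"

int main(void) {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99;                     // offload as many layers as possible
    mparams.split_mode   = LLAMA_SPLIT_MODE_LAYER; // spread layers across all Kompute devices
    // mparams.main_gpu is only consulted when split_mode is LLAMA_SPLIT_MODE_NONE

    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
    if (model == NULL) {
        llama_backend_free();
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    llama_context * ctx = llama_new_context_with_model(model, cparams); // hits the patched Kompute branch
    if (ctx == NULL) {                                                  // e.g. unsupported split mode
        llama_free_model(model);
        llama_backend_free();
        return 1;
    }

    // ... run inference ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}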