From 3666c861d45eabb4408b2f0d1ff8f16b2dfd4a96 Mon Sep 17 00:00:00 2001
From: Ming Xie <xieming@kylinos.cn>
Date: Wed, 21 Aug 2024 10:54:09 +0800
Subject: [PATCH 01/10] ggml/kompute: Rename ggml_kompute_context to
 ggml_backend_kompute_context

Signed-off-by: Ming Xie <xieming@kylinos.cn>
---
 ggml/src/ggml-kompute.cpp | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp
index 41ac63fa48e0f..6f03c5e7954bb 100644
--- a/ggml/src/ggml-kompute.cpp
+++ b/ggml/src/ggml-kompute.cpp
@@ -66,12 +66,12 @@ static std::string ggml_kompute_format_name(int device) {
     return "Kompute" + std::to_string(device);
 }
 
-struct ggml_kompute_context {
+struct ggml_backend_kompute_context {
     int device;
     std::string name;
     std::shared_ptr<vk::DescriptorPool> pool;
 
-    ggml_kompute_context(int device)
+    ggml_backend_kompute_context(int device)
         : device(device), name(ggml_kompute_format_name(device)) {}
 };
 
@@ -79,7 +79,7 @@ struct ggml_kompute_context {
 // and consolidate the init functions and simplify object lifetime management. As it currently stands,
 // we *have* to have the kompute manager no matter what for device discovery, but the kompute context
 // is only created when a device is set and vulkan is explicitly turned on.
-static ggml_kompute_context *s_kompute_context = nullptr;
+static ggml_backend_kompute_context *s_kompute_context = nullptr;
 
 class kompute_manager {
     kp::Manager *s_mgr = nullptr;
@@ -348,7 +348,7 @@ ggml_vk_device ggml_vk_current_device() {
 }
 
 static
-void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t size) {
+void ggml_vk_allocate_descriptor_pool(struct ggml_backend_kompute_context * ctx, size_t size) {
     std::vector<vk::DescriptorPoolSize> descriptorPoolSizes = {
         vk::DescriptorPoolSize(
           vk::DescriptorType::eStorageBuffer,
@@ -370,7 +370,7 @@ void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t
 }
 
 static
-void ggml_vk_free_descriptor_pool(struct ggml_kompute_context * ctx) {
+void ggml_vk_free_descriptor_pool(struct ggml_backend_kompute_context * ctx) {
     if (ctx->pool) {
         komputeManager()->device()->destroy(
           *ctx->pool,
@@ -1412,7 +1412,7 @@ static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
     return false;
 }
 
-static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) {
+static void ggml_vk_graph_compute(struct ggml_backend_kompute_context * ctx, struct ggml_cgraph * gf) {
     const int n_seq = 8;
 
     // FIXME: Figure out if we can somehow optimize the size of the pool... right now we're setting
@@ -1935,12 +1935,12 @@ ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device) {
 // backend
 
 static const char * ggml_backend_kompute_name(ggml_backend_t backend) {
-    auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
+    auto * ctx = static_cast<ggml_backend_kompute_context *>(backend->context);
     return ctx->name.c_str();
 }
 
 static void ggml_backend_kompute_free(ggml_backend_t backend) {
-    auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
+    auto * ctx = static_cast<ggml_backend_kompute_context *>(backend->context);
 
     assert(ctx == s_kompute_context);
     s_kompute_context = nullptr;
@@ -1952,12 +1952,12 @@ static void ggml_backend_kompute_free(ggml_backend_t backend) {
 }
 
 static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(ggml_backend_t backend) {
-    auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
+    auto * ctx = static_cast<ggml_backend_kompute_context *>(backend->context);
     return ggml_backend_kompute_buffer_type(ctx->device);
 }
 
 static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
+    auto * ctx = static_cast<ggml_backend_kompute_context *>(backend->context);
     ggml_vk_graph_compute(ctx, cgraph);
     return GGML_STATUS_SUCCESS;
 }
@@ -2002,7 +2002,7 @@ static ggml_guid_t ggml_backend_kompute_guid() {
 
 ggml_backend_t ggml_backend_kompute_init(int device) {
     GGML_ASSERT(s_kompute_context == nullptr);
-    s_kompute_context = new ggml_kompute_context(device);
+    s_kompute_context = new ggml_backend_kompute_context(device);
 
     ggml_backend_t kompute_backend = new ggml_backend {
         /* .guid      = */ ggml_backend_kompute_guid(),

From e914ac7c688c118eca6352af05d8443fcc6b714d Mon Sep 17 00:00:00 2001
From: Ming Xie <xieming@kylinos.cn>
Date: Wed, 21 Aug 2024 11:18:35 +0800
Subject: [PATCH 02/10] ggml/kompute: Introducing struct
 ggml_backend_kompute_buffer_context

Signed-off-by: Ming Xie <xieming@kylinos.cn>
---
 ggml/src/ggml-kompute.cpp | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp
index 6f03c5e7954bb..399cb932b9954 100644
--- a/ggml/src/ggml-kompute.cpp
+++ b/ggml/src/ggml-kompute.cpp
@@ -112,6 +112,10 @@ struct ggml_vk_memory {
     vk::Buffer *stagingBuffer = nullptr;
 };
 
+struct ggml_backend_kompute_buffer_context {
+    struct ggml_vk_memory memory;
+};
+
 #ifdef __linux__
 __attribute__((constructor))
 static void enable_sam() {
@@ -1826,15 +1830,16 @@ static const char * ggml_backend_kompute_buffer_get_name(ggml_backend_buffer_t b
 }
 
 static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    auto * memory = (ggml_vk_memory *)buffer->context;
+    auto * ctx = static_cast<ggml_backend_kompute_buffer_context *>(buffer->context);
     if (ggml_vk_has_device()) {
-        ggml_vk_free_memory(*memory);
+        ggml_vk_free_memory(ctx->memory);
     }
-    delete memory;
+    delete ctx;
 }
 
 static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
-    return ((ggml_vk_memory *)buffer->context)->data;
+    auto * ctx = static_cast<ggml_backend_kompute_buffer_context *>(buffer->context);
+    return ctx->memory.data;
 }
 
 static void ggml_backend_kompute_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -1860,11 +1865,11 @@ static void ggml_backend_kompute_buffer_get_tensor(ggml_backend_buffer_t buffer,
 }
 
 static void ggml_backend_kompute_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    auto * memory = (ggml_vk_memory *)buffer->context;
-    memset(memory->data, value, buffer->size);
+    auto * ctx = static_cast<ggml_backend_kompute_buffer_context *>(buffer->context);
+    memset(ctx->memory.data, value, ctx->memory.size);
 
-    if (memory->stagingBuffer)
-        komputeManager()->sequence()->eval<kp::OpBufferSyncDevice>(memory->primaryBuffer, memory->stagingBuffer, memory->size);
+    if (ctx->memory.stagingBuffer)
+        komputeManager()->sequence()->eval<kp::OpBufferSyncDevice>(ctx->memory.primaryBuffer, ctx->memory.stagingBuffer, ctx->memory.size);
 }
 
 static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = {
@@ -1888,7 +1893,8 @@ static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffe
 
 static ggml_backend_buffer_t ggml_backend_kompute_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     ggml_backend_kompute_device_ref(buft);
-    auto * ctx = new ggml_vk_memory(ggml_vk_allocate(size));
+    auto * ctx = new ggml_backend_kompute_buffer_context;
+    ctx->memory = ggml_vk_allocate(size);
     return ggml_backend_buffer_init(buft, ggml_backend_kompute_buffer_i, ctx, size);
 }
 

From 74ba8516ce3977b078d657aef50c24a02561fa46 Mon Sep 17 00:00:00 2001
From: Weishi Li <liweishi@kylinos.cn>
Date: Wed, 21 Aug 2024 14:26:51 +0800
Subject: [PATCH 03/10] ggml/kompute: Move buft into struct
 ggml_backend_kompute_context

Signed-off-by: Weishi Li <liweishi@kylinos.cn>
---
 ggml/src/ggml-kompute.cpp | 43 +++++++++++++++++++++------------------
 1 file changed, 23 insertions(+), 20 deletions(-)

diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp
index 399cb932b9954..6839cb5835053 100644
--- a/ggml/src/ggml-kompute.cpp
+++ b/ggml/src/ggml-kompute.cpp
@@ -71,8 +71,10 @@ struct ggml_backend_kompute_context {
     std::string name;
     std::shared_ptr<vk::DescriptorPool> pool;
 
+    ggml_backend_buffer_type buft;
+
     ggml_backend_kompute_context(int device)
-        : device(device), name(ggml_kompute_format_name(device)) {}
+        : device(device), name(ggml_kompute_format_name(device)) { buft.context = nullptr; }
 };
 
 // FIXME: It would be good to consolidate the kompute manager and the kompute context into one object
@@ -1918,24 +1920,25 @@ static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
 };
 
 ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device) {
-    static std::vector<ggml_backend_buffer_type> bufts = []() {
-        std::vector<ggml_backend_buffer_type> vec;
-        auto devices = ggml_vk_available_devices_internal(0);
-        vec.reserve(devices.size());
+    if (!s_kompute_context)
+	    s_kompute_context = new ggml_backend_kompute_context(device);
 
-        for (const auto & dev : devices) {
-            vec.push_back({
-                /* .iface   = */ ggml_backend_kompute_buffer_type_interface,
-                /* .context = */ new ggml_backend_kompute_buffer_type_context(dev.index, dev.bufferAlignment, dev.maxAlloc)
-            });
+    auto * buft = &s_kompute_context->buft;
+    if (!buft->context) {
+        auto devices = ggml_vk_available_devices_internal(0);
+        for (std::size_t i = 0; i < devices.size(); i++) {
+            if (device == devices[i].index) {
+                buft->context = new ggml_backend_kompute_buffer_type_context(
+                        devices[i].index,
+                        devices[i].bufferAlignment,
+                        devices[i].maxAlloc);
+                buft->iface = ggml_backend_kompute_buffer_type_interface;
+                break;
+            }
         }
-        return vec;
-    }();
+    }
 
-    auto it = std::find_if(bufts.begin(), bufts.end(), [device](const ggml_backend_buffer_type & t) {
-        return device == static_cast<ggml_backend_kompute_buffer_type_context *>(t.context)->device;
-    });
-    return it < bufts.end() ? &*it : nullptr;
+    return buft;
 }
 
 // backend
@@ -1974,8 +1977,8 @@ static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struc
 }
 
 static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
-    GGML_UNUSED(backend);
-    return buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name;
+    auto *ctx = static_cast<ggml_backend_kompute_context *>(backend->context);
+    return &ctx->buft == buft;
 }
 
 static struct ggml_backend_i kompute_backend_i = {
@@ -2007,8 +2010,8 @@ static ggml_guid_t ggml_backend_kompute_guid() {
 }
 
 ggml_backend_t ggml_backend_kompute_init(int device) {
-    GGML_ASSERT(s_kompute_context == nullptr);
-    s_kompute_context = new ggml_backend_kompute_context(device);
+    if (!s_kompute_context)
+	    s_kompute_context = new ggml_backend_kompute_context(device);
 
     ggml_backend_t kompute_backend = new ggml_backend {
         /* .guid      = */ ggml_backend_kompute_guid(),

From d94ad56f87f777d175f47b1346d5a76d9e3d9128 Mon Sep 17 00:00:00 2001
From: Weishi Li <liweishi@kylinos.cn>
Date: Wed, 21 Aug 2024 14:54:13 +0800
Subject: [PATCH 04/10] ggml/kompute: Use the kp::Manager in
 ggml_backend_kompute_context instead of global

Signed-off-by: Weishi Li <liweishi@kylinos.cn>
---
 ggml/include/ggml-kompute.h |   2 -
 ggml/src/ggml-kompute.cpp   | 371 +++++++++++++++++++-----------------
 2 files changed, 196 insertions(+), 177 deletions(-)

diff --git a/ggml/include/ggml-kompute.h b/ggml/include/ggml-kompute.h
index 171465456a5b1..b90143c070356 100644
--- a/ggml/include/ggml-kompute.h
+++ b/ggml/include/ggml-kompute.h
@@ -25,8 +25,6 @@ struct ggml_vk_device {
 struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
 bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
 bool ggml_vk_has_vulkan(void);
-bool ggml_vk_has_device(void);
-struct ggml_vk_device ggml_vk_current_device(void);
 
 //
 // backend API
diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp
index 6839cb5835053..911827dbdb76e 100644
--- a/ggml/src/ggml-kompute.cpp
+++ b/ggml/src/ggml-kompute.cpp
@@ -69,6 +69,8 @@ static std::string ggml_kompute_format_name(int device) {
 struct ggml_backend_kompute_context {
     int device;
     std::string name;
+
+    kp::Manager manager;
     std::shared_ptr<vk::DescriptorPool> pool;
 
     ggml_backend_buffer_type buft;
@@ -339,16 +341,16 @@ bool ggml_vk_has_vulkan() {
     return komputeManager()->hasVulkan();
 }
 
-bool ggml_vk_has_device() {
-    return komputeManager()->hasDevice();
+static bool ggml_vk_has_device(struct ggml_backend_kompute_context *ctx) {
+    return ctx->manager.hasDevice();
 }
 
-ggml_vk_device ggml_vk_current_device() {
-    if (!komputeManager()->hasDevice())
+static ggml_vk_device ggml_vk_current_device(struct ggml_backend_kompute_context *ctx) {
+    if (!ctx->manager.hasDevice())
         return ggml_vk_device();
 
     auto devices = ggml_vk_available_devices_internal(0);
-    ggml_vk_filterByName(devices, komputeManager()->physicalDevice()->getProperties().deviceName.data());
+    ggml_vk_filterByName(devices, ctx->manager.physicalDevice()->getProperties().deviceName.data());
     GGML_ASSERT(!devices.empty());
     return devices.front();
 }
@@ -369,7 +371,7 @@ void ggml_vk_allocate_descriptor_pool(struct ggml_backend_kompute_context * ctx,
       descriptorPoolSizes.data());
 
     ctx->pool = std::make_shared<vk::DescriptorPool>();
-    vk::Result r = komputeManager()->device()->createDescriptorPool(
+    vk::Result r = ctx->manager.device()->createDescriptorPool(
       &descriptorPoolInfo, nullptr, ctx->pool.get());
     if (r != vk::Result::eSuccess)
         std::cerr << "Error allocating descriptor pool" << vk::to_string(r);
@@ -378,7 +380,7 @@ void ggml_vk_allocate_descriptor_pool(struct ggml_backend_kompute_context * ctx,
 static
 void ggml_vk_free_descriptor_pool(struct ggml_backend_kompute_context * ctx) {
     if (ctx->pool) {
-        komputeManager()->device()->destroy(
+        ctx->manager.device()->destroy(
           *ctx->pool,
           (vk::Optional<const vk::AllocationCallbacks>)nullptr);
         ctx->pool = nullptr;
@@ -386,7 +388,7 @@ void ggml_vk_free_descriptor_pool(struct ggml_backend_kompute_context * ctx) {
 }
 
 static
-vk::Buffer *ggml_vk_allocate_buffer(size_t size) {
+vk::Buffer *ggml_vk_allocate_buffer(struct ggml_backend_kompute_context * ctx, size_t size) {
     vk::BufferCreateInfo bufferCreateInfo;
     bufferCreateInfo.size = size;
     bufferCreateInfo.usage = vk::BufferUsageFlagBits::eStorageBuffer |
@@ -395,18 +397,18 @@ vk::Buffer *ggml_vk_allocate_buffer(size_t size) {
     bufferCreateInfo.sharingMode = vk::SharingMode::eExclusive;
 
     vk::Buffer *vkBuffer = new vk::Buffer;
-    vk::Result r = komputeManager()->device()->createBuffer(&bufferCreateInfo, nullptr, vkBuffer);
+    vk::Result r = ctx->manager.device()->createBuffer(&bufferCreateInfo, nullptr, vkBuffer);
     if (r != vk::Result::eSuccess)
         std::cerr << "Error allocating buffer " << vk::to_string(r) << std::endl;
     return vkBuffer;
 }
 
 static
-vk::DeviceMemory *ggml_vk_allocate(size_t size, vk::MemoryPropertyFlags flags, vk::MemoryRequirements requirements, bool *isHostVisible) {
+vk::DeviceMemory *ggml_vk_allocate(struct ggml_backend_kompute_context * ctx, size_t size, vk::MemoryPropertyFlags flags, vk::MemoryRequirements requirements, bool *isHostVisible) {
 
     uint32_t memoryTypeIndex = -1;
     bool memoryTypeIndexFound = false;
-    vk::PhysicalDeviceMemoryProperties memoryProperties = komputeManager()->physicalDevice()->getMemoryProperties();
+    vk::PhysicalDeviceMemoryProperties memoryProperties = ctx->manager.physicalDevice()->getMemoryProperties();
     for (uint32_t i = 0; i < memoryProperties.memoryTypeCount; i++) {
         const vk::MemoryType &memoryType = memoryProperties.memoryTypes[i];
         const vk::MemoryHeap &memoryHeap = memoryProperties.memoryHeaps[memoryType.heapIndex];
@@ -435,7 +437,7 @@ vk::DeviceMemory *ggml_vk_allocate(size_t size, vk::MemoryPropertyFlags flags, v
     allocInfo.allocationSize = size;
     allocInfo.memoryTypeIndex = memoryTypeIndex;
     vk::DeviceMemory *vkDeviceMemory =  new vk::DeviceMemory;
-    vk::Result r = komputeManager()->device()->allocateMemory(&allocInfo, nullptr, vkDeviceMemory);
+    vk::Result r = ctx->manager.device()->allocateMemory(&allocInfo, nullptr, vkDeviceMemory);
     if (r != vk::Result::eSuccess) {
         std::cerr << "Error allocating memory " << vk::to_string(r) << std::endl;
         throw std::runtime_error("Error allocating vulkan memory.");
@@ -455,31 +457,31 @@ static size_t ggml_vk_aligned_offset(ggml_backend_buffer_t buffer, size_t offset
     return (offset / minStorageBufferOffsetAlignment) * minStorageBufferOffsetAlignment;
 }
 
-static ggml_vk_memory ggml_vk_allocate(size_t size) {
+static ggml_vk_memory ggml_vk_allocate(struct ggml_backend_kompute_context * ctx, size_t size) {
     ggml_vk_memory memory;
     bool isHostVisible = false;
     {
-        memory.primaryBuffer = ggml_vk_allocate_buffer(size);
-        vk::MemoryRequirements memoryRequirements = komputeManager()->device()->getBufferMemoryRequirements(*memory.primaryBuffer);
+        memory.primaryBuffer = ggml_vk_allocate_buffer(ctx, size);
+        vk::MemoryRequirements memoryRequirements = ctx->manager.device()->getBufferMemoryRequirements(*memory.primaryBuffer);
         vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eDeviceLocal;
-        memory.primaryMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible);
-        komputeManager()->device()->bindBufferMemory(*memory.primaryBuffer, *memory.primaryMemory, 0);
+        memory.primaryMemory = ggml_vk_allocate(ctx, size, memoryPropertyFlags, memoryRequirements, &isHostVisible);
+        ctx->manager.device()->bindBufferMemory(*memory.primaryBuffer, *memory.primaryMemory, 0);
         if (isHostVisible) {
-            vk::Result r = komputeManager()->device()->mapMemory(*memory.primaryMemory, 0, size, vk::MemoryMapFlags(), &memory.data);
+            vk::Result r = ctx->manager.device()->mapMemory(*memory.primaryMemory, 0, size, vk::MemoryMapFlags(), &memory.data);
             if (r != vk::Result::eSuccess)
                 std::cerr << "Error mapping memory" << vk::to_string(r);
         }
     }
 
     if (!isHostVisible) {
-        memory.stagingBuffer = ggml_vk_allocate_buffer(size);
-        vk::MemoryRequirements memoryRequirements = komputeManager()->device()->getBufferMemoryRequirements(*memory.stagingBuffer);
+        memory.stagingBuffer = ggml_vk_allocate_buffer(ctx, size);
+        vk::MemoryRequirements memoryRequirements = ctx->manager.device()->getBufferMemoryRequirements(*memory.stagingBuffer);
         vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eHostVisible |
                                                       vk::MemoryPropertyFlagBits::eHostCoherent |
                                                       vk::MemoryPropertyFlagBits::eHostCached;
-        memory.stagingMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible);
-        komputeManager()->device()->bindBufferMemory(*memory.stagingBuffer, *memory.stagingMemory, 0);
-        vk::Result r = komputeManager()->device()->mapMemory(*memory.stagingMemory, 0, size, vk::MemoryMapFlags(), &memory.data);
+        memory.stagingMemory = ggml_vk_allocate(ctx, size, memoryPropertyFlags, memoryRequirements, &isHostVisible);
+        ctx->manager.device()->bindBufferMemory(*memory.stagingBuffer, *memory.stagingMemory, 0);
+        vk::Result r = ctx->manager.device()->mapMemory(*memory.stagingMemory, 0, size, vk::MemoryMapFlags(), &memory.data);
         if (r != vk::Result::eSuccess)
             std::cerr << "Error mapping memory" << vk::to_string(r);
     }
@@ -488,21 +490,21 @@ static ggml_vk_memory ggml_vk_allocate(size_t size) {
     return memory;
 }
 
-static void ggml_vk_free_memory(ggml_vk_memory &memory)
+static void ggml_vk_free_memory(struct ggml_backend_kompute_context * ctx, ggml_vk_memory &memory)
 {
-    komputeManager()->device()->destroy(
+    ctx->manager.device()->destroy(
       *memory.primaryBuffer,
       (vk::Optional<const vk::AllocationCallbacks>)nullptr);
     if (memory.stagingBuffer) {
-        komputeManager()->device()->destroy(
+        ctx->manager.device()->destroy(
           *memory.stagingBuffer,
           (vk::Optional<const vk::AllocationCallbacks>)nullptr);
     }
-    komputeManager()->device()->freeMemory(
+    ctx->manager.device()->freeMemory(
       *memory.primaryMemory,
       (vk::Optional<const vk::AllocationCallbacks>)nullptr);
     if (memory.stagingMemory) {
-        komputeManager()->device()->freeMemory(
+        ctx->manager.device()->freeMemory(
           *memory.stagingMemory,
           (vk::Optional<const vk::AllocationCallbacks>)nullptr);
     }
@@ -528,7 +530,7 @@ ggml_vk_memory * ggml_vk_find_tensor(const struct ggml_tensor * t, uint64_t & of
 }
 
 static
-const std::shared_ptr<kp::Tensor> ggml_vk_get_tensor(const struct ggml_tensor * t, uint32_t * alignedOffset = nullptr) {
+const std::shared_ptr<kp::Tensor> ggml_vk_get_tensor(struct ggml_backend_kompute_context * ctx, const struct ggml_tensor * t, uint32_t * alignedOffset = nullptr) {
     uint64_t originalOffset = 0;
     auto * res = ggml_vk_find_tensor(t, originalOffset);
     if (!res) {
@@ -546,7 +548,7 @@ const std::shared_ptr<kp::Tensor> ggml_vk_get_tensor(const struct ggml_tensor *
         nbytes += *alignedOffset;
     }
 
-    return komputeManager()->tensor(
+    return ctx->manager.tensor(
         t->data,
         nelements,
         nbytes, kp::Tensor::TensorDataTypes::eFloat,
@@ -578,6 +580,7 @@ uint32_t safe_divide(uint32_t a, uint32_t b) {
 }
 
 static void ggml_vk_add(
+    struct ggml_backend_kompute_context *ctx,
     kp::Sequence& seq,
     const std::shared_ptr<kp::Tensor>& inA,
     const std::shared_ptr<kp::Tensor>& inB,
@@ -612,19 +615,21 @@ static void ggml_vk_add(
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__)) {
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
+    if (!ctx->manager.hasAlgorithm(__func__)) {
+        s_algo = ctx->manager.algorithm<float, PushConstants>(__func__, ctx->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
     } else {
-        s_algo = komputeManager()->getAlgorithm(__func__);
+        s_algo = ctx->manager.getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(s_kompute_context->pool.get());
+        s_algo->updateDescriptors(ctx->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
-static void ggml_vk_addrow(kp::Sequence& seq,
+static void ggml_vk_addrow(
+                 struct ggml_backend_kompute_context *ctx,
+                 kp::Sequence& seq,
                  const std::shared_ptr<kp::Tensor>& inA,
                  const std::shared_ptr<kp::Tensor>& inB,
                  const std::shared_ptr<kp::Tensor>& out,
@@ -643,19 +648,20 @@ static void ggml_vk_addrow(kp::Sequence& seq,
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
+    if (!ctx->manager.hasAlgorithm(__func__))
+        s_algo = ctx->manager.algorithm<float, PushConstants>(__func__, ctx->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
     else {
-        s_algo = komputeManager()->getAlgorithm(__func__);
+        s_algo = ctx->manager.getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({size});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(s_kompute_context->pool.get());
+        s_algo->updateDescriptors(ctx->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
 static void ggml_vk_mul(
+    struct ggml_backend_kompute_context *ctx,
     kp::Sequence& seq,
     const std::shared_ptr<kp::Tensor>& inA,
     const std::shared_ptr<kp::Tensor>& inB,
@@ -690,19 +696,21 @@ static void ggml_vk_mul(
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__)) {
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
+    if (!ctx->manager.hasAlgorithm(__func__)) {
+        s_algo = ctx->manager.algorithm<float, PushConstants>(__func__, ctx->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
     } else {
-        s_algo = komputeManager()->getAlgorithm(__func__);
+        s_algo = ctx->manager.getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(s_kompute_context->pool.get());
+        s_algo->updateDescriptors(ctx->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
-static void ggml_vk_scale(kp::Sequence& seq,
+static void ggml_vk_scale(
+                   struct ggml_backend_kompute_context *ctx,
+                   kp::Sequence& seq,
                    const std::shared_ptr<kp::Tensor>& in,
                    const std::shared_ptr<kp::Tensor>& out,
                    uint32_t inOff, uint32_t outOff,
@@ -731,19 +739,20 @@ static void ggml_vk_scale(kp::Sequence& seq,
     }
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(name)) {
-        s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, *spirv, {size}, {}, {pushConsts});
+    if (!ctx->manager.hasAlgorithm(name)) {
+        s_algo = ctx->manager.algorithm<float, PushConstants>(name, ctx->pool.get(), {in, out}, *spirv, {size}, {}, {pushConsts});
     } else {
-        s_algo = komputeManager()->getAlgorithm(name);
+        s_algo = ctx->manager.getAlgorithm(name);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({size});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(s_kompute_context->pool.get());
+        s_algo->updateDescriptors(ctx->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
 static void ggml_vk_xxlu(
+    struct ggml_backend_kompute_context *ctx,
     const std::vector<uint32_t>& spirv, const char * suffix, kp::Sequence& seq,
     const std::shared_ptr<kp::Tensor>& in,
     const std::shared_ptr<kp::Tensor>& out,
@@ -758,43 +767,44 @@ static void ggml_vk_xxlu(
 
     auto name = std::string(__func__) + "_" + suffix;
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(name)) {
-        s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts});
+    if (!ctx->manager.hasAlgorithm(name)) {
+        s_algo = ctx->manager.algorithm<float, PushConstants>(name, ctx->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts});
     } else {
-        s_algo = komputeManager()->getAlgorithm(name);
+        s_algo = ctx->manager.getAlgorithm(name);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({size});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(s_kompute_context->pool.get());
+        s_algo->updateDescriptors(ctx->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
 template <typename... Args>
-static void ggml_vk_silu(Args&&... args) {
+static void ggml_vk_silu(struct ggml_backend_kompute_context *ctx, Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_silu_comp_spv,
         kp::shader_data::op_silu_comp_spv_len);
 
-    ggml_vk_xxlu(spirv, "silu", std::forward<Args>(args)...);
+    ggml_vk_xxlu(ctx, spirv, "silu", std::forward<Args>(args)...);
 }
 
 template <typename... Args>
-static void ggml_vk_relu(Args&&... args) {
+static void ggml_vk_relu(struct ggml_backend_kompute_context *ctx, Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_relu_comp_spv,
         kp::shader_data::op_relu_comp_spv_len);
 
-    ggml_vk_xxlu(spirv, "relu", std::forward<Args>(args)...);
+    ggml_vk_xxlu(ctx, spirv, "relu", std::forward<Args>(args)...);
 }
 
 template <typename... Args>
-static void ggml_vk_gelu(Args&&... args) {
+static void ggml_vk_gelu(struct ggml_backend_kompute_context *ctx, Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_gelu_comp_spv,
         kp::shader_data::op_gelu_comp_spv_len);
 
-    ggml_vk_xxlu(spirv, "gelu", std::forward<Args>(args)...);
+    ggml_vk_xxlu(ctx, spirv, "gelu", std::forward<Args>(args)...);
 }
 
 static void ggml_vk_soft_max(
+    struct ggml_backend_kompute_context *ctx,
     kp::Sequence& seq,
     const std::shared_ptr<kp::Tensor>& inA,
     const std::shared_ptr<kp::Tensor>& inB,
@@ -821,21 +831,22 @@ static void ggml_vk_soft_max(
     auto & inB_ = inB ? inB : inA;
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__)) {
+    if (!ctx->manager.hasAlgorithm(__func__)) {
         // FIXME: The softmax kernel needs to be fixed to use the subgroupsize which can vary by device
         const uint32_t local_x = 32;
-        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB_, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {local_x}, {pushConsts});
+        s_algo = ctx->manager.algorithm<uint32_t, PushConstants>(__func__, ctx->pool.get(), {inA, inB_, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {local_x}, {pushConsts});
     } else {
-        s_algo = komputeManager()->getAlgorithm(__func__);
+        s_algo = ctx->manager.getAlgorithm(__func__);
         s_algo->setTensors({inA, inB_, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(s_kompute_context->pool.get());
+        s_algo->updateDescriptors(ctx->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
 static void ggml_vk_norm_(
+    struct ggml_backend_kompute_context *ctx,
     const std::vector<uint32_t>& spirv, const char * suffix, kp::Sequence& seq,
     const std::shared_ptr<kp::Tensor>& in,
     const std::shared_ptr<kp::Tensor>& out,
@@ -857,35 +868,37 @@ static void ggml_vk_norm_(
 
     auto name = std::string(__func__) + "_" + suffix;
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(name)) {
-        s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts});
+    if (!ctx->manager.hasAlgorithm(name)) {
+        s_algo = ctx->manager.algorithm<float, PushConstants>(name, ctx->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts});
     } else {
-        s_algo = komputeManager()->getAlgorithm(name);
+        s_algo = ctx->manager.getAlgorithm(name);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({(uint32_t)nrows});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(s_kompute_context->pool.get());
+        s_algo->updateDescriptors(ctx->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
 template <typename... Args>
-static void ggml_vk_norm(Args&&... args) {
+static void ggml_vk_norm(struct ggml_backend_kompute_context *ctx, Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_norm_comp_spv,
         kp::shader_data::op_norm_comp_spv_len);
 
-    ggml_vk_norm_(spirv, "norm", std::forward<Args>(args)...);
+    ggml_vk_norm_(ctx, spirv, "norm", std::forward<Args>(args)...);
 }
 
 template <typename... Args>
-static void ggml_vk_rms_norm(Args&&... args) {
+static void ggml_vk_rms_norm(struct ggml_backend_kompute_context *ctx, Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_rmsnorm_comp_spv,
         kp::shader_data::op_rmsnorm_comp_spv_len);
 
-    ggml_vk_norm_(spirv, "rms", std::forward<Args>(args)...);
+    ggml_vk_norm_(ctx, spirv, "rms", std::forward<Args>(args)...);
 }
 
-static void ggml_vk_diag_mask_inf(kp::Sequence& seq,
+static void ggml_vk_diag_mask_inf(
+                           struct ggml_backend_kompute_context *ctx,
+                           kp::Sequence& seq,
                            const std::shared_ptr<kp::Tensor>& in,
                            const std::shared_ptr<kp::Tensor>& out,
                            uint32_t inOff, uint32_t outOff,
@@ -905,19 +918,20 @@ static void ggml_vk_diag_mask_inf(kp::Sequence& seq,
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne00), unsigned(ne01), unsigned(ne02)}, {}, {pushConsts});
+    if (!ctx->manager.hasAlgorithm(__func__))
+        s_algo = ctx->manager.algorithm<float, PushConstants>(__func__, ctx->pool.get(), {in, out}, spirv, {unsigned(ne00), unsigned(ne01), unsigned(ne02)}, {}, {pushConsts});
     else {
-        s_algo = komputeManager()->getAlgorithm(__func__);
+        s_algo = ctx->manager.getAlgorithm(__func__);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({unsigned(ne00), unsigned(ne01), unsigned(ne02)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(s_kompute_context->pool.get());
+        s_algo->updateDescriptors(ctx->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
 static void ggml_vk_mul_mat_f16(
+    struct ggml_backend_kompute_context *ctx,
     kp::Sequence& seq,
     const std::shared_ptr<kp::Tensor>& inA,
     const std::shared_ptr<kp::Tensor>& inB,
@@ -954,20 +968,22 @@ static void ggml_vk_mul_mat_f16(
     const unsigned ny = unsigned((ne11 + 4 - 1)/4);
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__)) {
-        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
-        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), ny, unsigned(ne12*ne13)}, {local_x}, {pushConsts});
+    if (!ctx->manager.hasAlgorithm(__func__)) {
+        const uint32_t local_x = ggml_vk_current_device(ctx).subgroupSize * 2;
+        s_algo = ctx->manager.algorithm<uint32_t, PushConstants>(__func__, ctx->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), ny, unsigned(ne12*ne13)}, {local_x}, {pushConsts});
     } else {
-        s_algo = komputeManager()->getAlgorithm(__func__);
+        s_algo = ctx->manager.getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({unsigned(ne01), ny, unsigned(ne12*ne13)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(s_kompute_context->pool.get());
+        s_algo->updateDescriptors(ctx->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
-static void ggml_vk_mul_mat_mat_f32(kp::Sequence& seq,
+static void ggml_vk_mul_mat_mat_f32(
+                         struct ggml_backend_kompute_context *ctx,
+                         kp::Sequence& seq,
                          const std::shared_ptr<kp::Tensor>& inA,
                          const std::shared_ptr<kp::Tensor>& inB,
                          const std::shared_ptr<kp::Tensor>& out,
@@ -993,10 +1009,10 @@ static void ggml_vk_mul_mat_mat_f32(kp::Sequence& seq,
         nb1, nb2
     };
 
-    const uint32_t local_x = ggml_vk_current_device().subgroupSize;
+    const uint32_t local_x = ggml_vk_current_device(ctx).subgroupSize;
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__)) {
-        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(),
+    if (!ctx->manager.hasAlgorithm(__func__)) {
+        s_algo = ctx->manager.algorithm<uint32_t, PushConstants>(__func__, ctx->pool.get(),
         {inA, inB, out}, spirv,
         {unsigned(ne01),
          unsigned(ne11),
@@ -1005,19 +1021,20 @@ static void ggml_vk_mul_mat_mat_f32(kp::Sequence& seq,
         {local_x},
         {pushConsts});
     } else {
-        s_algo = komputeManager()->getAlgorithm(__func__);
+        s_algo = ctx->manager.getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({unsigned(ne01),
                               unsigned(ne11),
                               unsigned(std::max(ne12, ne02)),
                               });
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(s_kompute_context->pool.get());
+        s_algo->updateDescriptors(ctx->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
 static void ggml_vk_mul_mat_impl(
+    struct ggml_backend_kompute_context *ctx,
     const std::vector<uint32_t>& spirv, const char * suffix, uint32_t block_size, kp::Sequence& seq,
     const std::shared_ptr<kp::Tensor>& inA,
     const std::shared_ptr<kp::Tensor>& inB,
@@ -1044,44 +1061,45 @@ static void ggml_vk_mul_mat_impl(
 
     auto name = std::string(__func__) + "_" + suffix;
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(name)) {
-        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
-        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)}, {local_x}, {pushConsts});
+    if (!ctx->manager.hasAlgorithm(name)) {
+        const uint32_t local_x = ggml_vk_current_device(ctx).subgroupSize * 2;
+        s_algo = ctx->manager.algorithm<uint32_t, PushConstants>(name, ctx->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)}, {local_x}, {pushConsts});
     } else {
-        s_algo = komputeManager()->getAlgorithm(name);
+        s_algo = ctx->manager.getAlgorithm(name);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(s_kompute_context->pool.get());
+        s_algo->updateDescriptors(ctx->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
 template <typename... Args>
-static void ggml_vk_mul_mat_q4_0(Args&&... args) {
+static void ggml_vk_mul_mat_q4_0(struct ggml_backend_kompute_context *ctx, Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_0_comp_spv,
         kp::shader_data::op_mul_mat_q4_0_comp_spv_len);
 
-    ggml_vk_mul_mat_impl(spirv, "q4_0", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
+    ggml_vk_mul_mat_impl(ctx, spirv, "q4_0", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
 }
 
 template <typename... Args>
-static void ggml_vk_mul_mat_q4_1(Args&&... args) {
+static void ggml_vk_mul_mat_q4_1(struct ggml_backend_kompute_context *ctx, Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_1_comp_spv,
         kp::shader_data::op_mul_mat_q4_1_comp_spv_len);
 
-    ggml_vk_mul_mat_impl(spirv, "q4_1", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
+    ggml_vk_mul_mat_impl(ctx, spirv, "q4_1", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
 }
 
 template <typename... Args>
-static void ggml_vk_mul_mat_q8_0(Args&&... args) {
+static void ggml_vk_mul_mat_q8_0(struct ggml_backend_kompute_context *ctx, Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q8_0_comp_spv,
         kp::shader_data::op_mul_mat_q8_0_comp_spv_len);
 
-    ggml_vk_mul_mat_impl(spirv, "q8_0", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
+    ggml_vk_mul_mat_impl(ctx, spirv, "q8_0", 1/*We access blocks unaligned*/, std::forward<Args>(args)...);
 }
 
 static void ggml_vk_mul_mat_q6_k(
+    struct ggml_backend_kompute_context *ctx,
     kp::Sequence& seq,
     const std::shared_ptr<kp::Tensor>& inA,
     const std::shared_ptr<kp::Tensor>& inB,
@@ -1102,20 +1120,21 @@ static void ggml_vk_mul_mat_q6_k(
     };
 
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(__func__)) {
-        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
-        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
+    if (!ctx->manager.hasAlgorithm(__func__)) {
+        const uint32_t local_x = ggml_vk_current_device(ctx).subgroupSize * 2;
+        s_algo = ctx->manager.algorithm<uint32_t, PushConstants>(__func__, ctx->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
     } else {
-        s_algo = komputeManager()->getAlgorithm(__func__);
+        s_algo = ctx->manager.getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(s_kompute_context->pool.get());
+        s_algo->updateDescriptors(ctx->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
 static void ggml_vk_get_rows(
+    struct ggml_backend_kompute_context *ctx,
     const std::vector<uint32_t>& spirv,
     const char * suffix,
     unsigned element_size, unsigned qk,
@@ -1141,58 +1160,59 @@ static void ggml_vk_get_rows(
 
     auto name = std::string(__func__) + "_" + suffix;
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(name)) {
-        s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
+    if (!ctx->manager.hasAlgorithm(name)) {
+        s_algo = ctx->manager.algorithm<float, PushConstants>(name, ctx->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts});
     } else {
-        s_algo = komputeManager()->getAlgorithm(name);
+        s_algo = ctx->manager.getAlgorithm(name);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({size});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(s_kompute_context->pool.get());
+        s_algo->updateDescriptors(ctx->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
 template <typename... Args>
-static void ggml_vk_get_rows_f32(Args&&... args) {
+static void ggml_vk_get_rows_f32(struct ggml_backend_kompute_context *ctx, Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f32_comp_spv,
         kp::shader_data::op_getrows_f32_comp_spv_len);
 
-    ggml_vk_get_rows(spirv, "f32", sizeof(float), 0, std::forward<Args>(args)...);
+    ggml_vk_get_rows(ctx, spirv, "f32", sizeof(float), 0, std::forward<Args>(args)...);
 }
 
 template <typename... Args>
-static void ggml_vk_get_rows_f16(Args&&... args) {
+static void ggml_vk_get_rows_f16(struct ggml_backend_kompute_context *ctx, Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f16_comp_spv,
         kp::shader_data::op_getrows_f16_comp_spv_len);
 
-    ggml_vk_get_rows(spirv, "f16", sizeof(half), 0, std::forward<Args>(args)...);
+    ggml_vk_get_rows(ctx, spirv, "f16", sizeof(half), 0, std::forward<Args>(args)...);
 }
 
 template <typename... Args>
-static void ggml_vk_get_rows_q4_0(Args&&... args) {
+static void ggml_vk_get_rows_q4_0(struct ggml_backend_kompute_context *ctx, Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_0_comp_spv,
         kp::shader_data::op_getrows_q4_0_comp_spv_len);
 
-    ggml_vk_get_rows(spirv, "q4_0", 1/*We access blocks unaligned*/, QK4_0, std::forward<Args>(args)...);
+    ggml_vk_get_rows(ctx, spirv, "q4_0", 1/*We access blocks unaligned*/, QK4_0, std::forward<Args>(args)...);
 }
 
 template <typename... Args>
-static void ggml_vk_get_rows_q4_1(Args&&... args) {
+static void ggml_vk_get_rows_q4_1(struct ggml_backend_kompute_context *ctx, Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_1_comp_spv,
         kp::shader_data::op_getrows_q4_1_comp_spv_len);
 
-    ggml_vk_get_rows(spirv, "q4_1", 1/*We access blocks unaligned*/, QK4_1, std::forward<Args>(args)...);
+    ggml_vk_get_rows(ctx, spirv, "q4_1", 1/*We access blocks unaligned*/, QK4_1, std::forward<Args>(args)...);
 }
 
 template <typename... Args>
-static void ggml_vk_get_rows_q6_k(Args&&... args) {
+static void ggml_vk_get_rows_q6_k(struct ggml_backend_kompute_context *ctx, Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q6_k_comp_spv,
         kp::shader_data::op_getrows_q6_k_comp_spv_len);
-    ggml_vk_get_rows(spirv, "q6_k", 1/*We access blocks unaligned*/, QK_NL, std::forward<Args>(args)...);
+    ggml_vk_get_rows(ctx, spirv, "q6_k", 1/*We access blocks unaligned*/, QK_NL, std::forward<Args>(args)...);
 }
 
 static void ggml_vk_rope(
+    struct ggml_backend_kompute_context *ctx,
     kp::Sequence& seq,
     const std::shared_ptr<kp::Tensor>& inA,
     const std::shared_ptr<kp::Tensor>& inB,
@@ -1243,23 +1263,24 @@ static void ggml_vk_rope(
 
     auto name = std::string(__func__) + (src0t == GGML_TYPE_F16 ? "_f16" : "_f32");
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(name)) {
-        s_algo = komputeManager()->algorithm<float, PushConstants>(
-            name, s_kompute_context->pool.get(), {inA, inB, out},
+    if (!ctx->manager.hasAlgorithm(name)) {
+        s_algo = ctx->manager.algorithm<float, PushConstants>(
+            name, ctx->pool.get(), {inA, inB, out},
             src0t == GGML_TYPE_F16 ? spirv_f16 : spirv_f32,
             {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}
         );
     } else {
-        s_algo = komputeManager()->getAlgorithm(name);
+        s_algo = ctx->manager.getAlgorithm(name);
         s_algo->setTensors({inA, inB, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(s_kompute_context->pool.get());
+        s_algo->updateDescriptors(ctx->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
 static void ggml_vk_cpy(
+    struct ggml_backend_kompute_context *ctx,
     const std::vector<uint32_t>& spirv,
     uint32_t in_element_size, uint32_t out_element_size,
     kp::Sequence& seq,
@@ -1289,44 +1310,44 @@ static void ggml_vk_cpy(
                        + "_i_" + std::to_string(in_element_size)
                        + "_o_" + std::to_string(out_element_size);
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
-    if (!komputeManager()->hasAlgorithm(name))
-        s_algo = komputeManager()->algorithm<float, PushConstants>(name, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
+    if (!ctx->manager.hasAlgorithm(name))
+        s_algo = ctx->manager.algorithm<float, PushConstants>(name, ctx->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts});
     else {
-        s_algo = komputeManager()->getAlgorithm(name);
+        s_algo = ctx->manager.getAlgorithm(name);
         s_algo->setTensors({in, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
-        s_algo->updateDescriptors(s_kompute_context->pool.get());
+        s_algo->updateDescriptors(ctx->pool.get());
     }
     seq.record<kp::OpAlgoDispatch>(s_algo);
 }
 
 template <typename... Args>
-static void ggml_vk_cpy_f32_f16(Args&&... args) {
+static void ggml_vk_cpy_f32_f16(struct ggml_backend_kompute_context *ctx, Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f16_comp_spv,
         kp::shader_data::op_cpy_f32_f16_comp_spv_len);
-    ggml_vk_cpy(spirv, 4, 2, std::forward<Args>(args)...);
+    ggml_vk_cpy(ctx, spirv, 4, 2, std::forward<Args>(args)...);
 }
 
 template <typename... Args>
-static void ggml_vk_cpy_f32_f32(Args&&... args) {
+static void ggml_vk_cpy_f32_f32(struct ggml_backend_kompute_context *ctx, Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f32_comp_spv,
         kp::shader_data::op_cpy_f32_f32_comp_spv_len);
-    ggml_vk_cpy(spirv, 4, 4, std::forward<Args>(args)...);
+    ggml_vk_cpy(ctx, spirv, 4, 4, std::forward<Args>(args)...);
 }
 
 template <typename... Args>
-static void ggml_vk_cpy_f16_f16(Args&&... args) {
+static void ggml_vk_cpy_f16_f16(struct ggml_backend_kompute_context *ctx, Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f16_comp_spv,
         kp::shader_data::op_cpy_f16_f16_comp_spv_len);
-    ggml_vk_cpy(spirv, 2, 2, std::forward<Args>(args)...);
+    ggml_vk_cpy(ctx, spirv, 2, 2, std::forward<Args>(args)...);
 }
 
 template <typename... Args>
-static void ggml_vk_cpy_f16_f32(Args&&... args) {
+static void ggml_vk_cpy_f16_f32(struct ggml_backend_kompute_context *ctx, Args&&... args) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f32_comp_spv,
         kp::shader_data::op_cpy_f16_f32_comp_spv_len);
-    ggml_vk_cpy(spirv, 2, 4, std::forward<Args>(args)...);
+    ggml_vk_cpy(ctx, spirv, 2, 4, std::forward<Args>(args)...);
 }
 
 static bool ggml_vk_supports_op(const struct ggml_tensor * op) {
@@ -1428,7 +1449,7 @@ static void ggml_vk_graph_compute(struct ggml_backend_kompute_context * ctx, str
     std::vector<std::shared_ptr<kp::Sequence>> sequences(n_seq);
 
     for (auto& sequence : sequences) {
-        sequence = komputeManager()->sequence();
+        sequence = ctx->manager.sequence();
     }
     for (int seq_idx = 0; seq_idx < n_seq; ++seq_idx) {
         const int n_nodes_per_seq = (gf->n_nodes + n_seq - 1) / n_seq;
@@ -1507,19 +1528,19 @@ static void ggml_vk_graph_compute(struct ggml_backend_kompute_context * ctx, str
             uint32_t off_src0 = 0;
             uint32_t off_src1 = 0;
             uint32_t off_dst  = 0;
-            const std::shared_ptr<kp::Tensor>& id_src0 = src0 ? ggml_vk_get_tensor(src0, &off_src0) : nullTensor;
-            const std::shared_ptr<kp::Tensor>& id_src1 = src1 ? ggml_vk_get_tensor(src1, &off_src1) : nullTensor;
-            const std::shared_ptr<kp::Tensor>& id_dst  = dst  ? ggml_vk_get_tensor(dst,  &off_dst)  : nullTensor;
+            const std::shared_ptr<kp::Tensor>& id_src0 = src0 ? ggml_vk_get_tensor(ctx, src0, &off_src0) : nullTensor;
+            const std::shared_ptr<kp::Tensor>& id_src1 = src1 ? ggml_vk_get_tensor(ctx, src1, &off_src1) : nullTensor;
+            const std::shared_ptr<kp::Tensor>& id_dst  = dst  ? ggml_vk_get_tensor(ctx, dst,  &off_dst)  : nullTensor;
 
             switch (dst->op) {
                 case GGML_OP_ADD:
                     {
                         if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
                             // src1 is a row
-                            ggml_vk_addrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4, ne00);
+                            ggml_vk_addrow(ctx, seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4, ne00);
                         } else {
                             ggml_vk_add(
-                                seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
+                                ctx, seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
                                 ne00, ne01, ne02, ne03,
                                 nb00, nb01, nb02, nb03,
                                 ne10, ne11, ne12, ne13,
@@ -1532,7 +1553,7 @@ static void ggml_vk_graph_compute(struct ggml_backend_kompute_context * ctx, str
                 case GGML_OP_MUL:
                     {
                         ggml_vk_mul(
-                            seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
+                            ctx, seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
                             ne00, ne01, ne02, ne03,
                             nb00, nb01, nb02, nb03,
                             ne10, ne11, ne12, ne13,
@@ -1545,7 +1566,7 @@ static void ggml_vk_graph_compute(struct ggml_backend_kompute_context * ctx, str
                     {
                         float scale; memcpy(&scale, dst->op_params, sizeof(float));
 
-                        ggml_vk_scale(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst), scale);
+                        ggml_vk_scale(ctx, seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst), scale);
                     } break;
                 case GGML_OP_UNARY:
                     {
@@ -1554,16 +1575,16 @@ static void ggml_vk_graph_compute(struct ggml_backend_kompute_context * ctx, str
                         switch (ggml_get_unary_op(gf->nodes[i])) {
                             case GGML_UNARY_OP_SILU:
                                 {
-                                    ggml_vk_silu(seq, id_src0, id_dst, off_src0, off_dst, n/4);
+                                    ggml_vk_silu(ctx, seq, id_src0, id_dst, off_src0, off_dst, n/4);
                                 } break;
                             case GGML_UNARY_OP_RELU:
                                 {
-                                    ggml_vk_relu(seq, id_src0, id_dst, off_src0, off_dst, n/4);
+                                    ggml_vk_relu(ctx, seq, id_src0, id_dst, off_src0, off_dst, n/4);
                                 } break;
                             case GGML_UNARY_OP_GELU:
                                 {
                                     GGML_ASSERT(n % 8 == 0);
-                                    ggml_vk_gelu(seq, id_src0, id_dst, off_src0, off_dst, n/8);
+                                    ggml_vk_gelu(ctx, seq, id_src0, id_dst, off_src0, off_dst, n/8);
                                 } break;
                             default:
                                 {
@@ -1588,18 +1609,18 @@ static void ggml_vk_graph_compute(struct ggml_backend_kompute_context * ctx, str
 #pragma message("ref:  https://github.com/ggerganov/llama.cpp/pull/7192")
                         GGML_ASSERT(max_bias == 0.0f);
 
-                        ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
+                        ggml_vk_soft_max(ctx, seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
                     } break;
                 case GGML_OP_DIAG_MASK_INF:
                     {
                         const int n_past = ((int32_t *)(dst->op_params))[0];
-                        ggml_vk_diag_mask_inf(seq, id_src0, id_dst, off_src0, off_dst, n_past, ne00, ne01, ne02);
+                        ggml_vk_diag_mask_inf(ctx, seq, id_src0, id_dst, off_src0, off_dst, n_past, ne00, ne01, ne02);
                     } break;
                 case GGML_OP_NORM:
                     {
                         float eps;
                         memcpy(&eps, dst->op_params, sizeof(float));
-                        ggml_vk_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0), eps);
+                        ggml_vk_norm(ctx, seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0), eps);
                     } break;
                 case GGML_OP_RMS_NORM:
                     {
@@ -1607,7 +1628,7 @@ static void ggml_vk_graph_compute(struct ggml_backend_kompute_context * ctx, str
 
                         float eps;
                         memcpy(&eps, dst->op_params, sizeof(float));
-                        ggml_vk_rms_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0), eps);
+                        ggml_vk_rms_norm(ctx, seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0), eps);
                     } break;
                 case GGML_OP_MUL_MAT:
                     {
@@ -1633,38 +1654,38 @@ static void ggml_vk_graph_compute(struct ggml_backend_kompute_context * ctx, str
                         switch (src0t) {
                             case GGML_TYPE_F32:
                                 ggml_vk_mul_mat_mat_f32(
-                                    seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
+                                    ctx, seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
                                     ne00, ne01, ne02, nb01, nb02, ne11, ne12, nb11, nb12, nb1, nb2
                                 );
                                 break;
                             case GGML_TYPE_F16:
                                 ggml_vk_mul_mat_f16(
-                                    seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
+                                    ctx, seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
                                     ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, ne13, nb10, nb11, nb12,
                                     ne0, ne1, r2, r3
                                 );
                                 break;
                             case GGML_TYPE_Q8_0:
                                 ggml_vk_mul_mat_q8_0(
-                                    seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
+                                    ctx, seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
                                     ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3
                                 );
                                 break;
                             case GGML_TYPE_Q4_0:
                                 ggml_vk_mul_mat_q4_0(
-                                    seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
+                                    ctx, seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
                                     ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3
                                 );
                                 break;
                             case GGML_TYPE_Q4_1:
                                 ggml_vk_mul_mat_q4_1(
-                                    seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
+                                    ctx, seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
                                     ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3
                                 );
                                 break;
                             case GGML_TYPE_Q6_K:
                                 ggml_vk_mul_mat_q6_k(
-                                    seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
+                                    ctx, seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
                                     ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02
                                 );
                                 break;
@@ -1678,15 +1699,15 @@ static void ggml_vk_graph_compute(struct ggml_backend_kompute_context * ctx, str
                 case GGML_OP_GET_ROWS:
                     {
                         if (src0t == GGML_TYPE_F32) {
-                            ggml_vk_get_rows_f32(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
+                            ggml_vk_get_rows_f32(ctx, seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
                         } else if (src0t == GGML_TYPE_F16) {
-                            ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
+                            ggml_vk_get_rows_f16(ctx, seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
                         } else if (src0t == GGML_TYPE_Q4_0) {
-                            ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
+                            ggml_vk_get_rows_q4_0(ctx, seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
                         } else if (src0t == GGML_TYPE_Q4_1) {
-                            ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
+                            ggml_vk_get_rows_q4_1(ctx, seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
                         } else if (src0t == GGML_TYPE_Q6_K) {
-                            ggml_vk_get_rows_q6_k(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
+                            ggml_vk_get_rows_q6_k(ctx, seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1));
                         } else {
                             fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0t);
                             goto not_implemented;
@@ -1717,7 +1738,7 @@ static void ggml_vk_graph_compute(struct ggml_backend_kompute_context * ctx, str
                         memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
                         memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
                         ggml_vk_rope(
-                            seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode, n_ctx_orig,
+                            ctx, seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode, n_ctx_orig,
                             freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
                             ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3
                         );
@@ -1730,16 +1751,16 @@ static void ggml_vk_graph_compute(struct ggml_backend_kompute_context * ctx, str
                             case GGML_TYPE_F32:
                                 {
                                     switch (dstt) {
-                                        case GGML_TYPE_F16: ggml_vk_cpy_f32_f16(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
-                                        case GGML_TYPE_F32: ggml_vk_cpy_f32_f32(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
+                                        case GGML_TYPE_F16: ggml_vk_cpy_f32_f16(ctx, seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
+                                        case GGML_TYPE_F32: ggml_vk_cpy_f32_f32(ctx, seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
                                         default: goto not_implemented;
                                     }
                                 } break;
                             case GGML_TYPE_F16:
                                 {
                                     switch (dstt) {
-                                        case GGML_TYPE_F16: ggml_vk_cpy_f16_f16(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
-                                        case GGML_TYPE_F32: ggml_vk_cpy_f16_f32(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
+                                        case GGML_TYPE_F16: ggml_vk_cpy_f16_f16(ctx, seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
+                                        case GGML_TYPE_F32: ggml_vk_cpy_f16_f32(ctx, seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break;
                                     default: goto not_implemented;
                                 } break;
                             default: goto not_implemented;
@@ -1802,7 +1823,7 @@ static void ggml_backend_kompute_device_ref(ggml_backend_buffer_type_t buft) {
     auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
 
     if (!ctx->device_ref) {
-        komputeManager()->initializeDevice(
+        s_kompute_context->manager.initializeDevice(
             ctx->device, {}, {
                 "VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage",
                 "VK_KHR_16bit_storage", "VK_KHR_shader_non_semantic_info"
@@ -1810,7 +1831,7 @@ static void ggml_backend_kompute_device_ref(ggml_backend_buffer_type_t buft) {
         );
     }
 
-    assert(ggml_vk_has_device());
+    assert(ggml_vk_has_device(s_kompute_context));
     ctx->device_ref++;
 }
 
@@ -1822,7 +1843,7 @@ static void ggml_backend_kompute_device_unref(ggml_backend_buffer_type_t buft) {
     ctx->device_ref--;
 
     if (!ctx->device_ref) {
-        komputeManager.destroy();
+        s_kompute_context->manager.destroy();
     }
 }
 
@@ -1833,8 +1854,8 @@ static const char * ggml_backend_kompute_buffer_get_name(ggml_backend_buffer_t b
 
 static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     auto * ctx = static_cast<ggml_backend_kompute_buffer_context *>(buffer->context);
-    if (ggml_vk_has_device()) {
-        ggml_vk_free_memory(ctx->memory);
+    if (ggml_vk_has_device(s_kompute_context)) {
+        ggml_vk_free_memory(s_kompute_context, ctx->memory);
     }
     delete ctx;
 }
@@ -1847,21 +1868,21 @@ static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer)
 static void ggml_backend_kompute_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     GGML_UNUSED(buffer);
 
-    const auto res = ggml_vk_get_tensor(tensor);
+    const auto res = ggml_vk_get_tensor(s_kompute_context, tensor);
     GGML_ASSERT(res);
 
     memcpy((char *)tensor->data + offset, data, size);
 
-    komputeManager()->sequence()->eval<kp::OpTensorSyncDevice>({res});
+    s_kompute_context->manager.sequence()->eval<kp::OpTensorSyncDevice>({res});
 }
 
 static void ggml_backend_kompute_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     GGML_UNUSED(buffer);
 
-    const auto res = ggml_vk_get_tensor(tensor);
+    const auto res = ggml_vk_get_tensor(s_kompute_context, tensor);
     GGML_ASSERT(res);
 
-    komputeManager()->sequence()->eval<kp::OpTensorSyncLocal>({res});
+    s_kompute_context->manager.sequence()->eval<kp::OpTensorSyncLocal>({res});
 
     memcpy(data, (const char *)tensor->data + offset, size);
 }
@@ -1871,7 +1892,7 @@ static void ggml_backend_kompute_buffer_clear(ggml_backend_buffer_t buffer, uint
     memset(ctx->memory.data, value, ctx->memory.size);
 
     if (ctx->memory.stagingBuffer)
-        komputeManager()->sequence()->eval<kp::OpBufferSyncDevice>(ctx->memory.primaryBuffer, ctx->memory.stagingBuffer, ctx->memory.size);
+        s_kompute_context->manager.sequence()->eval<kp::OpBufferSyncDevice>(ctx->memory.primaryBuffer, ctx->memory.stagingBuffer, ctx->memory.size);
 }
 
 static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = {
@@ -1896,7 +1917,7 @@ static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffe
 static ggml_backend_buffer_t ggml_backend_kompute_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     ggml_backend_kompute_device_ref(buft);
     auto * ctx = new ggml_backend_kompute_buffer_context;
-    ctx->memory = ggml_vk_allocate(size);
+    ctx->memory = ggml_vk_allocate(s_kompute_context, size);
     return ggml_backend_buffer_init(buft, ggml_backend_kompute_buffer_i, ctx, size);
 }
 

From 3676778e82039c953917cd366a7c90741fae3965 Mon Sep 17 00:00:00 2001
From: Cong Liu <liucong@kylinos.cn>
Date: Wed, 21 Aug 2024 15:38:51 +0800
Subject: [PATCH 05/10] ggml/kompute: Implement ggml_backend_i.offload_op
 interface

Signed-off-by: Cong Liu <liucong@kylinos.cn>
---
 ggml/src/ggml-kompute.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp
index 911827dbdb76e..602d9e97e08cd 100644
--- a/ggml/src/ggml-kompute.cpp
+++ b/ggml/src/ggml-kompute.cpp
@@ -2002,6 +2002,14 @@ static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_back
     return &ctx->buft == buft;
 }
 
+// Offload hint: true when ne[1] (any op but GET_ROWS) or ne[2] (MUL_MAT_ID only) reaches min_batch_size.
+static bool ggml_backend_kompute_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
+    GGML_UNUSED(backend);
+    const int  min_batch_size = 32;
+    const bool wide_ne1 = op->op != GGML_OP_GET_ROWS && op->ne[1] >= min_batch_size;
+    return wide_ne1 || (op->op == GGML_OP_MUL_MAT_ID && op->ne[2] >= min_batch_size);
+}
+
 static struct ggml_backend_i kompute_backend_i = {
     /* .get_name                = */ ggml_backend_kompute_name,
     /* .free                    = */ ggml_backend_kompute_free,
@@ -2017,7 +2025,7 @@ static struct ggml_backend_i kompute_backend_i = {
     /* .graph_compute           = */ ggml_backend_kompute_graph_compute,
     /* .supports_op             = */ ggml_backend_kompute_supports_op,
     /* .supports_buft           = */ ggml_backend_kompute_supports_buft,
-    /* .offload_op              = */ NULL,
+    /* .offload_op              = */ ggml_backend_kompute_offload_op,
     /* .event_new               = */ NULL,
     /* .event_free              = */ NULL,
     /* .event_record            = */ NULL,

From f57f8cb3da7f2fdf86f86d58a02a4422a21ef7d8 Mon Sep 17 00:00:00 2001
From: Cong Liu <liucong@kylinos.cn>
Date: Wed, 21 Aug 2024 16:06:00 +0800
Subject: [PATCH 06/10] ggml/kompute: Reimplement kompute_manager

Signed-off-by: Cong Liu <liucong@kylinos.cn>
---
 ggml/include/ggml-kompute.h |   2 +
 ggml/src/ggml-kompute.cpp   | 225 ++++++++++++++++++++++++++----------
 2 files changed, 167 insertions(+), 60 deletions(-)

diff --git a/ggml/include/ggml-kompute.h b/ggml/include/ggml-kompute.h
index b90143c070356..5f3f5e8d496a5 100644
--- a/ggml/include/ggml-kompute.h
+++ b/ggml/include/ggml-kompute.h
@@ -11,6 +11,8 @@
 extern "C" {
 #endif
 
+#define GGML_KOMPUTE_MAX_DEVICES 16
+
 struct ggml_vk_device {
     int index;
     int type; // same as VkPhysicalDeviceType
diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp
index 602d9e97e08cd..82f681f2df3f3 100644
--- a/ggml/src/ggml-kompute.cpp
+++ b/ggml/src/ggml-kompute.cpp
@@ -85,28 +85,18 @@ struct ggml_backend_kompute_context {
 // is only created when a device is set and vulkan is explicitly turned on.
 static ggml_backend_kompute_context *s_kompute_context = nullptr;
 
-class kompute_manager {
-    kp::Manager *s_mgr = nullptr;
 
-public:
-    kp::Manager *operator()() {
-        if (s_mgr && !s_mgr->hasInstance()) {
-            destroy();
-        }
-        if (!s_mgr) {
-            s_mgr = new kp::Manager;
-        }
-        return s_mgr;
-    }
+struct ggml_backend_kompute_buffer_type_context {
+    int         device;
+    int         device_ref = 0;
+    uint64_t    buffer_alignment;
+    uint64_t    max_alloc;
+    std::string name;
 
-    void destroy() {
-        delete s_mgr;
-        s_mgr = nullptr;
-    }
+    ggml_backend_kompute_buffer_type_context(int device, uint64_t buffer_alignment, uint64_t max_alloc)
+        : device(device), buffer_alignment(buffer_alignment), max_alloc(max_alloc), name(ggml_kompute_format_name(device)) {}
 };
 
-static kompute_manager komputeManager;
-
 struct ggml_vk_memory {
     void *data = nullptr;
     size_t size = 0;
@@ -120,6 +110,61 @@ struct ggml_backend_kompute_buffer_context {
     struct ggml_vk_memory memory;
 };
 
+// Owns the per-device backends plus a kp::Manager used only for global queries.
+class kompute_manager {
+public:
+    kompute_manager();
+    ~kompute_manager();
+    // Non-copyable: the destructor deletes m_kp_manager and every stored backend.
+    kompute_manager(const kompute_manager &) = delete;
+    kompute_manager & operator=(const kompute_manager &) = delete;
+    kp::Manager *get_kp_manager(void);            // created lazily on first call
+    ggml_backend_t create_backend(int device);    // returns the existing backend if any
+    void destroy_backend(ggml_backend_t backend); // no-op for nullptr / unknown backend
+    ggml_backend_t get_backend(int device);       // nullptr if absent or out of range
+private:
+    kp::Manager *m_kp_manager;              // only for global queries, not for creating devices
+    std::vector<ggml_backend_t> m_backends; // slot i holds the backend for device i
+};
+
+
+static kompute_manager komputeManager; // file-level instance behind kompute_backend() and the init/free entry points
+
+
+// Backend currently registered for `device`, or nullptr (see kompute_manager::get_backend).
+static ggml_backend_t kompute_backend(int device) {
+    return komputeManager.get_backend(device);
+}
+
+// Backend for the device index recorded in the buffer type's context.
+static ggml_backend_t kompute_backend(ggml_backend_buffer_type_t buffer_type) {
+    const int device = static_cast<ggml_backend_kompute_buffer_type_context *>(buffer_type->context)->device;
+    return kompute_backend(device);
+}
+
+// Backend for `buffer`, resolved through its buffer type (buffer->buft).
+static ggml_backend_t kompute_backend(ggml_backend_buffer_t buffer) {
+    return kompute_backend(buffer->buft);
+}
+
+// Context of the backend registered for `device`; nullptr when there is no such backend.
+static ggml_backend_kompute_context *kompute_backend_context(int device) {
+    ggml_backend_t found = kompute_backend(device);
+    return found == nullptr ? nullptr : static_cast<ggml_backend_kompute_context *>(found->context);
+}
+
+// Context of the backend that `buffer` resolves to; nullptr when that backend is not registered.
+static ggml_backend_kompute_context *kompute_backend_context(ggml_backend_buffer_t buffer) {
+    ggml_backend_t found = kompute_backend(buffer);
+    return found == nullptr ? nullptr : static_cast<ggml_backend_kompute_context *>(found->context);
+}
+
+// Context of the backend that `buffer_type` resolves to; nullptr when that backend is not registered.
+static ggml_backend_kompute_context *kompute_backend_context(ggml_backend_buffer_type_t buffer_type) {
+    ggml_backend_t found = kompute_backend(buffer_type);
+    return found == nullptr ? nullptr : static_cast<ggml_backend_kompute_context *>(found->context);
+}
+
 #ifdef __linux__
 __attribute__((constructor))
 static void enable_sam() {
@@ -175,12 +220,12 @@ static const char * ggml_vk_getVendorName(uint32_t vendorID) {
 
 static std::vector<ggml_vk_device> ggml_vk_available_devices_internal(size_t memoryRequired) {
     std::vector<ggml_vk_device> results;
-    if (!komputeManager()->hasVulkan() || !komputeManager()->hasInstance())
+    if (!komputeManager.get_kp_manager()->hasVulkan() || !komputeManager.get_kp_manager()->hasInstance())
         return results;
 
     std::vector<vk::PhysicalDevice> physical_devices;
     try {
-        physical_devices = komputeManager()->listDevices();
+        physical_devices = komputeManager.get_kp_manager()->listDevices();
     } catch (vk::SystemError & err) {
         std::cerr << __func__ << ": ignoring Vulkan exception: " << err.what() << "\n";
         return results;
@@ -338,7 +383,7 @@ bool ggml_vk_get_device(ggml_vk_device * device, size_t memoryRequired, const ch
 }
 
 bool ggml_vk_has_vulkan() {
-    return komputeManager()->hasVulkan();
+    return komputeManager.get_kp_manager()->hasVulkan();
 }
 
 static bool ggml_vk_has_device(struct ggml_backend_kompute_context *ctx) {
@@ -1808,16 +1853,6 @@ kp::TensorT<uint8_t>::dataType()
 
 // backend interface
 
-struct ggml_backend_kompute_buffer_type_context {
-    int         device;
-    int         device_ref = 0;
-    uint64_t    buffer_alignment;
-    uint64_t    max_alloc;
-    std::string name;
-
-    ggml_backend_kompute_buffer_type_context(int device, uint64_t buffer_alignment, uint64_t max_alloc)
-        : device(device), buffer_alignment(buffer_alignment), max_alloc(max_alloc), name(ggml_kompute_format_name(device)) {}
-};
 
 static void ggml_backend_kompute_device_ref(ggml_backend_buffer_type_t buft) {
     auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
@@ -1854,8 +1889,9 @@ static const char * ggml_backend_kompute_buffer_get_name(ggml_backend_buffer_t b
 
 static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     auto * ctx = static_cast<ggml_backend_kompute_buffer_context *>(buffer->context);
-    if (ggml_vk_has_device(s_kompute_context)) {
-        ggml_vk_free_memory(s_kompute_context, ctx->memory);
+    auto * backend_ctx = kompute_backend_context(buffer);
+    if (backend_ctx && ggml_vk_has_device(backend_ctx)) {
+        ggml_vk_free_memory(backend_ctx, ctx->memory);
     }
     delete ctx;
 }
@@ -1866,33 +1902,34 @@ static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer)
 }
 
 static void ggml_backend_kompute_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_UNUSED(buffer);
+    auto * backend_ctx = kompute_backend_context(buffer);
 
-    const auto res = ggml_vk_get_tensor(s_kompute_context, tensor);
+    const auto res = ggml_vk_get_tensor(backend_ctx, tensor);
     GGML_ASSERT(res);
 
     memcpy((char *)tensor->data + offset, data, size);
 
-    s_kompute_context->manager.sequence()->eval<kp::OpTensorSyncDevice>({res});
+    backend_ctx->manager.sequence()->eval<kp::OpTensorSyncDevice>({res});
 }
 
 static void ggml_backend_kompute_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_UNUSED(buffer);
+    auto * backend_ctx = kompute_backend_context(buffer);
 
-    const auto res = ggml_vk_get_tensor(s_kompute_context, tensor);
+    const auto res = ggml_vk_get_tensor(backend_ctx, tensor);
     GGML_ASSERT(res);
 
-    s_kompute_context->manager.sequence()->eval<kp::OpTensorSyncLocal>({res});
+    backend_ctx->manager.sequence()->eval<kp::OpTensorSyncLocal>({res});
 
     memcpy(data, (const char *)tensor->data + offset, size);
 }
 
 static void ggml_backend_kompute_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     auto * ctx = static_cast<ggml_backend_kompute_buffer_context *>(buffer->context);
+    auto * backend_ctx = kompute_backend_context(buffer);
     memset(ctx->memory.data, value, ctx->memory.size);
 
     if (ctx->memory.stagingBuffer)
-        s_kompute_context->manager.sequence()->eval<kp::OpBufferSyncDevice>(ctx->memory.primaryBuffer, ctx->memory.stagingBuffer, ctx->memory.size);
+        backend_ctx->manager.sequence()->eval<kp::OpBufferSyncDevice>(ctx->memory.primaryBuffer, ctx->memory.stagingBuffer, ctx->memory.size);
 }
 
 static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = {
@@ -1915,9 +1952,9 @@ static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffe
 }
 
 static ggml_backend_buffer_t ggml_backend_kompute_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    ggml_backend_kompute_device_ref(buft);
+    auto * backend_ctx = kompute_backend_context(buft);
     auto * ctx = new ggml_backend_kompute_buffer_context;
-    ctx->memory = ggml_vk_allocate(s_kompute_context, size);
+    ctx->memory = ggml_vk_allocate(backend_ctx, size);
     return ggml_backend_buffer_init(buft, ggml_backend_kompute_buffer_i, ctx, size);
 }
 
@@ -1941,10 +1978,9 @@ static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
 };
 
 ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device) {
-    if (!s_kompute_context)
-	    s_kompute_context = new ggml_backend_kompute_context(device);
+    auto * backend = komputeManager.create_backend(device);
+    auto * buft = &(static_cast<ggml_backend_kompute_context *>(backend->context))->buft;
 
-    auto * buft = &s_kompute_context->buft;
     if (!buft->context) {
         auto devices = ggml_vk_available_devices_internal(0);
         for (std::size_t i = 0; i < devices.size(); i++) {
@@ -1970,15 +2006,7 @@ static const char * ggml_backend_kompute_name(ggml_backend_t backend) {
 }
 
 static void ggml_backend_kompute_free(ggml_backend_t backend) {
-    auto * ctx = static_cast<ggml_backend_kompute_context *>(backend->context);
-
-    assert(ctx == s_kompute_context);
-    s_kompute_context = nullptr;
-    if (ctx != nullptr) {
-        delete ctx;
-    }
-
-    delete backend;
+    komputeManager.destroy_backend(backend);
 }
 
 static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(ggml_backend_t backend) {
@@ -2038,17 +2066,94 @@ static ggml_guid_t ggml_backend_kompute_guid() {
     return &guid;
 }
 
-ggml_backend_t ggml_backend_kompute_init(int device) {
-    if (!s_kompute_context)
-	    s_kompute_context = new ggml_backend_kompute_context(device);
 
-    ggml_backend_t kompute_backend = new ggml_backend {
+
+// Start with no query manager and GGML_KOMPUTE_MAX_DEVICES empty backend slots.
+kompute_manager::kompute_manager()
+    : m_kp_manager(nullptr), m_backends(GGML_KOMPUTE_MAX_DEVICES, nullptr) {
+}
+
+// Release the query manager, then every backend that is still registered.
+kompute_manager::~kompute_manager()
+{
+    delete m_kp_manager;    // deleting nullptr is a no-op, so no guard is needed
+    m_kp_manager = nullptr;
+    // destroy_backend() skips nullptr and only nulls the slot it frees, so the
+    // vector is never resized while we iterate over it.
+    for (ggml_backend_t registered : m_backends) {
+        destroy_backend(registered);
+    }
+}
+
+// Return the kp::Manager used for global queries, creating it on first use.
+kp::Manager * kompute_manager::get_kp_manager(void) {
+    if (m_kp_manager == nullptr) {
+        m_kp_manager = new kp::Manager;
+    }
+    return m_kp_manager;
+}
+
+// Return the backend for `device`, creating and registering it on first request;
+// nullptr when `device` is outside [0, GGML_KOMPUTE_MAX_DEVICES).
+ggml_backend_t kompute_manager::create_backend(int device)
+{
+    if (device < 0 || device >= GGML_KOMPUTE_MAX_DEVICES) return nullptr;
+    // already exists: hand back the registered backend
+    ggml_backend_t backend = get_backend(device);
+    if (backend)
+        return backend;
+
+    // create new one; unique_ptr frees the context if initializeDevice() throws
+    std::unique_ptr<ggml_backend_kompute_context> context(new ggml_backend_kompute_context(device));
+    context->manager.initializeDevice(device, {},
+        {
+            "VK_KHR_shader_float16_int8",
+            "VK_KHR_8bit_storage",
+            "VK_KHR_16bit_storage",
+            "VK_KHR_shader_non_semantic_info"
+        });
+
+    backend = new ggml_backend {
         /* .guid      = */ ggml_backend_kompute_guid(),
         /* .interface = */ kompute_backend_i,
-        /* .context   = */ s_kompute_context,
+        /* .context   = */ context.get(),
     };
 
-    return kompute_backend;
+    context.release(); // the backend owns the context now; destroy_backend() deletes it
+    m_backends[device] = backend;
+    std::cerr << "Kompute: Init device " << device << std::endl;
+
+    return backend;
+}
+
+// Free `backend` and its context if it is registered here; otherwise do nothing.
+void kompute_manager::destroy_backend(ggml_backend_t backend)
+{
+    if (backend == nullptr)
+        return;
+    for (auto & slot : m_backends) {
+        if (slot != backend)
+            continue;
+        // delete the context before the backend object that points to it
+        delete static_cast<ggml_backend_kompute_context *>(backend->context);
+        delete backend;
+        slot = nullptr;
+        return;
+    }
+}
+
+// Backend stored for `device`; nullptr when the index is negative, past the
+// end of the table, or the slot is empty.
+ggml_backend_t kompute_manager::get_backend(int device)
+{
+    const bool in_range = device >= 0 && static_cast<std::size_t>(device) < m_backends.size();
+    return in_range ? m_backends[device] : nullptr;
+}
+
+
+
+ggml_backend_t ggml_backend_kompute_init(int device) {
+    return komputeManager.create_backend(device); // existing backend is reused; nullptr if device is out of range
 }
 
 bool ggml_backend_is_kompute(ggml_backend_t backend) {

From cc9514f9416f75165ac1b582b7d053b9d8cddbf4 Mon Sep 17 00:00:00 2001
From: Feng Jiang <jiangfeng@kylinos.cn>
Date: Wed, 21 Aug 2024 16:41:26 +0800
Subject: [PATCH 07/10] ggml/kompute: Remove unused
 ggml_backend_kompute_device_{ref, unref}()

Signed-off-by: Feng Jiang <jiangfeng@kylinos.cn>
---
 ggml/src/ggml-kompute.cpp | 35 -----------------------------------
 1 file changed, 35 deletions(-)

diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp
index 82f681f2df3f3..c0b4a06000a75 100644
--- a/ggml/src/ggml-kompute.cpp
+++ b/ggml/src/ggml-kompute.cpp
@@ -79,12 +79,6 @@ struct ggml_backend_kompute_context {
         : device(device), name(ggml_kompute_format_name(device)) { buft.context = nullptr; }
 };
 
-// FIXME: It would be good to consolidate the kompute manager and the kompute context into one object
-// and consolidate the init functions and simplify object lifetime management. As it currently stands,
-// we *have* to have the kompute manager no matter what for device discovery, but the kompute context
-// is only created when a device is set and vulkan is explicitly turned on.
-static ggml_backend_kompute_context *s_kompute_context = nullptr;
-
 
 struct ggml_backend_kompute_buffer_type_context {
     int         device;
@@ -1853,35 +1847,6 @@ kp::TensorT<uint8_t>::dataType()
 
 // backend interface
 
-
-static void ggml_backend_kompute_device_ref(ggml_backend_buffer_type_t buft) {
-    auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
-
-    if (!ctx->device_ref) {
-        s_kompute_context->manager.initializeDevice(
-            ctx->device, {}, {
-                "VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage",
-                "VK_KHR_16bit_storage", "VK_KHR_shader_non_semantic_info"
-            }
-        );
-    }
-
-    assert(ggml_vk_has_device(s_kompute_context));
-    ctx->device_ref++;
-}
-
-static void ggml_backend_kompute_device_unref(ggml_backend_buffer_type_t buft) {
-    auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buft->context);
-
-    assert(ctx->device_ref > 0);
-
-    ctx->device_ref--;
-
-    if (!ctx->device_ref) {
-        s_kompute_context->manager.destroy();
-    }
-}
-
 static const char * ggml_backend_kompute_buffer_get_name(ggml_backend_buffer_t buffer) {
     auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buffer->buft->context);
     return ctx->name.c_str();

From 97efd5047a378712b272d76561e4176f5fb4b729 Mon Sep 17 00:00:00 2001
From: Feng Jiang <jiangfeng@kylinos.cn>
Date: Wed, 21 Aug 2024 16:44:12 +0800
Subject: [PATCH 08/10] ggml/kompute: Introduce
 ggml_backend_kompute_get_device_count()

Signed-off-by: Feng Jiang <jiangfeng@kylinos.cn>
---
 ggml/include/ggml-kompute.h | 1 +
 ggml/src/ggml-kompute.cpp   | 5 +++++
 2 files changed, 6 insertions(+)

diff --git a/ggml/include/ggml-kompute.h b/ggml/include/ggml-kompute.h
index 5f3f5e8d496a5..1de2cb4783176 100644
--- a/ggml/include/ggml-kompute.h
+++ b/ggml/include/ggml-kompute.h
@@ -25,6 +25,7 @@ struct ggml_vk_device {
 };
 
 struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
+int ggml_backend_kompute_get_device_count(void);
 bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
 bool ggml_vk_has_vulkan(void);
 
diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp
index c0b4a06000a75..21fe76648638c 100644
--- a/ggml/src/ggml-kompute.cpp
+++ b/ggml/src/ggml-kompute.cpp
@@ -334,6 +334,11 @@ ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count
     return arr;
 }
 
+int ggml_backend_kompute_get_device_count(void) { // device count; the size_t result is narrowed to int
+    auto devices = ggml_vk_available_devices_internal(0); // memoryRequired = 0
+    return devices.size();
+}
+
 static void ggml_vk_filterByVendor(std::vector<ggml_vk_device>& devices, const std::string& targetVendor) {
     devices.erase(
         std::remove_if(devices.begin(), devices.end(),

From 56c5f988eb5bfc35b5b7b70964f81a17a45e5374 Mon Sep 17 00:00:00 2001
From: Feng Jiang <jiangfeng@kylinos.cn>
Date: Wed, 21 Aug 2024 16:44:44 +0800
Subject: [PATCH 09/10] ggml/kompute: Introduce
 ggml_backend_kompute_get_device_memory()

Signed-off-by: Feng Jiang <jiangfeng@kylinos.cn>
---
 ggml/include/ggml-kompute.h |  1 +
 ggml/src/ggml-kompute.cpp   | 13 +++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/ggml/include/ggml-kompute.h b/ggml/include/ggml-kompute.h
index 1de2cb4783176..7d76236142d09 100644
--- a/ggml/include/ggml-kompute.h
+++ b/ggml/include/ggml-kompute.h
@@ -26,6 +26,7 @@ struct ggml_vk_device {
 
 struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
 int ggml_backend_kompute_get_device_count(void);
+void ggml_backend_kompute_get_device_memory(int device, size_t * free, size_t * total);
 bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
 bool ggml_vk_has_vulkan(void);
 
diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp
index 21fe76648638c..dfecf0881023c 100644
--- a/ggml/src/ggml-kompute.cpp
+++ b/ggml/src/ggml-kompute.cpp
@@ -339,6 +339,19 @@ int ggml_backend_kompute_get_device_count(void) {
     return devices.size();
 }
 
+
+// Report memory for `device`: free and total are both set to the device's heapSize.
+void ggml_backend_kompute_get_device_memory(int device, size_t * free, size_t * total) {
+    // Zero both outputs first so an unknown device never leaves them uninitialised.
+    *free = *total = 0;
+    for (const auto & dev : ggml_vk_available_devices_internal(0)) {
+        if (dev.index == device) {
+            *free = *total = dev.heapSize;
+            break;
+        }
+    }
+}
+
 static void ggml_vk_filterByVendor(std::vector<ggml_vk_device>& devices, const std::string& targetVendor) {
     devices.erase(
         std::remove_if(devices.begin(), devices.end(),

From 424e3a52fe464b762a5d972dd730972f57f15f93 Mon Sep 17 00:00:00 2001
From: Feng Jiang <jiangfeng@kylinos.cn>
Date: Wed, 21 Aug 2024 16:52:11 +0800
Subject: [PATCH 10/10] llama/kompute: Add multi-GPU support

Signed-off-by: Feng Jiang <jiangfeng@kylinos.cn>
---
 src/llama.cpp | 39 +++++++++++++++++++++++++++++++++++----
 1 file changed, 35 insertions(+), 4 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index fe3c0db6f2931..28b2ad605e51a 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2856,6 +2856,8 @@ static size_t llama_get_device_count(const llama_model & model) {
     count = ggml_backend_sycl_get_device_count();
 #elif defined(GGML_USE_VULKAN)
     count = ggml_backend_vk_get_device_count();
+#elif defined(GGML_USE_KOMPUTE)
+    count = ggml_backend_kompute_get_device_count();
 #elif defined(GGML_USE_CANN)
     return ggml_backend_cann_get_device_count();
 #endif
@@ -2952,6 +2954,11 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
     size_t free;
     ggml_backend_vk_get_device_memory(device, &free, &total);
     return free;
+#elif defined(GGML_USE_KOMPUTE)
+    size_t total;
+    size_t free;
+    ggml_backend_kompute_get_device_memory(device, &free, &total);
+    return free;
 #elif defined(GGML_USE_CANN)
     size_t total;
     size_t free;
@@ -16899,6 +16906,8 @@ size_t llama_max_devices(void) {
     return GGML_SYCL_MAX_DEVICES;
 #elif defined(GGML_USE_VULKAN)
     return GGML_VK_MAX_DEVICES;
+#elif defined(GGML_USE_KOMPUTE)
+    return GGML_KOMPUTE_MAX_DEVICES;
 #elif defined(GGML_USE_CANN)
     return GGML_CANN_MAX_DEVICES;
 #else
@@ -17234,13 +17243,35 @@ struct llama_context * llama_new_context_with_model(
         }
 #elif defined(GGML_USE_KOMPUTE)
         if (model->n_gpu_layers > 0) {
-            auto * backend = ggml_backend_kompute_init(model->main_gpu);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
+            if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
+                auto * backend = ggml_backend_kompute_init(model->main_gpu);
+                if (!backend) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            } else if (model->split_mode == LLAMA_SPLIT_MODE_LAYER) {
+                size_t count = 0;
+                auto * devices =ggml_vk_available_devices(0, &count);
+                for (size_t i = 0; i < count; i++) {
+                    LLAMA_LOG_INFO("Kompute: Found device #%d, %s, %s, max-alloc %ld, heap-size %lu\n",
+                                    devices[i].index, devices[i].vendor, devices[i].name,
+                                    devices[i].maxAlloc, devices[i].heapSize);
+                    auto * backend = ggml_backend_kompute_init(devices[i].index);
+                    if (!backend) {
+                        LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
+                        llama_free(ctx);
+                        return nullptr;
+                    }
+                    ctx->backends.push_back(backend);
+                }
+                free(devices);
+            } else {
+                LLAMA_LOG_ERROR("%s: Failed to init Kompute backend: split mode %d not supported\n", __func__, model->split_mode);
                 llama_free(ctx);
                 return nullptr;
             }
-            ctx->backends.push_back(backend);
         }
 #elif defined(GGML_USE_CANN)
     // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used