From 63eb6462975618ae0d47cc5d1ff4be6942f95b2b Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Mon, 19 May 2025 17:13:42 -0700 Subject: [PATCH 01/25] Minimal setup of webgpu backend with dawn. Just prints out the adapter and segfaults --- ggml/CMakeLists.txt | 3 + ggml/include/ggml-webgpu.h | 18 ++++ ggml/src/CMakeLists.txt | 1 + ggml/src/ggml-backend-reg.cpp | 7 ++ ggml/src/ggml-webgpu/CMakeLists.txt | 10 +++ ggml/src/ggml-webgpu/ggml-webgpu.cpp | 118 +++++++++++++++++++++++++++ 6 files changed, 157 insertions(+) create mode 100644 ggml/include/ggml-webgpu.h create mode 100644 ggml/src/ggml-webgpu/CMakeLists.txt create mode 100644 ggml/src/ggml-webgpu/ggml-webgpu.cpp diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 4746d5cb76c08..e8a2a8ad77c86 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -180,6 +180,8 @@ option(GGML_VULKAN_PERF "ggml: enable Vulkan perf output" option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF) option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF) option(GGML_KOMPUTE "ggml: use Kompute" OFF) +option(GGML_WEBGPU "ggml: use WebGPU" OFF) +option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF) option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT}) option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF) option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF) @@ -270,6 +272,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-rpc.h include/ggml-sycl.h include/ggml-vulkan.h + include/ggml-webgpu.h include/gguf.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") diff --git a/ggml/include/ggml-webgpu.h b/ggml/include/ggml-webgpu.h new file mode 100644 index 0000000000000..93d5329adeb9b --- /dev/null +++ b/ggml/include/ggml-webgpu.h @@ -0,0 +1,18 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_WEBGPU_NAME "WebGPU" + +GGML_BACKEND_API ggml_backend_t ggml_backend_webgpu_init(void); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_webgpu_reg(void); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index ddea5ad3891e5..0a8f4dca518e0 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -312,6 +312,7 @@ ggml_add_backend(MUSA) ggml_add_backend(RPC) ggml_add_backend(SYCL) ggml_add_backend(Vulkan) +ggml_add_backend(WebGPU) ggml_add_backend(OpenCL) foreach (target ggml-base ggml) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 405d8e31514b5..da8d70dbaf2ee 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -45,6 +45,10 @@ #include "ggml-vulkan.h" #endif +#ifdef GGML_USE_WEBGPU +#include "ggml-webgpu.h" +#endif + #ifdef GGML_USE_OPENCL #include "ggml-opencl.h" #endif @@ -172,6 +176,9 @@ struct ggml_backend_registry { #ifdef GGML_USE_VULKAN register_backend(ggml_backend_vk_reg()); #endif +#ifdef GGML_USE_WEBGPU + register_backend(ggml_backend_webgpu_reg()); +#endif #ifdef GGML_USE_OPENCL register_backend(ggml_backend_opencl_reg()); #endif diff --git a/ggml/src/ggml-webgpu/CMakeLists.txt b/ggml/src/ggml-webgpu/CMakeLists.txt new file mode 100644 index 0000000000000..c8536cc7e7845 --- /dev/null +++ b/ggml/src/ggml-webgpu/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 3.13) + +find_package(Dawn REQUIRED) + +ggml_add_backend_library(ggml-webgpu + ggml-webgpu.cpp + ../../include/ggml-webgpu.h + ) + +target_link_libraries(ggml-webgpu PRIVATE dawn::webgpu_dawn) \ No newline at end of 
file diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp new file mode 100644 index 0000000000000..c109f3889b6ee --- /dev/null +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -0,0 +1,118 @@ +#include "ggml-webgpu.h" + +#include +#include + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" + +#include +#include + +static ggml_backend_i ggml_backend_webgpu_interface = { + /* .get_name = */ NULL, + /* .free = */ NULL, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ NULL, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, +}; + +static ggml_guid_t ggml_backend_webgpu_guid(void) { + static ggml_guid guid = { 0x9a, 0x5f, 0x3c, 0x2d, 0xb7, 0x1e, 0x47, 0xa1, 0x92, 0xcf, 0x16, 0x44, 0x58, 0xee, 0x90, 0x2b }; + return &guid; +} + +// necessary?? +ggml_backend_t ggml_backend_webgpu_init() { + ggml_backend_t webgpu_backend = new ggml_backend { + /* .guid = */ ggml_backend_webgpu_guid(), + /* .interface = */ ggml_backend_webgpu_interface, + /* .device = */ NULL, + /* .context = */ NULL, + }; + + return webgpu_backend; +} + +static const char * ggml_backend_webgpu_reg_get_name(ggml_backend_reg_t reg) { + GGML_UNUSED(reg); + return GGML_WEBGPU_NAME; +} + +// Stub for now +static size_t ggml_backend_webgpu_reg_get_device_count(ggml_backend_reg_t reg) { + GGML_UNUSED(reg); + return 1; +} + +// Stub for now +static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t reg, size_t device) { + static std::vector devices; + return devices[device]; + +} + +static const struct ggml_backend_reg_i ggml_backend_webgpu_reg_i = { + /* .get_name = */ ggml_backend_webgpu_reg_get_name, + /* .get_device_count = */ ggml_backend_webgpu_reg_get_device_count, + /* .get_device = */ ggml_backend_webgpu_reg_get_device, + /* .get_proc_address = */ NULL, +}; + +ggml_backend_reg_t ggml_backend_webgpu_reg() { + static ggml_backend_reg reg = { + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_webgpu_reg_i, + /* .context = */ nullptr, + }; + // need to init webgpu here + wgpu::InstanceDescriptor instanceDescriptor{}; + instanceDescriptor.capabilities.timedWaitAnyEnable = true; + wgpu::Instance instance = wgpu::CreateInstance(&instanceDescriptor); + if (instance == nullptr) { + std::cerr << "Instance creation failed!\n"; + return nullptr; + } + // Synchronously request the adapter. 
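Dawn's RequestAdapter is callback-based; what makes the call below synchronous is creating the instance with timedWaitAnyEnable set and then blocking on the returned future with WaitAny. A minimal standalone sketch of the same pattern, assuming only Dawn's webgpu_cpp.h wrapper (the same calls this patch uses):

    wgpu::InstanceDescriptor desc{};
    desc.capabilities.timedWaitAnyEnable = true;   // required to pass a timeout to WaitAny
    wgpu::Instance inst = wgpu::CreateInstance(&desc);

    wgpu::RequestAdapterOptions options = {};
    wgpu::Adapter result;
    auto future = inst.RequestAdapter(&options, wgpu::CallbackMode::WaitAnyOnly,
        [](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message, void * userdata) {
            if (status == wgpu::RequestAdapterStatus::Success) {
                *static_cast<wgpu::Adapter *>(userdata) = adapter;
            }
        },
        static_cast<void *>(&result));
    inst.WaitAny(future, UINT64_MAX);              // returns once the callback has run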
+ wgpu::RequestAdapterOptions options = {}; + wgpu::Adapter adapter; + + auto callback = [](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char *message, void *userdata) { + if (status != wgpu::RequestAdapterStatus::Success) { + std::cerr << "Failed to get an adapter:" << message; + return; + } + *static_cast(userdata) = adapter; + }; + + + auto callbackMode = wgpu::CallbackMode::WaitAnyOnly; + void *userdata = &adapter; + instance.WaitAny(instance.RequestAdapter(&options, callbackMode, callback, userdata), UINT64_MAX); + if (adapter == nullptr) { + std::cerr << "RequestAdapter failed!\n"; + return nullptr; + } + + wgpu::DawnAdapterPropertiesPowerPreference power_props{}; + + wgpu::AdapterInfo info{}; + info.nextInChain = &power_props; + + adapter.GetInfo(&info); + std::cout << "VendorID: " << std::hex << info.vendorID << std::dec << "\n"; + std::cout << "Vendor: " << info.vendor << "\n"; + std::cout << "Architecture: " << info.architecture << "\n"; + std::cout << "DeviceID: " << std::hex << info.deviceID << std::dec << "\n"; + std::cout << "Name: " << info.device << "\n"; + std::cout << "Driver description: " << info.description << "\n"; + return ® +} From c0a810e816b79ac25d28583547e0a436dc3fbc19 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 20 May 2025 12:18:10 -0700 Subject: [PATCH 02/25] Initialize webgpu device --- ggml/src/ggml-webgpu/CMakeLists.txt | 8 +- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 119 ++++++++++++++++++--------- 2 files changed, 86 insertions(+), 41 deletions(-) diff --git a/ggml/src/ggml-webgpu/CMakeLists.txt b/ggml/src/ggml-webgpu/CMakeLists.txt index c8536cc7e7845..c72b17c82a828 100644 --- a/ggml/src/ggml-webgpu/CMakeLists.txt +++ b/ggml/src/ggml-webgpu/CMakeLists.txt @@ -7,4 +7,10 @@ ggml_add_backend_library(ggml-webgpu ../../include/ggml-webgpu.h ) -target_link_libraries(ggml-webgpu PRIVATE dawn::webgpu_dawn) \ No newline at end of file +if (GGML_WEBGPU_DEBUG) + message(STATUS "GGML_WEBGPU_DEBUG is ON") + target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_DEBUG=1) +endif() + +target_link_libraries(ggml-webgpu PRIVATE dawn::webgpu_dawn) + diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index c109f3889b6ee..f340f56940af2 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -9,6 +9,84 @@ #include #include +#ifdef GGML_WEBGPU_DEBUG +#define WEBGPU_LOG_DEBUG(msg) std::cout << msg << std::endl +#else +#define WEBGPU_LOG_DEBUG(msg) ((void) 0) +#endif // GGML_WEBGPU_DEBUG + + +struct webgpu_context { + wgpu::Instance instance; + // an adapter can only be used to create one device + wgpu::Adapter adapter; + // we only support one device for now + wgpu::Device device; +}; + +static bool webgpu_context_initialized = false; +static webgpu_context webgpu_ctx; + +static void ggml_webgpu_context_init() { + if (webgpu_context_initialized) { + return; + } + WEBGPU_LOG_DEBUG("ggml_webgpu_context_init()"); + + wgpu::InstanceDescriptor instanceDescriptor{}; + instanceDescriptor.capabilities.timedWaitAnyEnable = true; + webgpu_ctx.instance = wgpu::CreateInstance(&instanceDescriptor); + GGML_ASSERT(webgpu_ctx.instance != nullptr); + + wgpu::RequestAdapterOptions options = {}; + wgpu::Adapter adapter; + + auto callback = [](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char *message, void *userdata) { + if (status != wgpu::RequestAdapterStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message); + return; + } + 
*static_cast(userdata) = adapter; + }; + + auto callbackMode = wgpu::CallbackMode::WaitAnyOnly; + void *userdata = &webgpu_ctx.adapter; + webgpu_ctx.instance.WaitAny(webgpu_ctx.instance.RequestAdapter(&options, callbackMode, callback, userdata), UINT64_MAX); + GGML_ASSERT(webgpu_ctx.adapter != nullptr); + + wgpu::DeviceDescriptor deviceDescriptor; + deviceDescriptor.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, + [](const wgpu::Device& device, wgpu::DeviceLostReason reason, wgpu::StringView message) { + GGML_UNUSED(device); + GGML_LOG_ERROR("ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast(reason), message.data); + }); + deviceDescriptor.SetUncapturedErrorCallback( + [](const wgpu::Device& device, wgpu::ErrorType reason, wgpu::StringView message) { + GGML_UNUSED(device); + GGML_LOG_ERROR("ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast(reason), message.data); + }); + webgpu_ctx.instance.WaitAny(webgpu_ctx.adapter.RequestDevice(&deviceDescriptor, callbackMode, + [](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) { + if (status != wgpu::RequestDeviceStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", message.data); + return; + } + webgpu_ctx.device = std::move(device); + }), + UINT64_MAX + ); + GGML_ASSERT(webgpu_ctx.device != nullptr); + + wgpu::DawnAdapterPropertiesPowerPreference power_props{}; + wgpu::AdapterInfo info{}; + info.nextInChain = &power_props; + webgpu_ctx.adapter.GetInfo(&info); + GGML_LOG_INFO("ggml_webgpu: adapter_info: vendor_id: %u | vendor: %s | architecture: %s | device_id: %u | name: %s | device_desc: %s\n", + info.vendorID, info.vendor.data, info.architecture.data, info.deviceID, info.device.data, info.description.data); + + webgpu_context_initialized = true; +} + static ggml_backend_i ggml_backend_webgpu_interface = { /* .get_name = */ NULL, /* .free = */ NULL, @@ -74,45 +152,6 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() { /* .context = */ nullptr, }; // need to init webgpu here - wgpu::InstanceDescriptor instanceDescriptor{}; - instanceDescriptor.capabilities.timedWaitAnyEnable = true; - wgpu::Instance instance = wgpu::CreateInstance(&instanceDescriptor); - if (instance == nullptr) { - std::cerr << "Instance creation failed!\n"; - return nullptr; - } - // Synchronously request the adapter. 
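One caveat about the logging in the callbacks added above: wgpu::StringView carries an explicit length and is not guaranteed to be NUL-terminated, so passing message.data straight to a %s format specifier can read past the end of the string. A defensive helper, shown only as an illustration (this patch does not add it):

    #include <string>

    // copy exactly sv.length bytes so the result is safely NUL-terminated
    static std::string ggml_webgpu_string(wgpu::StringView sv) {
        return std::string(sv.data, sv.length);
    }

Logging would then pass ggml_webgpu_string(message).c_str() instead of message.data.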
- wgpu::RequestAdapterOptions options = {}; - wgpu::Adapter adapter; - - auto callback = [](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char *message, void *userdata) { - if (status != wgpu::RequestAdapterStatus::Success) { - std::cerr << "Failed to get an adapter:" << message; - return; - } - *static_cast(userdata) = adapter; - }; - - - auto callbackMode = wgpu::CallbackMode::WaitAnyOnly; - void *userdata = &adapter; - instance.WaitAny(instance.RequestAdapter(&options, callbackMode, callback, userdata), UINT64_MAX); - if (adapter == nullptr) { - std::cerr << "RequestAdapter failed!\n"; - return nullptr; - } - - wgpu::DawnAdapterPropertiesPowerPreference power_props{}; - - wgpu::AdapterInfo info{}; - info.nextInChain = &power_props; - - adapter.GetInfo(&info); - std::cout << "VendorID: " << std::hex << info.vendorID << std::dec << "\n"; - std::cout << "Vendor: " << info.vendor << "\n"; - std::cout << "Architecture: " << info.architecture << "\n"; - std::cout << "DeviceID: " << std::hex << info.deviceID << std::dec << "\n"; - std::cout << "Name: " << info.device << "\n"; - std::cout << "Driver description: " << info.description << "\n"; + ggml_webgpu_context_init(); return ® } From e50335c4c5a718b802545c557b0c1d9ac3977b22 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 20 May 2025 18:03:45 -0700 Subject: [PATCH 03/25] Making progress on setting up the backend --- ggml/include/ggml-webgpu.h | 2 - ggml/src/ggml-webgpu/CMakeLists.txt | 1 - ggml/src/ggml-webgpu/ggml-webgpu.cpp | 252 ++++++++++++++++++--------- 3 files changed, 172 insertions(+), 83 deletions(-) diff --git a/ggml/include/ggml-webgpu.h b/ggml/include/ggml-webgpu.h index 93d5329adeb9b..cf6cb451648e9 100644 --- a/ggml/include/ggml-webgpu.h +++ b/ggml/include/ggml-webgpu.h @@ -9,8 +9,6 @@ extern "C" { #define GGML_WEBGPU_NAME "WebGPU" -GGML_BACKEND_API ggml_backend_t ggml_backend_webgpu_init(void); - GGML_BACKEND_API ggml_backend_reg_t ggml_backend_webgpu_reg(void); #ifdef __cplusplus diff --git a/ggml/src/ggml-webgpu/CMakeLists.txt b/ggml/src/ggml-webgpu/CMakeLists.txt index c72b17c82a828..27d4472ea65fd 100644 --- a/ggml/src/ggml-webgpu/CMakeLists.txt +++ b/ggml/src/ggml-webgpu/CMakeLists.txt @@ -8,7 +8,6 @@ ggml_add_backend_library(ggml-webgpu ) if (GGML_WEBGPU_DEBUG) - message(STATUS "GGML_WEBGPU_DEBUG is ON") target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_DEBUG=1) endif() diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index f340f56940af2..3518dff0ca6a9 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -15,32 +15,159 @@ #define WEBGPU_LOG_DEBUG(msg) ((void) 0) #endif // GGML_WEBGPU_DEBUG +// TODO: find a better way to get the memory available +#define WEBGPU_MAX_BUFFERS 32 -struct webgpu_context { +// When registering the backend, we initialize the WebGPU instance. +struct webgpu_reg_context { wgpu::Instance instance; - // an adapter can only be used to create one device + size_t device_count; + const char * name; +}; + +// When getting the (ggml) device, we create a WebGPU adapter and its associated WebGPU device. 
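A lifetime note on the struct introduced just below: wgpu::StringView does not own its bytes, it borrows them from whatever produced it (here, the wgpu::AdapterInfo queried at registration time), so storing views in a long-lived context is fragile. A later patch in this series switches these fields to std::string; the owning shape looks roughly like this (hypothetical name, shown for contrast):

    struct webgpu_device_context_owning {
        wgpu::Adapter adapter;
        wgpu::Device  device;
        std::string   device_name;   // owns its bytes, valid after AdapterInfo is gone
        std::string   device_desc;
    };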
+struct webgpu_device_context { + // An adapter can only be used to create one device wgpu::Adapter adapter; - // we only support one device for now wgpu::Device device; + wgpu::StringView device_name; + wgpu::StringView device_desc; }; -static bool webgpu_context_initialized = false; -static webgpu_context webgpu_ctx; +struct webgpu_backend_context { + wgpu::Device device; +}; -static void ggml_webgpu_context_init() { - if (webgpu_context_initialized) { - return; - } - WEBGPU_LOG_DEBUG("ggml_webgpu_context_init()"); +static ggml_backend_i ggml_backend_webgpu_i = { + /* .get_name = */ NULL, + /* .free = */ NULL, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ NULL, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, +}; - wgpu::InstanceDescriptor instanceDescriptor{}; - instanceDescriptor.capabilities.timedWaitAnyEnable = true; - webgpu_ctx.instance = wgpu::CreateInstance(&instanceDescriptor); - GGML_ASSERT(webgpu_ctx.instance != nullptr); +static ggml_guid_t ggml_backend_webgpu_guid(void) { + static const char * guid_str = "__ggml_webgpu :)"; + return reinterpret_cast((void *)guid_str); +} - wgpu::RequestAdapterOptions options = {}; - wgpu::Adapter adapter; +static const char * ggml_backend_webgpu_device_get_name(ggml_backend_dev_t dev) { + webgpu_device_context * ctx = static_cast(dev->context); + return ctx->device_name.data; +} + +static const char * ggml_backend_webgpu_device_get_description(ggml_backend_dev_t dev) { + webgpu_device_context * ctx = static_cast(dev->context); + return ctx->device_desc.data; +} + +static void ggml_backend_webgpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + webgpu_device_context * ctx = static_cast(dev->context); + wgpu::Limits limits; + ctx->device.GetLimits(&limits); + // TODO: what do we actually want to return here? + *free = limits.maxBufferSize * WEBGPU_MAX_BUFFERS; + *total = limits.maxBufferSize * WEBGPU_MAX_BUFFERS; +} + +static enum ggml_backend_dev_type ggml_backend_webgpu_device_get_type(ggml_backend_dev_t dev) { + GGML_UNUSED(dev); + return GGML_BACKEND_DEVICE_TYPE_GPU; +} + +static void ggml_backend_webgpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + props->name = ggml_backend_webgpu_device_get_name(dev); + props->description = ggml_backend_webgpu_device_get_description(dev); + props->type = ggml_backend_webgpu_device_get_type(dev); + ggml_backend_webgpu_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* .async = */ false, + /* .host_buffer = */ true, // maybe? not sure what this means yet + /* .buffer_from_host_ptr = */ false, + /* .events = */ false, + }; +} + +// TODO: Does this need to be thread safe? Is it only called once? 
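On the thread-safety TODO just above: ggml does not promise these entry points are called exactly once, so if it ever matters, the portable answer is to guard the one-time setup explicitly. A minimal sketch, an assumption rather than something this patch adds:

    #include <mutex>

    static std::once_flag webgpu_init_flag;

    // run the one-time adapter/device setup exactly once, even with concurrent callers
    static void ggml_webgpu_ensure_initialized() {
        std::call_once(webgpu_init_flag, []() {
            // ... setup performed by the functions below ...
        });
    }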
+static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) { + GGML_UNUSED(params); + + webgpu_device_context * dev_ctx = static_cast(dev->context); + static webgpu_backend_context backend_ctx; + backend_ctx.device = dev_ctx->device; + + static ggml_backend backend = { + /* .guid = */ ggml_backend_webgpu_guid(), + /* .interface = */ ggml_backend_webgpu_i, + /* .device = */ dev, + /* .context = */ &backend_ctx, + }; + return &backend; +} + +static ggml_backend_buffer_type_t ggml_backend_webgpu_device_get_buffer_type(ggml_backend_dev_t dev) { + static struct ggml_backend_buffer_type ggml_backend_buffer_type_webgpu = { + /* .iface = */ { + /* .get_name = */ NULL, + /* .alloc_buffer = */ NULL, + /* .get_alignment = */ NULL, + /* .get_max_size = */ NULL, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .is_host = */ NULL, + }, + /* .device = */ dev, + /* .context = */ NULL, + }; + return &ggml_backend_buffer_type_webgpu; +} + +static struct ggml_backend_device_i ggml_backend_webgpu_device_i = { + /* .get_name = */ ggml_backend_webgpu_device_get_name, + /* .get_description = */ ggml_backend_webgpu_device_get_description, + /* .get_memory = */ ggml_backend_webgpu_device_get_memory, + /* .get_type = */ ggml_backend_webgpu_device_get_type, + /* .get_props = */ ggml_backend_webgpu_device_get_props, + /* .init_backend = */ ggml_backend_webgpu_device_init, + /* .get_buffer_type = */ ggml_backend_webgpu_device_get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ NULL, + /* .supports_op = */ NULL, + /* .supports_buft = */ NULL, + /* .offload_op = */ NULL, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, +}; + +static const char * ggml_backend_webgpu_reg_get_name(ggml_backend_reg_t reg) { + webgpu_reg_context * ctx = static_cast(reg->context); + return ctx->name; +} + +static size_t ggml_backend_webgpu_reg_get_device_count(ggml_backend_reg_t reg) { + webgpu_reg_context * ctx = static_cast(reg->context); + return ctx->device_count; +} + +// TODO: Does this need to be thread safe? Is it only called once? 
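An aside on the GUID defined earlier in this hunk: ggml_guid is a plain 16-byte blob that ggml compares with memcmp (via ggml_guid_matches), so any stable 16-byte buffer works, and the string trick relies on "__ggml_webgpu :)" being exactly 16 characters long (the NUL terminator is unused). A sketch of how a caller would check it, assuming the usual ggml_guid_matches helper:

    static bool is_webgpu_backend(ggml_backend_t backend) {
        // memcmp of 16 bytes under the hood
        return ggml_guid_matches(backend->guid, ggml_backend_webgpu_guid());
    }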
+// Only one device is supported for now +static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t reg, size_t index) { + GGML_ASSERT(index == 0); + WEBGPU_LOG_DEBUG("ggml_backend_reg_get_device()"); + + webgpu_reg_context * reg_ctx = static_cast(reg->context); + static webgpu_device_context device_ctx; + + wgpu::RequestAdapterOptions options = {}; auto callback = [](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char *message, void *userdata) { if (status != wgpu::RequestAdapterStatus::Success) { GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message); @@ -48,11 +175,10 @@ static void ggml_webgpu_context_init() { } *static_cast(userdata) = adapter; }; - auto callbackMode = wgpu::CallbackMode::WaitAnyOnly; - void *userdata = &webgpu_ctx.adapter; - webgpu_ctx.instance.WaitAny(webgpu_ctx.instance.RequestAdapter(&options, callbackMode, callback, userdata), UINT64_MAX); - GGML_ASSERT(webgpu_ctx.adapter != nullptr); + void *userdata = &device_ctx.adapter; + reg_ctx->instance.WaitAny(reg_ctx->instance.RequestAdapter(&options, callbackMode, callback, userdata), UINT64_MAX); + GGML_ASSERT(device_ctx.adapter != nullptr); wgpu::DeviceDescriptor deviceDescriptor; deviceDescriptor.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, @@ -65,77 +191,31 @@ static void ggml_webgpu_context_init() { GGML_UNUSED(device); GGML_LOG_ERROR("ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast(reason), message.data); }); - webgpu_ctx.instance.WaitAny(webgpu_ctx.adapter.RequestDevice(&deviceDescriptor, callbackMode, + reg_ctx->instance.WaitAny(device_ctx.adapter.RequestDevice(&deviceDescriptor, callbackMode, [](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) { if (status != wgpu::RequestDeviceStatus::Success) { GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", message.data); return; } - webgpu_ctx.device = std::move(device); + device_ctx.device = std::move(device); }), UINT64_MAX ); - GGML_ASSERT(webgpu_ctx.device != nullptr); + GGML_ASSERT(device_ctx.device != nullptr); - wgpu::DawnAdapterPropertiesPowerPreference power_props{}; wgpu::AdapterInfo info{}; - info.nextInChain = &power_props; - webgpu_ctx.adapter.GetInfo(&info); + device_ctx.adapter.GetInfo(&info); + device_ctx.device_name = info.device; + device_ctx.device_desc = info.description; GGML_LOG_INFO("ggml_webgpu: adapter_info: vendor_id: %u | vendor: %s | architecture: %s | device_id: %u | name: %s | device_desc: %s\n", info.vendorID, info.vendor.data, info.architecture.data, info.deviceID, info.device.data, info.description.data); - webgpu_context_initialized = true; -} - -static ggml_backend_i ggml_backend_webgpu_interface = { - /* .get_name = */ NULL, - /* .free = */ NULL, - /* .set_tensor_async = */ NULL, - /* .get_tensor_async = */ NULL, - /* .cpy_tensor_async = */ NULL, - /* .synchronize = */ NULL, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, - /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, - /* .graph_compute = */ NULL, - /* .event_record = */ NULL, - /* .event_wait = */ NULL, -}; - -static ggml_guid_t ggml_backend_webgpu_guid(void) { - static ggml_guid guid = { 0x9a, 0x5f, 0x3c, 0x2d, 0xb7, 0x1e, 0x47, 0xa1, 0x92, 0xcf, 0x16, 0x44, 0x58, 0xee, 0x90, 0x2b }; - return &guid; -} - -// necessary?? 
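The deletion that follows resolves the "// necessary??" question from patch 01: with a registry in place, callers are expected to discover the backend through it rather than through a bespoke init function. A hypothetical smoke test of that path (not part of the patch; assumes the public ggml-backend registry API):

    ggml_backend_reg_t reg = ggml_backend_webgpu_reg();
    GGML_ASSERT(ggml_backend_reg_dev_count(reg) == 1);
    ggml_backend_dev_t dev     = ggml_backend_reg_dev_get(reg, 0);
    ggml_backend_t     backend = ggml_backend_dev_init(dev, /* params = */ nullptr);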
-ggml_backend_t ggml_backend_webgpu_init() { - ggml_backend_t webgpu_backend = new ggml_backend { - /* .guid = */ ggml_backend_webgpu_guid(), - /* .interface = */ ggml_backend_webgpu_interface, - /* .device = */ NULL, - /* .context = */ NULL, + static ggml_backend_device device = { + /* .iface = */ ggml_backend_webgpu_device_i, + /* .reg = */ reg, + /* .context = */ &device_ctx, }; - - return webgpu_backend; -} - -static const char * ggml_backend_webgpu_reg_get_name(ggml_backend_reg_t reg) { - GGML_UNUSED(reg); - return GGML_WEBGPU_NAME; -} - -// Stub for now -static size_t ggml_backend_webgpu_reg_get_device_count(ggml_backend_reg_t reg) { - GGML_UNUSED(reg); - return 1; -} - -// Stub for now -static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t reg, size_t device) { - static std::vector devices; - return devices[device]; - + return &device; } static const struct ggml_backend_reg_i ggml_backend_webgpu_reg_i = { @@ -145,13 +225,25 @@ static const struct ggml_backend_reg_i ggml_backend_webgpu_reg_i = { /* .get_proc_address = */ NULL, }; +// TODO: Does this need to be thread safe? Is it only called once? ggml_backend_reg_t ggml_backend_webgpu_reg() { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_reg()"); + + static webgpu_reg_context ctx; + ctx.name = GGML_WEBGPU_NAME; + ctx.device_count = 1; + + wgpu::InstanceDescriptor instanceDescriptor{}; + instanceDescriptor.capabilities.timedWaitAnyEnable = true; + ctx.instance = wgpu::CreateInstance(&instanceDescriptor); + GGML_ASSERT(ctx.instance != nullptr); + static ggml_backend_reg reg = { /* .api_version = */ GGML_BACKEND_API_VERSION, /* .iface = */ ggml_backend_webgpu_reg_i, - /* .context = */ nullptr, + /* .context = */ &ctx, }; - // need to init webgpu here - ggml_webgpu_context_init(); return ® } + +GGML_BACKEND_DL_IMPL(ggml_backend_webgpu_reg) \ No newline at end of file From b17b1645c9de7af1cc7abccf0afac707d2ef6ed4 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 21 May 2025 10:47:44 -0700 Subject: [PATCH 04/25] Finish more boilerplate/utility functions --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 159 ++++++++++++++++++++------- 1 file changed, 120 insertions(+), 39 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 3518dff0ca6a9..aa56e26a4ae2d 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -19,14 +19,14 @@ #define WEBGPU_MAX_BUFFERS 32 // When registering the backend, we initialize the WebGPU instance. -struct webgpu_reg_context { +struct ggml_backend_webgpu_reg_context { wgpu::Instance instance; size_t device_count; const char * name; }; // When getting the (ggml) device, we create a WebGPU adapter and its associated WebGPU device. 
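Stepping back briefly: the GGML_BACKEND_DL_IMPL(ggml_backend_webgpu_reg) line that closed the previous patch is what lets this backend be built as a dynamically loadable module. Based on how the other ggml backends use the macro, it expands to roughly the following exported entry point, which ggml_backend_load() resolves at runtime (a sketch, not the verbatim expansion):

    extern "C" GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void) {
        return ggml_backend_webgpu_reg();
    }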
-struct webgpu_device_context { +struct ggml_backend_webgpu_device_context { // An adapter can only be used to create one device wgpu::Adapter adapter; wgpu::Device device; @@ -34,43 +34,28 @@ struct webgpu_device_context { wgpu::StringView device_desc; }; -struct webgpu_backend_context { +struct ggml_backend_webgpu_context { + std::string name; wgpu::Device device; }; -static ggml_backend_i ggml_backend_webgpu_i = { - /* .get_name = */ NULL, - /* .free = */ NULL, - /* .set_tensor_async = */ NULL, - /* .get_tensor_async = */ NULL, - /* .cpy_tensor_async = */ NULL, - /* .synchronize = */ NULL, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, - /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, - /* .graph_compute = */ NULL, - /* .event_record = */ NULL, - /* .event_wait = */ NULL, -}; - static ggml_guid_t ggml_backend_webgpu_guid(void) { static const char * guid_str = "__ggml_webgpu :)"; return reinterpret_cast((void *)guid_str); } static const char * ggml_backend_webgpu_device_get_name(ggml_backend_dev_t dev) { - webgpu_device_context * ctx = static_cast(dev->context); + ggml_backend_webgpu_device_context * ctx = static_cast(dev->context); return ctx->device_name.data; } static const char * ggml_backend_webgpu_device_get_description(ggml_backend_dev_t dev) { - webgpu_device_context * ctx = static_cast(dev->context); + ggml_backend_webgpu_device_context * ctx = static_cast(dev->context); return ctx->device_desc.data; } static void ggml_backend_webgpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { - webgpu_device_context * ctx = static_cast(dev->context); + ggml_backend_webgpu_device_context * ctx = static_cast(dev->context); wgpu::Limits limits; ctx->device.GetLimits(&limits); // TODO: what do we actually want to return here? @@ -96,13 +81,44 @@ static void ggml_backend_webgpu_device_get_props(ggml_backend_dev_t dev, struct }; } +static const char * ggml_backend_webgpu_name(ggml_backend_t backend) { + ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *)backend->context; + return ctx->name.c_str(); +} + +static void ggml_backend_webgpu_free(ggml_backend_t backend) { + ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *)backend->context; + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_free(" << ctx->name << ")"); + + // TODO: cleanup +} + +static ggml_backend_i ggml_backend_webgpu_i = { + /* .get_name = */ ggml_backend_webgpu_name, + /* .free = */ ggml_backend_webgpu_free, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, // TODO + /* .graph_compute = */ NULL, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, +}; + // TODO: Does this need to be thread safe? Is it only called once? 
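On the TODO above: C++ guarantees thread-safe initialization of the function-local statics used below, but the per-call assignments into backend_ctx are unguarded, and, more importantly, every call returns the very same handle. That is consistent with the one-device assumption, and easy to verify:

    ggml_backend_t a = ggml_backend_dev_init(dev, nullptr);
    ggml_backend_t b = ggml_backend_dev_init(dev, nullptr);
    GGML_ASSERT(a == b);   // one device, one backend instance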
static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) { GGML_UNUSED(params); - webgpu_device_context * dev_ctx = static_cast(dev->context); + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_device_init()"); - static webgpu_backend_context backend_ctx; + ggml_backend_webgpu_device_context * dev_ctx = static_cast(dev->context); + + static ggml_backend_webgpu_context backend_ctx; + backend_ctx.name = GGML_WEBGPU_NAME + std::string(": ") + std::string(dev_ctx->device_name.data); backend_ctx.device = dev_ctx->device; static ggml_backend backend = { @@ -114,20 +130,85 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co return &backend; } +static const char * ggml_backend_webgpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); + return ctx->device_name.data; +} + +static size_t ggml_backend_webgpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); + wgpu::Limits limits; + ctx->device.GetLimits(&limits); + return limits.minStorageBufferOffsetAlignment; +} + +static size_t ggml_backend_webgpu_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); + wgpu::Limits limits; + ctx->device.GetLimits(&limits); + return limits.maxBufferSize; +} + static ggml_backend_buffer_type_t ggml_backend_webgpu_device_get_buffer_type(ggml_backend_dev_t dev) { - static struct ggml_backend_buffer_type ggml_backend_buffer_type_webgpu = { + static struct ggml_backend_buffer_type ggml_backend_webgpu_buffer_type = { /* .iface = */ { - /* .get_name = */ NULL, - /* .alloc_buffer = */ NULL, - /* .get_alignment = */ NULL, - /* .get_max_size = */ NULL, + /* .get_name = */ ggml_backend_webgpu_buffer_type_get_name, + /* .alloc_buffer = */ NULL, // TODO + /* .get_alignment = */ ggml_backend_webgpu_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_webgpu_buffer_type_get_max_size, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .is_host = */ NULL, + /* .is_host = */ NULL, // defaults to false }, /* .device = */ dev, /* .context = */ NULL, }; - return &ggml_backend_buffer_type_webgpu; + + return &ggml_backend_webgpu_buffer_type; +} + +static const char * ggml_backend_webgpu_host_buffer_type_name(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return GGML_WEBGPU_NAME "_Host"; +} + +// WebGPU doesn't specify a memory map alignment like Vulkan, so we use the same value as the storage buffer alignment +static size_t ggml_backend_webgpu_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); + wgpu::Limits limits; + ctx->device.GetLimits(&limits); + return limits.minStorageBufferOffsetAlignment; +} + +static ggml_backend_buffer_type_t ggml_backend_webgpu_device_get_host_buffer_type(ggml_backend_dev_t dev) { + static struct ggml_backend_buffer_type ggml_backend_webgpu_buffer_type_host = { + /* .iface = */ { + /* .get_name = */ ggml_backend_webgpu_host_buffer_type_name, + /* .alloc_buffer = */ NULL, // TODO + /* .get_alignment = */ ggml_backend_webgpu_host_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, + /* 
.device = */ dev, + /* .context = */ NULL, + }; + + return &ggml_backend_webgpu_buffer_type_host; +} + +static bool ggml_backend_webgpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + GGML_UNUSED(dev); + return buft->iface.get_name == ggml_backend_webgpu_buffer_type_get_name; +} + +static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + GGML_UNUSED(dev); + + // what should we support first? + switch (op->op) { + default: + return false; + } } static struct ggml_backend_device_i ggml_backend_webgpu_device_i = { @@ -138,10 +219,10 @@ static struct ggml_backend_device_i ggml_backend_webgpu_device_i = { /* .get_props = */ ggml_backend_webgpu_device_get_props, /* .init_backend = */ ggml_backend_webgpu_device_init, /* .get_buffer_type = */ ggml_backend_webgpu_device_get_buffer_type, - /* .get_host_buffer_type = */ NULL, + /* .get_host_buffer_type = */ ggml_backend_webgpu_device_get_host_buffer_type, /* .buffer_from_host_ptr = */ NULL, - /* .supports_op = */ NULL, - /* .supports_buft = */ NULL, + /* .supports_op = */ ggml_backend_webgpu_device_supports_op, + /* .supports_buft = */ ggml_backend_webgpu_device_supports_buft, /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, @@ -149,12 +230,12 @@ static struct ggml_backend_device_i ggml_backend_webgpu_device_i = { }; static const char * ggml_backend_webgpu_reg_get_name(ggml_backend_reg_t reg) { - webgpu_reg_context * ctx = static_cast(reg->context); + ggml_backend_webgpu_reg_context * ctx = static_cast(reg->context); return ctx->name; } static size_t ggml_backend_webgpu_reg_get_device_count(ggml_backend_reg_t reg) { - webgpu_reg_context * ctx = static_cast(reg->context); + ggml_backend_webgpu_reg_context * ctx = static_cast(reg->context); return ctx->device_count; } @@ -164,8 +245,8 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t GGML_ASSERT(index == 0); WEBGPU_LOG_DEBUG("ggml_backend_reg_get_device()"); - webgpu_reg_context * reg_ctx = static_cast(reg->context); - static webgpu_device_context device_ctx; + ggml_backend_webgpu_reg_context * reg_ctx = static_cast(reg->context); + static ggml_backend_webgpu_device_context device_ctx; wgpu::RequestAdapterOptions options = {}; auto callback = [](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char *message, void *userdata) { @@ -229,7 +310,7 @@ static const struct ggml_backend_reg_i ggml_backend_webgpu_reg_i = { ggml_backend_reg_t ggml_backend_webgpu_reg() { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_reg()"); - static webgpu_reg_context ctx; + static ggml_backend_webgpu_reg_context ctx; ctx.name = GGML_WEBGPU_NAME; ctx.device_count = 1; From e7071d14f381a172a57eab1f02c455d0c3694cbd Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 21 May 2025 12:13:03 -0700 Subject: [PATCH 05/25] Organize file and work on alloc buffer --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 242 ++++++++++++++++++++------- 1 file changed, 181 insertions(+), 61 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index aa56e26a4ae2d..e675a115fadb8 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -18,6 +18,11 @@ // TODO: find a better way to get the memory available #define WEBGPU_MAX_BUFFERS 32 +// TODO: copied from Vulkan for now, not sure what it's used for +static void * const webgpu_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT + +/* Struct definitions */ + // When registering 
the backend, we initialize the WebGPU instance. struct ggml_backend_webgpu_reg_context { wgpu::Instance instance; @@ -39,11 +44,169 @@ struct ggml_backend_webgpu_context { wgpu::Device device; }; -static ggml_guid_t ggml_backend_webgpu_guid(void) { - static const char * guid_str = "__ggml_webgpu :)"; - return reinterpret_cast((void *)guid_str); +struct ggml_backend_webgpu_buffer_context { + wgpu::Buffer buffer; + + ggml_backend_webgpu_buffer_context(wgpu::Buffer buf) : + buffer(buf) { + } +}; + +/* End struct definitions */ + +/** GGML Backend Interface */ + +static const char * ggml_backend_webgpu_name(ggml_backend_t backend) { + ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *)backend->context; + return ctx->name.c_str(); +} + +static void ggml_backend_webgpu_free(ggml_backend_t backend) { + ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *)backend->context; + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_free(" << ctx->name << ")"); + + // TODO: cleanup +} + +static ggml_backend_i ggml_backend_webgpu_i = { + /* .get_name = */ ggml_backend_webgpu_name, + /* .free = */ ggml_backend_webgpu_free, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, // TODO + /* .graph_compute = */ NULL, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, +}; + +/* End GGML Backend Interface */ + +/* GGML Backend Buffer Interface */ + +// TODO +static void ggml_backend_webgpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { + GGML_UNUSED(buffer); +} + +// TODO: what to return here? +static void * ggml_backend_webgpu_buffer_get_base(ggml_backend_buffer_t buffer) { + GGML_UNUSED(buffer); + return webgpu_ptr_base; +} + +// TODO +static enum ggml_status ggml_backend_webgpu_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")"); + return GGML_STATUS_SUCCESS; +} + +// TODO +static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor(" << buffer << ", " << tensor << ", " << value << ", " << offset << ", " << size << ")"); +} + +// TODO +static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); +} + +// TODO +static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + WEBGPU_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); +} + +// TODO +static bool ggml_backend_webgpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { + GGML_UNUSED(buffer); + GGML_UNUSED(src); + GGML_UNUSED(dst); + + return true; +} + +// TODO +static void ggml_backend_webgpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + GGML_UNUSED(buffer); + GGML_UNUSED(value); +} + +static ggml_backend_buffer_i ggml_backend_webgpu_buffer_interface = { + /* 
.free_buffer = */ ggml_backend_webgpu_buffer_free_buffer, + /* .get_base = */ ggml_backend_webgpu_buffer_get_base, + /* .init_tensor = */ ggml_backend_webgpu_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_webgpu_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_webgpu_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_webgpu_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_webgpu_buffer_cpy_tensor, + /* .clear = */ ggml_backend_webgpu_buffer_clear, + /* .reset = */ NULL, +}; + +/* End GGML Backend Buffer Interface */ + +/* GGML Backend Buffer Type Interface */ + +static const char * ggml_backend_webgpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); + return ctx->device_name.data; } +static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_alloc_buffer(" << size << ")"); + ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); + + wgpu::BufferDescriptor buf_desc; + buf_desc.mappedAtCreation = false; + buf_desc.size = size; + buf_desc.usage = wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst; + // TODO: error handling + wgpu::Buffer buf = ctx->device.CreateBuffer(&buf_desc); + + ggml_backend_webgpu_buffer_context * buf_ctx = new ggml_backend_webgpu_buffer_context(buf); + + return ggml_backend_buffer_init(buft, ggml_backend_webgpu_buffer_interface, buf_ctx, size); +} + +static size_t ggml_backend_webgpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); + wgpu::Limits limits; + ctx->device.GetLimits(&limits); + return limits.minStorageBufferOffsetAlignment; +} + +static size_t ggml_backend_webgpu_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); + wgpu::Limits limits; + ctx->device.GetLimits(&limits); + return limits.maxBufferSize; +} + +/* End GGML Backend Buffer Type Interface */ + +/* GGML Backend Host Buffer Type Interface */ + +static const char * ggml_backend_webgpu_host_buffer_type_name(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return GGML_WEBGPU_NAME "_Host"; +} + +// WebGPU doesn't specify a memory map alignment like Vulkan, so we use the same value as the storage buffer alignment +static size_t ggml_backend_webgpu_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); + wgpu::Limits limits; + ctx->device.GetLimits(&limits); + return limits.minStorageBufferOffsetAlignment; +} + +/* End GGML Backend Host Buffer Type Interface */ + +/* GGML Backend Device Interface */ + static const char * ggml_backend_webgpu_device_get_name(ggml_backend_dev_t dev) { ggml_backend_webgpu_device_context * ctx = static_cast(dev->context); return ctx->device_name.data; @@ -81,35 +244,13 @@ static void ggml_backend_webgpu_device_get_props(ggml_backend_dev_t dev, struct }; } -static const char * ggml_backend_webgpu_name(ggml_backend_t backend) { - ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *)backend->context; - return ctx->name.c_str(); -} - -static void ggml_backend_webgpu_free(ggml_backend_t backend) { - ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *)backend->context; - 
WEBGPU_LOG_DEBUG("ggml_backend_webgpu_free(" << ctx->name << ")"); - - // TODO: cleanup +static ggml_guid_t ggml_backend_webgpu_guid(void) { + static const char * guid_str = "__ggml_webgpu :)"; + return reinterpret_cast((void *)guid_str); } -static ggml_backend_i ggml_backend_webgpu_i = { - /* .get_name = */ ggml_backend_webgpu_name, - /* .free = */ ggml_backend_webgpu_free, - /* .set_tensor_async = */ NULL, - /* .get_tensor_async = */ NULL, - /* .cpy_tensor_async = */ NULL, - /* .synchronize = */ NULL, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, - /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, // TODO - /* .graph_compute = */ NULL, - /* .event_record = */ NULL, - /* .event_wait = */ NULL, -}; - // TODO: Does this need to be thread safe? Is it only called once? +// Implementation in GGML Backend Interface section static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) { GGML_UNUSED(params); @@ -130,30 +271,12 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co return &backend; } -static const char * ggml_backend_webgpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); - return ctx->device_name.data; -} - -static size_t ggml_backend_webgpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); - wgpu::Limits limits; - ctx->device.GetLimits(&limits); - return limits.minStorageBufferOffsetAlignment; -} - -static size_t ggml_backend_webgpu_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { - ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); - wgpu::Limits limits; - ctx->device.GetLimits(&limits); - return limits.maxBufferSize; -} - +// Implementation in GGML Backend Buffer Type Interface section static ggml_backend_buffer_type_t ggml_backend_webgpu_device_get_buffer_type(ggml_backend_dev_t dev) { static struct ggml_backend_buffer_type ggml_backend_webgpu_buffer_type = { /* .iface = */ { /* .get_name = */ ggml_backend_webgpu_buffer_type_get_name, - /* .alloc_buffer = */ NULL, // TODO + /* .alloc_buffer = */ ggml_backend_webgpu_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_webgpu_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_webgpu_buffer_type_get_max_size, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes @@ -166,19 +289,8 @@ static ggml_backend_buffer_type_t ggml_backend_webgpu_device_get_buffer_type(ggm return &ggml_backend_webgpu_buffer_type; } -static const char * ggml_backend_webgpu_host_buffer_type_name(ggml_backend_buffer_type_t buft) { - GGML_UNUSED(buft); - return GGML_WEBGPU_NAME "_Host"; -} - -// WebGPU doesn't specify a memory map alignment like Vulkan, so we use the same value as the storage buffer alignment -static size_t ggml_backend_webgpu_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); - wgpu::Limits limits; - ctx->device.GetLimits(&limits); - return limits.minStorageBufferOffsetAlignment; -} +// Implementation in GGML Backend Host Buffer Type Interface static ggml_backend_buffer_type_t ggml_backend_webgpu_device_get_host_buffer_type(ggml_backend_dev_t dev) { static struct ggml_backend_buffer_type ggml_backend_webgpu_buffer_type_host = { /* .iface = */ { @@ -229,6 +341,10 @@ static struct ggml_backend_device_i 
ggml_backend_webgpu_device_i = { /* .event_synchronize = */ NULL, }; +/* End GGML Backend Device Interface */ + +/* GGML Backend Registration Interface */ + static const char * ggml_backend_webgpu_reg_get_name(ggml_backend_reg_t reg) { ggml_backend_webgpu_reg_context * ctx = static_cast(reg->context); return ctx->name; @@ -241,6 +357,7 @@ static size_t ggml_backend_webgpu_reg_get_device_count(ggml_backend_reg_t reg) { // TODO: Does this need to be thread safe? Is it only called once? // Only one device is supported for now +// Implementation in GGML Backend Device Interface section static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t reg, size_t index) { GGML_ASSERT(index == 0); WEBGPU_LOG_DEBUG("ggml_backend_reg_get_device()"); @@ -299,6 +416,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t return &device; } + static const struct ggml_backend_reg_i ggml_backend_webgpu_reg_i = { /* .get_name = */ ggml_backend_webgpu_reg_get_name, /* .get_device_count = */ ggml_backend_webgpu_reg_get_device_count, @@ -306,6 +424,8 @@ static const struct ggml_backend_reg_i ggml_backend_webgpu_reg_i = { /* .get_proc_address = */ NULL, }; +/* End GGML Backend Registration Interface */ + // TODO: Does this need to be thread safe? Is it only called once? ggml_backend_reg_t ggml_backend_webgpu_reg() { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_reg()"); From c9a53d27550c025249f6122567919a02517816fd Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 21 May 2025 17:42:43 -0700 Subject: [PATCH 06/25] Add webgpu_context to prepare for actually running some shaders --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 159 +++++++++++++++------------ 1 file changed, 86 insertions(+), 73 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index e675a115fadb8..6213e694ed326 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -23,32 +23,47 @@ static void * const webgpu_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT /* Struct definitions */ -// When registering the backend, we initialize the WebGPU instance. -struct ggml_backend_webgpu_reg_context { +// All the base objects needed to run operations on a WebGPU device +struct webgpu_context_struct { wgpu::Instance instance; + wgpu::Adapter adapter; + wgpu::Device device; + // TODO: initialize + wgpu::Queue queue; + wgpu::Limits limits; + // TODO: initialize + wgpu::ComputePipeline memset_pipeline; +}; + +typedef std::shared_ptr webgpu_context; + +struct ggml_backend_webgpu_reg_context { + webgpu_context webgpu_ctx; + size_t device_count; const char * name; }; -// When getting the (ggml) device, we create a WebGPU adapter and its associated WebGPU device. 
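The typedef above aliases std::shared_ptr over webgpu_context_struct. Sharing one reference-counted context across the reg, device, backend, and buffer structs that follow keeps the Dawn instance/adapter/device alive for as long as any ggml object still holds a reference:

    typedef std::shared_ptr<webgpu_context_struct> webgpu_context;

    webgpu_context ctx = std::make_shared<webgpu_context_struct>();
    // every ggml-side context copies `ctx`, bumping the refcount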
struct ggml_backend_webgpu_device_context { - // An adapter can only be used to create one device - wgpu::Adapter adapter; - wgpu::Device device; - wgpu::StringView device_name; - wgpu::StringView device_desc; + webgpu_context webgpu_ctx; + + std::string device_name; + std::string device_desc; }; struct ggml_backend_webgpu_context { + webgpu_context webgpu_ctx; + std::string name; - wgpu::Device device; }; struct ggml_backend_webgpu_buffer_context { + webgpu_context webgpu_ctx; + wgpu::Buffer buffer; - ggml_backend_webgpu_buffer_context(wgpu::Buffer buf) : - buffer(buf) { + ggml_backend_webgpu_buffer_context(webgpu_context ctx, wgpu::Buffer buf) : + webgpu_ctx(ctx), buffer(buf) { } }; @@ -88,9 +103,10 @@ static ggml_backend_i ggml_backend_webgpu_i = { /* GGML Backend Buffer Interface */ -// TODO static void ggml_backend_webgpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { - GGML_UNUSED(buffer); + ggml_backend_webgpu_buffer_context * ctx = static_cast(buffer->context); + ctx->buffer.Destroy(); + delete ctx; } // TODO: what to return here? @@ -99,12 +115,6 @@ static void * ggml_backend_webgpu_buffer_get_base(ggml_backend_buffer_t buffer) return webgpu_ptr_base; } -// TODO -static enum ggml_status ggml_backend_webgpu_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")"); - return GGML_STATUS_SUCCESS; -} - // TODO static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor(" << buffer << ", " << tensor << ", " << value << ", " << offset << ", " << size << ")"); @@ -117,7 +127,7 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, // TODO static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { - WEBGPU_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); } // TODO @@ -138,7 +148,7 @@ static void ggml_backend_webgpu_buffer_clear(ggml_backend_buffer_t buffer, uint8 static ggml_backend_buffer_i ggml_backend_webgpu_buffer_interface = { /* .free_buffer = */ ggml_backend_webgpu_buffer_free_buffer, /* .get_base = */ ggml_backend_webgpu_buffer_get_base, - /* .init_tensor = */ ggml_backend_webgpu_buffer_init_tensor, + /* .init_tensor = */ NULL, // TODO: should we implement this? 
/* .memset_tensor = */ ggml_backend_webgpu_buffer_memset_tensor, /* .set_tensor = */ ggml_backend_webgpu_buffer_set_tensor, /* .get_tensor = */ ggml_backend_webgpu_buffer_get_tensor, @@ -153,7 +163,7 @@ static ggml_backend_buffer_i ggml_backend_webgpu_buffer_interface = { static const char * ggml_backend_webgpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); - return ctx->device_name.data; + return ctx->device_name.c_str(); } static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { @@ -165,25 +175,21 @@ static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_b buf_desc.size = size; buf_desc.usage = wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst; // TODO: error handling - wgpu::Buffer buf = ctx->device.CreateBuffer(&buf_desc); + wgpu::Buffer buf = ctx->webgpu_ctx->device.CreateBuffer(&buf_desc); - ggml_backend_webgpu_buffer_context * buf_ctx = new ggml_backend_webgpu_buffer_context(buf); + ggml_backend_webgpu_buffer_context * buf_ctx = new ggml_backend_webgpu_buffer_context(ctx->webgpu_ctx, buf); return ggml_backend_buffer_init(buft, ggml_backend_webgpu_buffer_interface, buf_ctx, size); } static size_t ggml_backend_webgpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); - wgpu::Limits limits; - ctx->device.GetLimits(&limits); - return limits.minStorageBufferOffsetAlignment; + return ctx->webgpu_ctx->limits.minStorageBufferOffsetAlignment; } static size_t ggml_backend_webgpu_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); - wgpu::Limits limits; - ctx->device.GetLimits(&limits); - return limits.maxBufferSize; + return ctx->webgpu_ctx->limits.maxBufferSize; } /* End GGML Backend Buffer Type Interface */ @@ -198,9 +204,7 @@ static const char * ggml_backend_webgpu_host_buffer_type_name(ggml_backend_buffe // WebGPU doesn't specify a memory map alignment like Vulkan, so we use the same value as the storage buffer alignment static size_t ggml_backend_webgpu_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); - wgpu::Limits limits; - ctx->device.GetLimits(&limits); - return limits.minStorageBufferOffsetAlignment; + return ctx->webgpu_ctx->limits.minStorageBufferOffsetAlignment; } /* End GGML Backend Host Buffer Type Interface */ @@ -209,21 +213,19 @@ static size_t ggml_backend_webgpu_host_buffer_type_get_alignment(ggml_backend_bu static const char * ggml_backend_webgpu_device_get_name(ggml_backend_dev_t dev) { ggml_backend_webgpu_device_context * ctx = static_cast(dev->context); - return ctx->device_name.data; + return ctx->device_name.c_str(); } static const char * ggml_backend_webgpu_device_get_description(ggml_backend_dev_t dev) { ggml_backend_webgpu_device_context * ctx = static_cast(dev->context); - return ctx->device_desc.data; + return ctx->device_desc.c_str(); } static void ggml_backend_webgpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { ggml_backend_webgpu_device_context * ctx = static_cast(dev->context); - wgpu::Limits limits; - ctx->device.GetLimits(&limits); // TODO: what do we actually want to return here? 
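There is no exact answer to the TODO above, because WebGPU deliberately exposes no physical-memory query; adapter and device limits are the only sizing signals available. The relevant knobs (field names as in webgpu_cpp.h):

    wgpu::Limits limits{};
    device.GetLimits(&limits);
    uint64_t per_buffer_cap  = limits.maxBufferSize;               // hard cap per allocation
    uint64_t per_binding_cap = limits.maxStorageBufferBindingSize; // cap per shader binding

Multiplying maxBufferSize by a fixed WEBGPU_MAX_BUFFERS, as done here, is a placeholder heuristic rather than a measurement.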
- *free = limits.maxBufferSize * WEBGPU_MAX_BUFFERS; - *total = limits.maxBufferSize * WEBGPU_MAX_BUFFERS; + *free = ctx->webgpu_ctx->limits.maxBufferSize * WEBGPU_MAX_BUFFERS; + *total = ctx->webgpu_ctx->limits.maxBufferSize * WEBGPU_MAX_BUFFERS; } static enum ggml_backend_dev_type ggml_backend_webgpu_device_get_type(ggml_backend_dev_t dev) { @@ -258,9 +260,33 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co ggml_backend_webgpu_device_context * dev_ctx = static_cast(dev->context); + wgpu::DeviceDescriptor deviceDescriptor; + deviceDescriptor.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, + [](const wgpu::Device& device, wgpu::DeviceLostReason reason, wgpu::StringView message) { + GGML_UNUSED(device); + GGML_LOG_ERROR("ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast(reason), message.data); + }); + deviceDescriptor.SetUncapturedErrorCallback( + [](const wgpu::Device& device, wgpu::ErrorType reason, wgpu::StringView message) { + GGML_UNUSED(device); + GGML_LOG_ERROR("ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast(reason), message.data); + }); + webgpu_context webgpu_ctx = dev_ctx->webgpu_ctx; + dev_ctx->webgpu_ctx->instance.WaitAny(dev_ctx->webgpu_ctx->adapter.RequestDevice(&deviceDescriptor, wgpu::CallbackMode::WaitAnyOnly, + [webgpu_ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) { + if (status != wgpu::RequestDeviceStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", message.data); + return; + } + webgpu_ctx->device = device; + }), + UINT64_MAX + ); + GGML_ASSERT(dev_ctx->webgpu_ctx->device != nullptr); + static ggml_backend_webgpu_context backend_ctx; - backend_ctx.name = GGML_WEBGPU_NAME + std::string(": ") + std::string(dev_ctx->device_name.data); - backend_ctx.device = dev_ctx->device; + backend_ctx.name = GGML_WEBGPU_NAME + std::string(": ") + dev_ctx->device_name; + backend_ctx.webgpu_ctx = dev_ctx->webgpu_ctx; static ggml_backend backend = { /* .guid = */ ggml_backend_webgpu_guid(), @@ -363,7 +389,8 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t WEBGPU_LOG_DEBUG("ggml_backend_reg_get_device()"); ggml_backend_webgpu_reg_context * reg_ctx = static_cast(reg->context); - static ggml_backend_webgpu_device_context device_ctx; + + webgpu_context ctx = reg_ctx->webgpu_ctx; wgpu::RequestAdapterOptions options = {}; auto callback = [](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char *message, void *userdata) { @@ -373,38 +400,20 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t } *static_cast(userdata) = adapter; }; - auto callbackMode = wgpu::CallbackMode::WaitAnyOnly; - void *userdata = &device_ctx.adapter; - reg_ctx->instance.WaitAny(reg_ctx->instance.RequestAdapter(&options, callbackMode, callback, userdata), UINT64_MAX); - GGML_ASSERT(device_ctx.adapter != nullptr); + void *userdata = &ctx->adapter; + ctx->instance.WaitAny(ctx->instance.RequestAdapter(&options, wgpu::CallbackMode::WaitAnyOnly, callback, userdata), UINT64_MAX); + GGML_ASSERT(ctx->adapter != nullptr); - wgpu::DeviceDescriptor deviceDescriptor; - deviceDescriptor.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, - [](const wgpu::Device& device, wgpu::DeviceLostReason reason, wgpu::StringView message) { - GGML_UNUSED(device); - GGML_LOG_ERROR("ggml_webgpu: Device lost! 
Reason: %d, Message: %s\n", static_cast(reason), message.data); - }); - deviceDescriptor.SetUncapturedErrorCallback( - [](const wgpu::Device& device, wgpu::ErrorType reason, wgpu::StringView message) { - GGML_UNUSED(device); - GGML_LOG_ERROR("ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast(reason), message.data); - }); - reg_ctx->instance.WaitAny(device_ctx.adapter.RequestDevice(&deviceDescriptor, callbackMode, - [](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) { - if (status != wgpu::RequestDeviceStatus::Success) { - GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", message.data); - return; - } - device_ctx.device = std::move(device); - }), - UINT64_MAX - ); - GGML_ASSERT(device_ctx.device != nullptr); + ctx->adapter.GetLimits(&ctx->limits); wgpu::AdapterInfo info{}; - device_ctx.adapter.GetInfo(&info); - device_ctx.device_name = info.device; - device_ctx.device_desc = info.description; + ctx->adapter.GetInfo(&info); + + static ggml_backend_webgpu_device_context device_ctx; + device_ctx.webgpu_ctx = ctx; + device_ctx.device_name = std::string(info.device.data); + device_ctx.device_desc = std::string(info.description.data); + GGML_LOG_INFO("ggml_webgpu: adapter_info: vendor_id: %u | vendor: %s | architecture: %s | device_id: %u | name: %s | device_desc: %s\n", info.vendorID, info.vendor.data, info.architecture.data, info.deviceID, info.device.data, info.description.data); @@ -430,14 +439,18 @@ static const struct ggml_backend_reg_i ggml_backend_webgpu_reg_i = { ggml_backend_reg_t ggml_backend_webgpu_reg() { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_reg()"); + webgpu_context webgpu_ctx = std::make_shared(); + static ggml_backend_webgpu_reg_context ctx; + ctx.webgpu_ctx = webgpu_ctx; ctx.name = GGML_WEBGPU_NAME; ctx.device_count = 1; + wgpu::InstanceDescriptor instanceDescriptor{}; instanceDescriptor.capabilities.timedWaitAnyEnable = true; - ctx.instance = wgpu::CreateInstance(&instanceDescriptor); - GGML_ASSERT(ctx.instance != nullptr); + webgpu_ctx->instance = wgpu::CreateInstance(&instanceDescriptor); + GGML_ASSERT(webgpu_ctx->instance != nullptr); static ggml_backend_reg reg = { /* .api_version = */ GGML_BACKEND_API_VERSION, From 9e0c6111756e55e43137c9998fb0da5300017c3f Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 4 Jun 2025 17:11:37 -0700 Subject: [PATCH 07/25] Work on memset and add shader loading --- ggml/src/ggml-webgpu/CMakeLists.txt | 38 +++++++++++++++++-- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 11 +++++- .../ggml-webgpu/wgsl-shaders/embed_wgsl.py | 32 ++++++++++++++++ ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl | 38 +++++++++++++++++++ 4 files changed, 113 insertions(+), 6 deletions(-) create mode 100755 ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py create mode 100644 ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl diff --git a/ggml/src/ggml-webgpu/CMakeLists.txt b/ggml/src/ggml-webgpu/CMakeLists.txt index 27d4472ea65fd..1ce496b391afe 100644 --- a/ggml/src/ggml-webgpu/CMakeLists.txt +++ b/ggml/src/ggml-webgpu/CMakeLists.txt @@ -1,15 +1,45 @@ cmake_minimum_required(VERSION 3.13) find_package(Dawn REQUIRED) +find_package(Python3 REQUIRED) + +# Shader locations +set(SHADER_DIR "${CMAKE_CURRENT_SOURCE_DIR}/wgsl-shaders") +set(SHADER_OUTPUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/generated") +set(SHADER_HEADER "${SHADER_OUTPUT_DIR}/ggml-wgsl-shaders.hpp") +file(MAKE_DIRECTORY ${SHADER_OUTPUT_DIR}) + +message(STATUS "Shader output dir: ${SHADER_OUTPUT_DIR}") + +# Find all WGSL files +file(GLOB WGSL_SHADER_FILES 
"${SHADER_DIR}/*.wgsl") + +# Generate the header using a Python script +add_custom_command( + OUTPUT ${SHADER_HEADER} + COMMAND ${CMAKE_COMMAND} -E echo "Embedding WGSL shaders to ggml-wgsl-shaders.hpp" + COMMAND ${CMAKE_COMMAND} -E make_directory ${SHADER_OUTPUT_DIR} + COMMAND ${CMAKE_COMMAND} -E env PYTHONIOENCODING=utf-8 + ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/wgsl-shaders/embed_wgsl.py + --input "${SHADER_DIR}" + --output "${SHADER_HEADER}" + DEPENDS ${WGSL_SHADER_FILES} ${CMAKE_CURRENT_SOURCE_DIR}/wgsl-shaders/embed_wgsl.py + VERBATIM +) + +add_custom_target(generate_shaders DEPENDS ${SHADER_HEADER}) ggml_add_backend_library(ggml-webgpu - ggml-webgpu.cpp - ../../include/ggml-webgpu.h - ) + ggml-webgpu.cpp + ${SHADER_HEADER} + ../../include/ggml-webgpu.h +) + +add_dependencies(ggml-webgpu generate_shaders) if (GGML_WEBGPU_DEBUG) target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_DEBUG=1) endif() +target_include_directories(ggml-webgpu PRIVATE ${SHADER_OUTPUT_DIR}) target_link_libraries(ggml-webgpu PRIVATE dawn::webgpu_dawn) - diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 6213e694ed326..2b4286b47f57c 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -6,6 +6,8 @@ #include "ggml-impl.h" #include "ggml-backend-impl.h" +#include "ggml-wgsl-shaders.hpp" + #include #include @@ -28,7 +30,6 @@ struct webgpu_context_struct { wgpu::Instance instance; wgpu::Adapter adapter; wgpu::Device device; - // TODO: initialize wgpu::Queue queue; wgpu::Limits limits; // TODO: initialize @@ -260,6 +261,7 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co ggml_backend_webgpu_device_context * dev_ctx = static_cast(dev->context); + // Initialize device wgpu::DeviceDescriptor deviceDescriptor; deviceDescriptor.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, [](const wgpu::Device& device, wgpu::DeviceLostReason reason, wgpu::StringView message) { @@ -272,7 +274,7 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co GGML_LOG_ERROR("ggml_webgpu: Device error! 
Reason: %d, Message: %s\n", static_cast(reason), message.data); }); webgpu_context webgpu_ctx = dev_ctx->webgpu_ctx; - dev_ctx->webgpu_ctx->instance.WaitAny(dev_ctx->webgpu_ctx->adapter.RequestDevice(&deviceDescriptor, wgpu::CallbackMode::WaitAnyOnly, + webgpu_ctx->instance.WaitAny(webgpu_ctx->adapter.RequestDevice(&deviceDescriptor, wgpu::CallbackMode::WaitAnyOnly, [webgpu_ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) { if (status != wgpu::RequestDeviceStatus::Success) { GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", message.data); @@ -284,6 +286,10 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co ); GGML_ASSERT(dev_ctx->webgpu_ctx->device != nullptr); + // Initialize (compute) queue + dev_ctx->webgpu_ctx->queue = dev_ctx->webgpu_ctx->device.GetQueue(); + + static ggml_backend_webgpu_context backend_ctx; backend_ctx.name = GGML_WEBGPU_NAME + std::string(": ") + dev_ctx->device_name; backend_ctx.webgpu_ctx = dev_ctx->webgpu_ctx; @@ -294,6 +300,7 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co /* .device = */ dev, /* .context = */ &backend_ctx, }; + return &backend; } diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py new file mode 100755 index 0000000000000..b3470fef9483f --- /dev/null +++ b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py @@ -0,0 +1,32 @@ +import os +import argparse + +def to_c_array(name, data): + varname = f"wgsl_{name}" + byte_array = ', '.join(f'0x{b:02x}' for b in data) + return f"""\ +const unsigned char {varname}[] = {{ + {byte_array} +}}; +const unsigned int {varname}_len = sizeof({varname}); +""" + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--input', required=True, help='Input directory with .wgsl files') + parser.add_argument('--output', required=True, help='Output .hpp file path') + args = parser.parse_args() + + with open(args.output, 'w', encoding='utf-8') as out: + out.write("// Auto-generated WGSL header\n\n") + for fname in sorted(os.listdir(args.input)): + if fname.endswith('.wgsl'): + path = os.path.join(args.input, fname) + varname = os.path.splitext(fname)[0] + with open(path, 'rb') as f: + data = f.read() + out.write(to_c_array(varname, data)) + out.write('\n') + +if __name__ == '__main__': + main() diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl new file mode 100644 index 0000000000000..7a76640b1ad0f --- /dev/null +++ b/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl @@ -0,0 +1,38 @@ +// memset.wgsl +@group(0) @binding(0) +var output_buffer: array; + +struct Params { + offset: u32, // in bytes + size: u32, // in bytes + value: u32, // four identical values +}; + +@group(0) @binding(1) +var params: Params; + +// TODO: figure out workgroup size +@compute @workgroup_size(64) +fn main(@builtin(global_invocation_id) gid: vec3) { + let i = gid.x * 4u; + let start = params.offset; + let end = params.offset + params.size; + + // Each thread writes one u32 (4 bytes) + let byte_index = start + i; + if (byte_index + 4u <= end) { + output_buffer[(byte_index >> 2u)] = params.value; + } else { + // Handle tail (unaligned) + for (var j: u32 = 0u; j < 4u; j = j + 1u) { + let idx = byte_index + j; + if (idx < end) { + let word_idx = idx >> 2u; + let byte_offset = (idx & 3u) * 8u; + let mask = ~(0xffu << byte_offset); + let existing = output_buffer[word_idx]; + output_buffer[word_idx] = (existing & 
mask) | (params.value & 0xffu) << byte_offset; + } + } + } +} From 520f59562d1d513d6e7b53e94ef4256c5b886bc7 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Thu, 5 Jun 2025 13:47:02 -0700 Subject: [PATCH 08/25] Work on memset polyfill --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 164 ++++++++++++++++-- .../ggml-webgpu/wgsl-shaders/embed_wgsl.py | 37 ++-- ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl | 35 ++-- 3 files changed, 182 insertions(+), 54 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 2b4286b47f57c..849803a3384cf 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -20,9 +20,17 @@ // TODO: find a better way to get the memory available #define WEBGPU_MAX_BUFFERS 32 -// TODO: copied from Vulkan for now, not sure what it's used for +// This is a "fake" base pointer, since WebGPU buffers do not have pointers to their locations. static void * const webgpu_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT +// Always returns the base offset of a tensor, regardless of views. +static uint64_t webgpu_tensor_offset(const ggml_tensor * tensor) { + if (tensor->view_src) { + return (uint8_t *) tensor->view_src->data - (uint8_t *) webgpu_ptr_base; + } + return (uint8_t *) tensor->data - (uint8_t *) webgpu_ptr_base; +} + /* Struct definitions */ // All the base objects needed to run operations on a WebGPU device @@ -32,8 +40,12 @@ struct webgpu_context_struct { wgpu::Device device; wgpu::Queue queue; wgpu::Limits limits; - // TODO: initialize + + // memset pipeline and parameter buffers wgpu::ComputePipeline memset_pipeline; + wgpu::Buffer memset_params_dev_buf; + wgpu::Buffer memset_params_host_buf; + size_t memset_elems_per_thread; }; typedef std::shared_ptr webgpu_context; @@ -70,6 +82,40 @@ struct ggml_backend_webgpu_buffer_context { /* End struct definitions */ +/* WebGPU object initializations */ + +static void ggml_webgpu_create_pipeline(wgpu::Device &device, wgpu::ComputePipeline &pipeline, const char * shader_code, const std::vector &constants = {}) { + WEBGPU_LOG_DEBUG("ggml_webgpu_create_pipeline()"); + wgpu::ShaderSourceWGSL shader_source; + shader_source.code = shader_code; + wgpu::ShaderModuleDescriptor shader_desc; + shader_desc.nextInChain = &shader_source; + wgpu::ShaderModule shader_module = device.CreateShaderModule(&shader_desc); + + wgpu::ComputePipelineDescriptor pipeline_desc; + pipeline_desc.compute.module = shader_module; + pipeline_desc.compute.entryPoint = "main"; // Entry point in the WGSL code + pipeline_desc.layout = nullptr; // Guessing that nullptr means auto layout + if (constants.size() > 0) { + pipeline_desc.compute.constants = constants.data(); + pipeline_desc.compute.constantCount = constants.size(); + } + pipeline = device.CreateComputePipeline(&pipeline_desc); +} + +static void ggml_webgpu_create_buffer(wgpu::Device &device, wgpu::Buffer &buffer, size_t size, wgpu::BufferUsage usage) { + WEBGPU_LOG_DEBUG("ggml_webgpu_create_buffer()"); + + wgpu::BufferDescriptor buffer_desc; + buffer_desc.size = size; + buffer_desc.usage = usage; + buffer_desc.mappedAtCreation = false; + // TODO: error handling + buffer = device.CreateBuffer(&buffer_desc); +} + +/** End WebGPU object initializations */ + /** GGML Backend Interface */ static const char * ggml_backend_webgpu_name(ggml_backend_t backend) { @@ -110,15 +156,77 @@ static void ggml_backend_webgpu_buffer_free_buffer(ggml_backend_buffer_t buffer) delete ctx; } -// TODO: what to return here? 
+// Returns the "fake" base pointer. static void * ggml_backend_webgpu_buffer_get_base(ggml_backend_buffer_t buffer) { GGML_UNUSED(buffer); return webgpu_ptr_base; } -// TODO static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { + if (size == 0) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor: size is zero, nothing to do."); + return; + } WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor(" << buffer << ", " << tensor << ", " << value << ", " << offset << ", " << size << ")"); + ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; + webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx; + wgpu::Device device = webgpu_ctx->device; + + // map the host parameters buffer + webgpu_ctx->instance.WaitAny(webgpu_ctx->memset_params_host_buf.MapAsync( + wgpu::MapMode::Write, 0, webgpu_ctx->memset_params_host_buf.GetSize(), wgpu::CallbackMode::WaitAnyOnly, + [](wgpu::MapAsyncStatus status, wgpu::StringView message) { + if (status != wgpu::MapAsyncStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to map buffer: %s\n", message.data); + } + }), + UINT64_MAX + ); + + // Set the host parameter buffer + uint32_t * params = (uint32_t *)webgpu_ctx->memset_params_host_buf.GetMappedRange(); + size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; + uint32_t val32 = (uint32_t)value * 0x01010101; + params[0] = (uint32_t)total_offset; + params[1] = (uint32_t)size; + params[2] = val32; + webgpu_ctx->memset_params_host_buf.Unmap(); + + // buffer to memset + wgpu::Buffer buf = buf_ctx->buffer; + + wgpu::BindGroupEntry entries[2]; + entries[0].binding = 0; // binding for the buffer to memset + entries[0].buffer = buf; + entries[0].offset = 0; + entries[0].size = buf.GetSize(); + entries[1].binding = 1; // binding for the parameters + entries[1].buffer = webgpu_ctx->memset_params_dev_buf; + entries[1].offset = 0; + entries[1].size = webgpu_ctx->memset_params_dev_buf.GetSize(); + + wgpu::BindGroupDescriptor bind_group_desc; + bind_group_desc.layout = webgpu_ctx->memset_pipeline.GetBindGroupLayout(0); + bind_group_desc.entryCount = 2; + bind_group_desc.entries = entries; + wgpu::BindGroup bind_group = device.CreateBindGroup(&bind_group_desc); + + wgpu::CommandEncoder encoder = device.CreateCommandEncoder(); + encoder.CopyBufferToBuffer( + webgpu_ctx->memset_params_host_buf, 0, + webgpu_ctx->memset_params_dev_buf, 0, + webgpu_ctx->memset_params_dev_buf.GetSize() + ); + wgpu::ComputePassEncoder pass = encoder.BeginComputePass(); + pass.SetPipeline(webgpu_ctx->memset_pipeline); + pass.SetBindGroup(0, bind_group); + size_t elems_per_wg = webgpu_ctx->limits.maxComputeWorkgroupSizeX * webgpu_ctx->memset_elems_per_thread; + pass.DispatchWorkgroups((((size + 3)/4) + elems_per_wg - 1) / elems_per_wg, 1, 1); + pass.End(); + wgpu::CommandBuffer commands = encoder.Finish(); + + // async, do we need to wait on this? 
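+    // Submit() only enqueues the work; commands on a single WebGPU queue run
+    // in submission order, so later copies or MapAsync calls touching this
+    // buffer will observe the memset. If the host ever needs to block until
+    // completion, one option is to wait on the queue (a sketch; the exact
+    // callback signature depends on the Dawn version in use):
+    //   webgpu_ctx->instance.WaitAny(webgpu_ctx->queue.OnSubmittedWorkDone(
+    //       wgpu::CallbackMode::WaitAnyOnly,
+    //       [](wgpu::QueueWorkDoneStatus status) { /* check status */ }), UINT64_MAX);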
+ webgpu_ctx->queue.Submit(1, &commands); } // TODO @@ -171,12 +279,9 @@ static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_b WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_alloc_buffer(" << size << ")"); ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); - wgpu::BufferDescriptor buf_desc; - buf_desc.mappedAtCreation = false; - buf_desc.size = size; - buf_desc.usage = wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst; - // TODO: error handling - wgpu::Buffer buf = ctx->webgpu_ctx->device.CreateBuffer(&buf_desc); + wgpu::Buffer buf; + ggml_webgpu_create_buffer(ctx->webgpu_ctx->device, buf, size, + wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst); ggml_backend_webgpu_buffer_context * buf_ctx = new ggml_backend_webgpu_buffer_context(ctx->webgpu_ctx, buf); @@ -188,9 +293,10 @@ static size_t ggml_backend_webgpu_buffer_type_get_alignment(ggml_backend_buffer_ return ctx->webgpu_ctx->limits.minStorageBufferOffsetAlignment; } +// maxBufferSize might be larger, but you can't bind more than maxStorageBufferBindingSize to a single binding. static size_t ggml_backend_webgpu_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); - return ctx->webgpu_ctx->limits.maxBufferSize; + return ctx->webgpu_ctx->limits.maxStorageBufferBindingSize; } /* End GGML Backend Buffer Type Interface */ @@ -252,8 +358,26 @@ static ggml_guid_t ggml_backend_webgpu_guid(void) { return reinterpret_cast((void *)guid_str); } +static void ggml_webgpu_init_memset_pipeline(webgpu_context webgpu_ctx) { + // we use the maximum workgroup size for the memset pipeline + size_t max_wg_size = webgpu_ctx->limits.maxComputeWorkgroupSizeX; + size_t max_threads = max_wg_size * webgpu_ctx->limits.maxComputeWorkgroupsPerDimension; + // Size the elems_per_thread so that the largest buffer size can be handled + webgpu_ctx->memset_elems_per_thread = (webgpu_ctx->limits.maxStorageBufferBindingSize / sizeof(uint32_t) + max_threads - 1) / max_threads; + std::vector constants(2); + constants[0].key = "wg_size"; + constants[0].value = max_wg_size; + constants[1].key = "elems_per_thread"; + constants[1].value = webgpu_ctx->memset_elems_per_thread; + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->memset_pipeline, wgsl_memset, constants); + ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->memset_params_dev_buf, + 3 * sizeof(uint32_t), // 3 parameters: buffer size, offset, value + wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst); + ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->memset_params_host_buf, + 3 * sizeof(uint32_t), wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc); +} + // TODO: Does this need to be thread safe? Is it only called once? 
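// On the thread-safety question: ggml normally resolves a device once through
// the registry, but nothing here enforces that. If concurrent init ever
// matters, the one-time device/pipeline setup could be guarded, e.g. (sketch):
//   static std::once_flag once;
//   std::call_once(once, [&] { /* request device, build pipelines */ });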
-// Implementation in GGML Backend Interface section static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) { GGML_UNUSED(params); @@ -284,16 +408,18 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co }), UINT64_MAX ); - GGML_ASSERT(dev_ctx->webgpu_ctx->device != nullptr); + GGML_ASSERT(webgpu_ctx->device != nullptr); // Initialize (compute) queue - dev_ctx->webgpu_ctx->queue = dev_ctx->webgpu_ctx->device.GetQueue(); - + webgpu_ctx->queue = webgpu_ctx->device.GetQueue(); + + ggml_webgpu_init_memset_pipeline(webgpu_ctx); static ggml_backend_webgpu_context backend_ctx; backend_ctx.name = GGML_WEBGPU_NAME + std::string(": ") + dev_ctx->device_name; - backend_ctx.webgpu_ctx = dev_ctx->webgpu_ctx; + backend_ctx.webgpu_ctx = webgpu_ctx; + // See GGML Backend Interface section static ggml_backend backend = { /* .guid = */ ggml_backend_webgpu_guid(), /* .interface = */ ggml_backend_webgpu_i, @@ -304,8 +430,8 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co return &backend; } -// Implementation in GGML Backend Buffer Type Interface section static ggml_backend_buffer_type_t ggml_backend_webgpu_device_get_buffer_type(ggml_backend_dev_t dev) { + // See GGML Backend Buffer Type Interface section static struct ggml_backend_buffer_type ggml_backend_webgpu_buffer_type = { /* .iface = */ { /* .get_name = */ ggml_backend_webgpu_buffer_type_get_name, @@ -323,8 +449,8 @@ static ggml_backend_buffer_type_t ggml_backend_webgpu_device_get_buffer_type(ggm } -// Implementation in GGML Backend Host Buffer Type Interface static ggml_backend_buffer_type_t ggml_backend_webgpu_device_get_host_buffer_type(ggml_backend_dev_t dev) { + // See GGML Backend Host Buffer Type Interface section static struct ggml_backend_buffer_type ggml_backend_webgpu_buffer_type_host = { /* .iface = */ { /* .get_name = */ ggml_backend_webgpu_host_buffer_type_name, @@ -390,7 +516,6 @@ static size_t ggml_backend_webgpu_reg_get_device_count(ggml_backend_reg_t reg) { // TODO: Does this need to be thread safe? Is it only called once? 
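// C++11 guarantees thread-safe initialization of the function-local statics
// used below, but the subsequent writes to their fields are unsynchronized;
// the working assumption is that the registry enumerates devices once.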
// Only one device is supported for now -// Implementation in GGML Backend Device Interface section static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t reg, size_t index) { GGML_ASSERT(index == 0); WEBGPU_LOG_DEBUG("ggml_backend_reg_get_device()"); @@ -424,6 +549,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t GGML_LOG_INFO("ggml_webgpu: adapter_info: vendor_id: %u | vendor: %s | architecture: %s | device_id: %u | name: %s | device_desc: %s\n", info.vendorID, info.vendor.data, info.architecture.data, info.deviceID, info.device.data, info.description.data); + // See GGML Backend Device Interface section static ggml_backend_device device = { /* .iface = */ ggml_backend_webgpu_device_i, /* .reg = */ reg, diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py index b3470fef9483f..daec8fe87dfda 100755 --- a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +++ b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py @@ -1,32 +1,31 @@ import os import argparse -def to_c_array(name, data): - varname = f"wgsl_{name}" - byte_array = ', '.join(f'0x{b:02x}' for b in data) - return f"""\ -const unsigned char {varname}[] = {{ - {byte_array} -}}; -const unsigned int {varname}_len = sizeof({varname}); -""" +def escape_triple_quotes(wgsl): + # Simple defense in case of embedded """ + return wgsl.replace('"""', '\\"""') + +def to_cpp_string_literal(varname, content): + return f'const char* wgsl_{varname} = R"({content})";\n' def main(): parser = argparse.ArgumentParser() - parser.add_argument('--input', required=True, help='Input directory with .wgsl files') - parser.add_argument('--output', required=True, help='Output .hpp file path') + parser.add_argument('--input', required=True) + parser.add_argument('--output', required=True) args = parser.parse_args() with open(args.output, 'w', encoding='utf-8') as out: - out.write("// Auto-generated WGSL header\n\n") + out.write("// Auto-generated shader embedding \n\n") for fname in sorted(os.listdir(args.input)): - if fname.endswith('.wgsl'): - path = os.path.join(args.input, fname) - varname = os.path.splitext(fname)[0] - with open(path, 'rb') as f: - data = f.read() - out.write(to_c_array(varname, data)) - out.write('\n') + if not fname.endswith('.wgsl'): + continue + shader_path = os.path.join(args.input, fname) + varname = os.path.splitext(fname)[0] + with open(shader_path, 'r', encoding='utf-8') as f: + content = f.read() + content = escape_triple_quotes(content) + out.write(to_cpp_string_literal(varname, content)) + out.write('\n') if __name__ == '__main__': main() diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl index 7a76640b1ad0f..32c0b2b67d5a2 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl @@ -11,27 +11,30 @@ struct Params { @group(0) @binding(1) var params: Params; -// TODO: figure out workgroup size -@compute @workgroup_size(64) +override wg_size: u32; +override elems_per_thread: u32; + +@compute @workgroup_size(wg_size) fn main(@builtin(global_invocation_id) gid: vec3) { let i = gid.x * 4u; let start = params.offset; let end = params.offset + params.size; - // Each thread writes one u32 (4 bytes) - let byte_index = start + i; - if (byte_index + 4u <= end) { - output_buffer[(byte_index >> 2u)] = params.value; - } else { - // Handle tail (unaligned) - for (var j: u32 = 0u; j < 4u; j = j + 1u) { - let idx = byte_index + j; 
- if (idx < end) { - let word_idx = idx >> 2u; - let byte_offset = (idx & 3u) * 8u; - let mask = ~(0xffu << byte_offset); - let existing = output_buffer[word_idx]; - output_buffer[word_idx] = (existing & mask) | (params.value & 0xffu) << byte_offset; + for (var j: u32 = 0u; j < elems_per_thread; j = j + 1u) { + let byte_index = start + i + j; + if (byte_index + 4u <= end) { + output_buffer[(byte_index >> 2u)] = params.value; + } else { + // Handle tail (unaligned) + for (var k: u32 = 0u; k < 4u; k = k + 1u) { + let idx = byte_index + k; + if (idx < end) { + let word_idx = idx >> 2u; + let byte_offset = (idx & 3u) * 8u; + let mask = ~(0xffu << byte_offset); + let existing = output_buffer[word_idx]; + output_buffer[word_idx] = (existing & mask) | ((params.value & 0xffu) << byte_offset); + } } } } From 2d24a8ada1b08d1d3aa9548fb1e9f3e2b3c3250f Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Sun, 8 Jun 2025 20:26:37 -0700 Subject: [PATCH 09/25] Implement set_tensor as webgpu WriteBuffer, remove host_buffer stubs since webgpu doesn't support it --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 50 ++++++---------------------- 1 file changed, 11 insertions(+), 39 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 849803a3384cf..885a1ab9bcb98 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -225,16 +225,22 @@ static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffe pass.End(); wgpu::CommandBuffer commands = encoder.Finish(); - // async, do we need to wait on this? + // TODO, async, do we need to wait on this? webgpu_ctx->queue.Submit(1, &commands); } -// TODO static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); + ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; + webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx; + + size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; + + // TODO: wait on this? + webgpu_ctx->queue.WriteBuffer(buf_ctx->buffer, total_offset, data, size); } -// TODO +// TODO: we need a staging buffer for this, since WebGPU does not allow reading from storage buffers directly. 
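+// Readback sketch: create a staging buffer with
+// wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, encode a
+// CopyBufferToBuffer() from the storage buffer into it, Submit(), then
+// MapAsync() with wgpu::MapMode::Read and memcpy out of the mapped range.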
static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); } @@ -301,21 +307,6 @@ static size_t ggml_backend_webgpu_buffer_type_get_max_size(ggml_backend_buffer_t /* End GGML Backend Buffer Type Interface */ -/* GGML Backend Host Buffer Type Interface */ - -static const char * ggml_backend_webgpu_host_buffer_type_name(ggml_backend_buffer_type_t buft) { - GGML_UNUSED(buft); - return GGML_WEBGPU_NAME "_Host"; -} - -// WebGPU doesn't specify a memory map alignment like Vulkan, so we use the same value as the storage buffer alignment -static size_t ggml_backend_webgpu_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); - return ctx->webgpu_ctx->limits.minStorageBufferOffsetAlignment; -} - -/* End GGML Backend Host Buffer Type Interface */ - /* GGML Backend Device Interface */ static const char * ggml_backend_webgpu_device_get_name(ggml_backend_dev_t dev) { @@ -347,7 +338,7 @@ static void ggml_backend_webgpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_webgpu_device_get_memory(dev, &props->memory_free, &props->memory_total); props->caps = { /* .async = */ false, - /* .host_buffer = */ true, // maybe? not sure what this means yet + /* .host_buffer = */ false, /* .buffer_from_host_ptr = */ false, /* .events = */ false, }; @@ -448,25 +439,6 @@ static ggml_backend_buffer_type_t ggml_backend_webgpu_device_get_buffer_type(ggm return &ggml_backend_webgpu_buffer_type; } - -static ggml_backend_buffer_type_t ggml_backend_webgpu_device_get_host_buffer_type(ggml_backend_dev_t dev) { - // See GGML Backend Host Buffer Type Interface section - static struct ggml_backend_buffer_type ggml_backend_webgpu_buffer_type_host = { - /* .iface = */ { - /* .get_name = */ ggml_backend_webgpu_host_buffer_type_name, - /* .alloc_buffer = */ NULL, // TODO - /* .get_alignment = */ ggml_backend_webgpu_host_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, - }, - /* .device = */ dev, - /* .context = */ NULL, - }; - - return &ggml_backend_webgpu_buffer_type_host; -} - static bool ggml_backend_webgpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { GGML_UNUSED(dev); return buft->iface.get_name == ggml_backend_webgpu_buffer_type_get_name; @@ -490,7 +462,7 @@ static struct ggml_backend_device_i ggml_backend_webgpu_device_i = { /* .get_props = */ ggml_backend_webgpu_device_get_props, /* .init_backend = */ ggml_backend_webgpu_device_init, /* .get_buffer_type = */ ggml_backend_webgpu_device_get_buffer_type, - /* .get_host_buffer_type = */ ggml_backend_webgpu_device_get_host_buffer_type, + /* .get_host_buffer_type = */ NULL, /* .buffer_from_host_ptr = */ NULL, /* .supports_op = */ ggml_backend_webgpu_device_supports_op, /* .supports_buft = */ ggml_backend_webgpu_device_supports_buft, From d0480cae1d07469263ab8d3e7d8c258e112c4799 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Mon, 9 Jun 2025 12:55:12 -0700 Subject: [PATCH 10/25] Implement get_tensor and buffer_clear --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 176 ++++++++++++++++----------- 1 file changed, 107 insertions(+), 69 
deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 885a1ab9bcb98..a0e578751cc10 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -46,6 +46,9 @@ struct webgpu_context_struct { wgpu::Buffer memset_params_dev_buf; wgpu::Buffer memset_params_host_buf; size_t memset_elems_per_thread; + + // Staging buffer for reading data from the GPU + wgpu::Buffer get_tensor_staging_buf; }; typedef std::shared_ptr webgpu_context; @@ -116,6 +119,71 @@ static void ggml_webgpu_create_buffer(wgpu::Device &device, wgpu::Buffer &buffer /** End WebGPU object initializations */ +/** WebGPU Actions */ + +static void * ggml_backend_webgpu_map_buffer(webgpu_context ctx, wgpu::Buffer buffer, wgpu::MapMode mode, size_t offset, size_t size) { + ctx->instance.WaitAny(buffer.MapAsync( + mode, offset, size, wgpu::CallbackMode::WaitAnyOnly, + [](wgpu::MapAsyncStatus status, wgpu::StringView message) { + if (status != wgpu::MapAsyncStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to map buffer: %s\n", message.data); + } + }), + UINT64_MAX + ); + return buffer.GetMappedRange(); +} + +static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer buf, uint8_t value, size_t offset, size_t size) { + wgpu::Device device = ctx->device; + + // map the host parameters buffer + uint32_t * params = (uint32_t *)ggml_backend_webgpu_map_buffer(ctx, ctx->memset_params_host_buf, + wgpu::MapMode::Write, 0, ctx->memset_params_host_buf.GetSize()); + + // This is a trick to set all bytes of a u32 to the same 1 byte value. + uint32_t val32 = (uint32_t)value * 0x01010101; + params[0] = (uint32_t)offset; + params[1] = (uint32_t)size; + params[2] = val32; + ctx->memset_params_host_buf.Unmap(); + + wgpu::BindGroupEntry entries[2]; + entries[0].binding = 0; // binding for the buffer to memset + entries[0].buffer = buf; + entries[0].offset = 0; + entries[0].size = buf.GetSize(); + entries[1].binding = 1; // binding for the parameters + entries[1].buffer = ctx->memset_params_dev_buf; + entries[1].offset = 0; + entries[1].size = ctx->memset_params_dev_buf.GetSize(); + + wgpu::BindGroupDescriptor bind_group_desc; + bind_group_desc.layout = ctx->memset_pipeline.GetBindGroupLayout(0); + bind_group_desc.entryCount = 2; + bind_group_desc.entries = entries; + wgpu::BindGroup bind_group = device.CreateBindGroup(&bind_group_desc); + + wgpu::CommandEncoder encoder = device.CreateCommandEncoder(); + encoder.CopyBufferToBuffer( + ctx->memset_params_host_buf, 0, + ctx->memset_params_dev_buf, 0, + ctx->memset_params_dev_buf.GetSize() + ); + wgpu::ComputePassEncoder pass = encoder.BeginComputePass(); + pass.SetPipeline(ctx->memset_pipeline); + pass.SetBindGroup(0, bind_group); + size_t elems_per_wg = ctx->limits.maxComputeWorkgroupSizeX * ctx->memset_elems_per_thread; + pass.DispatchWorkgroups((((size + 3)/4) + elems_per_wg - 1) / elems_per_wg, 1, 1); + pass.End(); + wgpu::CommandBuffer commands = encoder.Finish(); + + // TODO, async, do we need to wait on this? 
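+    // Reusing the single params host buffer across calls is safe without an
+    // explicit wait here: the next MapAsync() on it cannot complete until the
+    // submitted CopyBufferToBuffer that reads from it has finished.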
+ ctx->queue.Submit(1, &commands); +} + +/** End WebGPU Actions */ + /** GGML Backend Interface */ static const char * ggml_backend_webgpu_name(ggml_backend_t backend) { @@ -167,66 +235,12 @@ static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffe WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor: size is zero, nothing to do."); return; } - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor(" << buffer << ", " << tensor << ", " << value << ", " << offset << ", " << size << ")"); - ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; - webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx; - wgpu::Device device = webgpu_ctx->device; - // map the host parameters buffer - webgpu_ctx->instance.WaitAny(webgpu_ctx->memset_params_host_buf.MapAsync( - wgpu::MapMode::Write, 0, webgpu_ctx->memset_params_host_buf.GetSize(), wgpu::CallbackMode::WaitAnyOnly, - [](wgpu::MapAsyncStatus status, wgpu::StringView message) { - if (status != wgpu::MapAsyncStatus::Success) { - GGML_LOG_ERROR("ggml_webgpu: Failed to map buffer: %s\n", message.data); - } - }), - UINT64_MAX - ); + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor(" << buffer << ", " << tensor << ", " << value << ", " << offset << ", " << size << ")"); - // Set the host parameter buffer - uint32_t * params = (uint32_t *)webgpu_ctx->memset_params_host_buf.GetMappedRange(); + ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; - uint32_t val32 = (uint32_t)value * 0x01010101; - params[0] = (uint32_t)total_offset; - params[1] = (uint32_t)size; - params[2] = val32; - webgpu_ctx->memset_params_host_buf.Unmap(); - - // buffer to memset - wgpu::Buffer buf = buf_ctx->buffer; - - wgpu::BindGroupEntry entries[2]; - entries[0].binding = 0; // binding for the buffer to memset - entries[0].buffer = buf; - entries[0].offset = 0; - entries[0].size = buf.GetSize(); - entries[1].binding = 1; // binding for the parameters - entries[1].buffer = webgpu_ctx->memset_params_dev_buf; - entries[1].offset = 0; - entries[1].size = webgpu_ctx->memset_params_dev_buf.GetSize(); - - wgpu::BindGroupDescriptor bind_group_desc; - bind_group_desc.layout = webgpu_ctx->memset_pipeline.GetBindGroupLayout(0); - bind_group_desc.entryCount = 2; - bind_group_desc.entries = entries; - wgpu::BindGroup bind_group = device.CreateBindGroup(&bind_group_desc); - - wgpu::CommandEncoder encoder = device.CreateCommandEncoder(); - encoder.CopyBufferToBuffer( - webgpu_ctx->memset_params_host_buf, 0, - webgpu_ctx->memset_params_dev_buf, 0, - webgpu_ctx->memset_params_dev_buf.GetSize() - ); - wgpu::ComputePassEncoder pass = encoder.BeginComputePass(); - pass.SetPipeline(webgpu_ctx->memset_pipeline); - pass.SetBindGroup(0, bind_group); - size_t elems_per_wg = webgpu_ctx->limits.maxComputeWorkgroupSizeX * webgpu_ctx->memset_elems_per_thread; - pass.DispatchWorkgroups((((size + 3)/4) + elems_per_wg - 1) / elems_per_wg, 1, 1); - pass.End(); - wgpu::CommandBuffer commands = encoder.Finish(); - - // TODO, async, do we need to wait on this? 
- webgpu_ctx->queue.Submit(1, &commands); + ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, value, total_offset, size); } static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { @@ -243,33 +257,57 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, // TODO: we need a staging buffer for this, since WebGPU does not allow reading from storage buffers directly. static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); -} -// TODO -static bool ggml_backend_webgpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { - GGML_UNUSED(buffer); - GGML_UNUSED(src); - GGML_UNUSED(dst); + ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; + webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx; + wgpu::Device device = webgpu_ctx->device; - return true; + size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; + + if (webgpu_ctx->get_tensor_staging_buf == nullptr || + webgpu_ctx->get_tensor_staging_buf.GetSize() < size) { + // Create a new staging buffer if it doesn't exist or is too small + if (webgpu_ctx->get_tensor_staging_buf) { + webgpu_ctx->get_tensor_staging_buf.Destroy(); + } + ggml_webgpu_create_buffer(device, webgpu_ctx->get_tensor_staging_buf, size, + wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead); + } + + // Copy the data from the buffer to the staging buffer + wgpu::CommandEncoder encoder = device.CreateCommandEncoder(); + encoder.CopyBufferToBuffer(buf_ctx->buffer, total_offset, webgpu_ctx->get_tensor_staging_buf, 0, size); + wgpu::CommandBuffer commands = encoder.Finish(); + // Submit the command buffer to the queue + webgpu_ctx->queue.Submit(1, &commands); + + // Map the staging buffer to read the data + const void * mapped_range = ggml_backend_webgpu_map_buffer(webgpu_ctx, webgpu_ctx->get_tensor_staging_buf, + wgpu::MapMode::Read, 0, size); + + // Copy the data from the mapped range to the output buffer + std::memcpy(data, mapped_range, size); + webgpu_ctx->get_tensor_staging_buf.Unmap(); } // TODO static void ggml_backend_webgpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - GGML_UNUSED(buffer); - GGML_UNUSED(value); + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_clear(" << buffer << ", " << value << ")"); + + ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; + ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, value, 0, buf_ctx->buffer.GetSize()); } static ggml_backend_buffer_i ggml_backend_webgpu_buffer_interface = { /* .free_buffer = */ ggml_backend_webgpu_buffer_free_buffer, /* .get_base = */ ggml_backend_webgpu_buffer_get_base, - /* .init_tensor = */ NULL, // TODO: should we implement this? + /* .init_tensor = */ NULL, // TODO: optional, needed? 
/* .memset_tensor = */ ggml_backend_webgpu_buffer_memset_tensor, /* .set_tensor = */ ggml_backend_webgpu_buffer_set_tensor, /* .get_tensor = */ ggml_backend_webgpu_buffer_get_tensor, - /* .cpy_tensor = */ ggml_backend_webgpu_buffer_cpy_tensor, + /* .cpy_tensor = */ NULL, // TODO: optional, implement this /* .clear = */ ggml_backend_webgpu_buffer_clear, - /* .reset = */ NULL, + /* .reset = */ NULL, // TODO: optional, think it coordinates with .init_tensor }; /* End GGML Backend Buffer Interface */ From f8a53eee3b45dacd8a1de74845edde99b57dd929 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 10 Jun 2025 17:18:11 -0700 Subject: [PATCH 11/25] Finish rest of setup --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index a0e578751cc10..ef3349d515171 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -187,6 +187,7 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer b /** GGML Backend Interface */ static const char * ggml_backend_webgpu_name(ggml_backend_t backend) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_name()"); ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *)backend->context; return ctx->name.c_str(); } @@ -219,6 +220,7 @@ static ggml_backend_i ggml_backend_webgpu_i = { /* GGML Backend Buffer Interface */ static void ggml_backend_webgpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_free_buffer()"); ggml_backend_webgpu_buffer_context * ctx = static_cast(buffer->context); ctx->buffer.Destroy(); delete ctx; @@ -226,6 +228,7 @@ static void ggml_backend_webgpu_buffer_free_buffer(ggml_backend_buffer_t buffer) // Returns the "fake" base pointer. static void * ggml_backend_webgpu_buffer_get_base(ggml_backend_buffer_t buffer) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_get_base()"); GGML_UNUSED(buffer); return webgpu_ptr_base; } @@ -315,6 +318,7 @@ static ggml_backend_buffer_i ggml_backend_webgpu_buffer_interface = { /* GGML Backend Buffer Type Interface */ static const char * ggml_backend_webgpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_get_name()"); ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); return ctx->device_name.c_str(); } @@ -333,12 +337,14 @@ static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_b } static size_t ggml_backend_webgpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_get_alignment()"); ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); return ctx->webgpu_ctx->limits.minStorageBufferOffsetAlignment; } // maxBufferSize might be larger, but you can't bind more than maxStorageBufferBindingSize to a single binding. 
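// For scale: the WebGPU spec's baseline limits are 256 MiB for maxBufferSize
// and 128 MiB for maxStorageBufferBindingSize, though adapters may raise both.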
static size_t ggml_backend_webgpu_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_get_max_size()"); ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); return ctx->webgpu_ctx->limits.maxStorageBufferBindingSize; } @@ -348,16 +354,20 @@ static size_t ggml_backend_webgpu_buffer_type_get_max_size(ggml_backend_buffer_t /* GGML Backend Device Interface */ static const char * ggml_backend_webgpu_device_get_name(ggml_backend_dev_t dev) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_device_get_name()"); ggml_backend_webgpu_device_context * ctx = static_cast(dev->context); return ctx->device_name.c_str(); } static const char * ggml_backend_webgpu_device_get_description(ggml_backend_dev_t dev) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_device_get_description()"); ggml_backend_webgpu_device_context * ctx = static_cast(dev->context); return ctx->device_desc.c_str(); } static void ggml_backend_webgpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_device_get_memory()"); + ggml_backend_webgpu_device_context * ctx = static_cast(dev->context); // TODO: what do we actually want to return here? *free = ctx->webgpu_ctx->limits.maxBufferSize * WEBGPU_MAX_BUFFERS; @@ -365,6 +375,7 @@ static void ggml_backend_webgpu_device_get_memory(ggml_backend_dev_t dev, size_t } static enum ggml_backend_dev_type ggml_backend_webgpu_device_get_type(ggml_backend_dev_t dev) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_device_get_type()"); GGML_UNUSED(dev); return GGML_BACKEND_DEVICE_TYPE_GPU; } @@ -413,21 +424,22 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co WEBGPU_LOG_DEBUG("ggml_backend_webgpu_device_init()"); ggml_backend_webgpu_device_context * dev_ctx = static_cast(dev->context); + webgpu_context webgpu_ctx = dev_ctx->webgpu_ctx; // Initialize device - wgpu::DeviceDescriptor deviceDescriptor; - deviceDescriptor.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, + wgpu::DeviceDescriptor dev_desc; + dev_desc.requiredLimits = &webgpu_ctx->limits; + dev_desc.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, [](const wgpu::Device& device, wgpu::DeviceLostReason reason, wgpu::StringView message) { GGML_UNUSED(device); GGML_LOG_ERROR("ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast(reason), message.data); }); - deviceDescriptor.SetUncapturedErrorCallback( + dev_desc.SetUncapturedErrorCallback( [](const wgpu::Device& device, wgpu::ErrorType reason, wgpu::StringView message) { GGML_UNUSED(device); GGML_LOG_ERROR("ggml_webgpu: Device error! 
Reason: %d, Message: %s\n", static_cast(reason), message.data); }); - webgpu_context webgpu_ctx = dev_ctx->webgpu_ctx; - webgpu_ctx->instance.WaitAny(webgpu_ctx->adapter.RequestDevice(&deviceDescriptor, wgpu::CallbackMode::WaitAnyOnly, + webgpu_ctx->instance.WaitAny(webgpu_ctx->adapter.RequestDevice(&dev_desc, wgpu::CallbackMode::WaitAnyOnly, [webgpu_ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) { if (status != wgpu::RequestDeviceStatus::Success) { GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", message.data); @@ -515,11 +527,13 @@ static struct ggml_backend_device_i ggml_backend_webgpu_device_i = { /* GGML Backend Registration Interface */ static const char * ggml_backend_webgpu_reg_get_name(ggml_backend_reg_t reg) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_reg_get_name()"); ggml_backend_webgpu_reg_context * ctx = static_cast(reg->context); return ctx->name; } static size_t ggml_backend_webgpu_reg_get_device_count(ggml_backend_reg_t reg) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_reg_get_device_count()"); ggml_backend_webgpu_reg_context * ctx = static_cast(reg->context); return ctx->device_count; } From 39d956d1eaf1cc025467278983cfea518627f3a8 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 11 Jun 2025 11:14:57 -0700 Subject: [PATCH 12/25] Start work on compute graph --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 47 +++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index ef3349d515171..3e0dde29ad379 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -199,6 +199,39 @@ static void ggml_backend_webgpu_free(ggml_backend_t backend) { // TODO: cleanup } +// Returns true if node has enqueued work into the queue, false otherwise +static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ + if (ggml_is_empty(node)) { + return false; + } + + WEBGPU_LOG_DEBUG("ggml_webgpu_encode_node(" << node << ", " << ggml_op_name(node->op) << ")"); + + switch (node->op) { + // no-op + case GGML_OP_NONE: + // these next four ops modify the logical view of the tensor, but do not change its data + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + return false; + default: + return false; + } +} + +static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_graph_compute(" << cgraph->n_nodes << " nodes)"); + + ggml_backend_webgpu_context * backend_ctx = static_cast(backend->context); + webgpu_context ctx = backend_ctx->webgpu_ctx; + + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_webgpu_encode_node(ctx, cgraph->nodes[i]); + } +} + static ggml_backend_i ggml_backend_webgpu_i = { /* .get_name = */ ggml_backend_webgpu_name, /* .free = */ ggml_backend_webgpu_free, @@ -209,8 +242,8 @@ static ggml_backend_i ggml_backend_webgpu_i = { /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, // TODO - /* .graph_compute = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_webgpu_graph_compute, /* .event_record = */ NULL, /* .event_wait = */ NULL, }; @@ -228,7 +261,6 @@ static void ggml_backend_webgpu_buffer_free_buffer(ggml_backend_buffer_t buffer) // Returns the "fake" base pointer. 
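// ggml computes tensor addresses as base + offset, so with the fake base at
// 0x1000 a tensor 64 bytes into its buffer reports data == (void *)0x1040, and
// webgpu_tensor_offset() recovers the 64 by subtracting webgpu_ptr_base.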
static void * ggml_backend_webgpu_buffer_get_base(ggml_backend_buffer_t buffer) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_get_base()"); GGML_UNUSED(buffer); return webgpu_ptr_base; } @@ -354,13 +386,11 @@ static size_t ggml_backend_webgpu_buffer_type_get_max_size(ggml_backend_buffer_t /* GGML Backend Device Interface */ static const char * ggml_backend_webgpu_device_get_name(ggml_backend_dev_t dev) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_device_get_name()"); ggml_backend_webgpu_device_context * ctx = static_cast(dev->context); return ctx->device_name.c_str(); } static const char * ggml_backend_webgpu_device_get_description(ggml_backend_dev_t dev) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_device_get_description()"); ggml_backend_webgpu_device_context * ctx = static_cast(dev->context); return ctx->device_desc.c_str(); } @@ -499,6 +529,13 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const // what should we support first? switch (op->op) { + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + return true; + default: return false; } From d036f10016e6f8ade8f72bb02c2e8bc6793c3321 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Thu, 12 Jun 2025 14:33:58 -0700 Subject: [PATCH 13/25] Basic mat mul working --- ggml/include/ggml-webgpu.h | 3 + ggml/src/ggml-webgpu/ggml-webgpu.cpp | 113 ++++++++++++++++-- .../src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl | 25 ++++ 3 files changed, 132 insertions(+), 9 deletions(-) create mode 100644 ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl diff --git a/ggml/include/ggml-webgpu.h b/ggml/include/ggml-webgpu.h index cf6cb451648e9..65b8ed9bb6644 100644 --- a/ggml/include/ggml-webgpu.h +++ b/ggml/include/ggml-webgpu.h @@ -9,6 +9,9 @@ extern "C" { #define GGML_WEBGPU_NAME "WebGPU" +// Needed for examples in ggml +GGML_BACKEND_API ggml_backend_t ggml_backend_webgpu_init(void); + GGML_BACKEND_API ggml_backend_reg_t ggml_backend_webgpu_reg(void); #ifdef __cplusplus diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 3e0dde29ad379..d0e146385b545 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -41,10 +41,15 @@ struct webgpu_context_struct { wgpu::Queue queue; wgpu::Limits limits; - // memset pipeline and parameter buffers + // pipelines and parameter buffers + // TODO: reuse params buffers for different pipelines when possible wgpu::ComputePipeline memset_pipeline; wgpu::Buffer memset_params_dev_buf; wgpu::Buffer memset_params_host_buf; + wgpu::ComputePipeline mul_mat_pipeline; + wgpu::Buffer mul_mat_params_dev_buf; + wgpu::Buffer mul_mat_params_host_buf; + size_t memset_elems_per_thread; // Staging buffer for reading data from the GPU @@ -87,7 +92,7 @@ struct ggml_backend_webgpu_buffer_context { /* WebGPU object initializations */ -static void ggml_webgpu_create_pipeline(wgpu::Device &device, wgpu::ComputePipeline &pipeline, const char * shader_code, const std::vector &constants = {}) { +static void ggml_webgpu_create_pipeline(wgpu::Device &device, wgpu::ComputePipeline &pipeline, const char * shader_code, const char * label, const std::vector &constants = {}) { WEBGPU_LOG_DEBUG("ggml_webgpu_create_pipeline()"); wgpu::ShaderSourceWGSL shader_source; shader_source.code = shader_code; @@ -96,6 +101,7 @@ static void ggml_webgpu_create_pipeline(wgpu::Device &device, wgpu::ComputePipel wgpu::ShaderModule shader_module = device.CreateShaderModule(&shader_desc); 
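    // Note: CreateShaderModule() does not fail synchronously on bad WGSL;
    // compile and validation errors surface asynchronously, e.g. through the
    // uncaptured-error callback installed on the device.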
wgpu::ComputePipelineDescriptor pipeline_desc; + pipeline_desc.label = label; pipeline_desc.compute.module = shader_module; pipeline_desc.compute.entryPoint = "main"; // Entry point in the WGSL code pipeline_desc.layout = nullptr; // Guessing that nullptr means auto layout @@ -121,7 +127,7 @@ static void ggml_webgpu_create_buffer(wgpu::Device &device, wgpu::Buffer &buffer /** WebGPU Actions */ -static void * ggml_backend_webgpu_map_buffer(webgpu_context ctx, wgpu::Buffer buffer, wgpu::MapMode mode, size_t offset, size_t size) { +static void ggml_backend_webgpu_map_buffer(webgpu_context ctx, wgpu::Buffer buffer, wgpu::MapMode mode, size_t offset, size_t size) { ctx->instance.WaitAny(buffer.MapAsync( mode, offset, size, wgpu::CallbackMode::WaitAnyOnly, [](wgpu::MapAsyncStatus status, wgpu::StringView message) { @@ -131,15 +137,14 @@ static void * ggml_backend_webgpu_map_buffer(webgpu_context ctx, wgpu::Buffer bu }), UINT64_MAX ); - return buffer.GetMappedRange(); } static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer buf, uint8_t value, size_t offset, size_t size) { wgpu::Device device = ctx->device; // map the host parameters buffer - uint32_t * params = (uint32_t *)ggml_backend_webgpu_map_buffer(ctx, ctx->memset_params_host_buf, - wgpu::MapMode::Write, 0, ctx->memset_params_host_buf.GetSize()); + ggml_backend_webgpu_map_buffer(ctx, ctx->memset_params_host_buf, wgpu::MapMode::Write, 0, ctx->memset_params_host_buf.GetSize()); + uint32_t * params = (uint32_t *) ctx->memset_params_host_buf.GetMappedRange(); // This is a trick to set all bytes of a u32 to the same 1 byte value. uint32_t val32 = (uint32_t)value * 0x01010101; @@ -207,6 +212,7 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ WEBGPU_LOG_DEBUG("ggml_webgpu_encode_node(" << node << ", " << ggml_op_name(node->op) << ")"); + switch (node->op) { // no-op case GGML_OP_NONE: @@ -216,6 +222,76 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: return false; + + // basic matrix multiplication for now, 2d tensors only + case GGML_OP_MUL_MAT: { + const ggml_tensor * src0 = node->src[0]; + ggml_backend_webgpu_buffer_context * src0_ctx = (ggml_backend_webgpu_buffer_context *) src0->buffer->context; + size_t src0_offset = webgpu_tensor_offset(src0) + src0->view_offs; + const ggml_tensor * src1 = node->src[1]; + ggml_backend_webgpu_buffer_context * src1_ctx = (ggml_backend_webgpu_buffer_context *) src1->buffer->context; + size_t src1_offset = webgpu_tensor_offset(src1) + src1->view_offs; + ggml_backend_webgpu_buffer_context * dst_ctx = (ggml_backend_webgpu_buffer_context *) node->buffer->context; + + size_t dst_offset = webgpu_tensor_offset(node) + node->view_offs; + + wgpu::Device device = ctx->device; + + // map the host parameters buffer + ggml_backend_webgpu_map_buffer(ctx, ctx->mul_mat_params_host_buf, + wgpu::MapMode::Write, 0, ctx->mul_mat_params_host_buf.GetSize()); + uint32_t * params = (uint32_t *) ctx->mul_mat_params_host_buf.GetMappedRange(); + + params[0] = (uint32_t)node->ne[1]; // number of rows in result (M) + params[1] = (uint32_t)node->ne[0]; // number of columns in result (N) + params[2] = (uint32_t)src0->ne[0]; // number of columns in src0/src1 (K) + ctx->mul_mat_params_host_buf.Unmap(); + + wgpu::BindGroupEntry entries[4]; + entries[0].binding = 0; // binding for the buffer to memset + entries[0].buffer = src0_ctx->buffer; + entries[0].offset = src0_offset; + entries[0].size = ggml_nbytes(src0); 
+ + entries[1].binding = 1; // binding for the buffer to memset + entries[1].buffer = src1_ctx->buffer; + entries[1].offset = src1_offset; + entries[1].size = ggml_nbytes(src1); + + entries[2].binding = 2; // binding for the buffer to memset + entries[2].buffer = dst_ctx->buffer; + entries[2].offset = dst_offset; + entries[2].size = ggml_nbytes(node); + + entries[3].binding = 3; // binding for the parameters + entries[3].buffer = ctx->mul_mat_params_dev_buf; + entries[3].offset = 0; + entries[3].size = ctx->mul_mat_params_dev_buf.GetSize(); + + wgpu::BindGroupDescriptor bind_group_desc; + bind_group_desc.layout = ctx->mul_mat_pipeline.GetBindGroupLayout(0); + bind_group_desc.entryCount = 4; + bind_group_desc.entries = entries; + wgpu::BindGroup bind_group = device.CreateBindGroup(&bind_group_desc); + + wgpu::CommandEncoder encoder = device.CreateCommandEncoder(); + encoder.CopyBufferToBuffer( + ctx->mul_mat_params_host_buf, 0, + ctx->mul_mat_params_dev_buf, 0, + ctx->mul_mat_params_dev_buf.GetSize() + ); + wgpu::ComputePassEncoder pass = encoder.BeginComputePass(); + pass.SetPipeline(ctx->mul_mat_pipeline); + pass.SetBindGroup(0, bind_group); + pass.DispatchWorkgroups(node->ne[0] * node->ne[1]); + pass.End(); + wgpu::CommandBuffer commands = encoder.Finish(); + + // TODO, don't submit here, batch submissions + ctx->queue.Submit(1, &commands); + return true; + } + default: return false; } @@ -230,6 +306,8 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str for (int i = 0; i < cgraph->n_nodes; i++) { ggml_webgpu_encode_node(ctx, cgraph->nodes[i]); } + + return GGML_STATUS_SUCCESS; } static ggml_backend_i ggml_backend_webgpu_i = { @@ -317,8 +395,8 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, webgpu_ctx->queue.Submit(1, &commands); // Map the staging buffer to read the data - const void * mapped_range = ggml_backend_webgpu_map_buffer(webgpu_ctx, webgpu_ctx->get_tensor_staging_buf, - wgpu::MapMode::Read, 0, size); + ggml_backend_webgpu_map_buffer(webgpu_ctx, webgpu_ctx->get_tensor_staging_buf, wgpu::MapMode::Read, 0, size); + const void * mapped_range = webgpu_ctx->get_tensor_staging_buf.GetConstMappedRange(); // Copy the data from the mapped range to the output buffer std::memcpy(data, mapped_range, size); @@ -439,7 +517,7 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context webgpu_ctx) { constants[0].value = max_wg_size; constants[1].key = "elems_per_thread"; constants[1].value = webgpu_ctx->memset_elems_per_thread; - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->memset_pipeline, wgsl_memset, constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->memset_pipeline, wgsl_memset, "memset", constants); ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->memset_params_dev_buf, 3 * sizeof(uint32_t), // 3 parameters: buffer size, offset, value wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst); @@ -447,6 +525,15 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context webgpu_ctx) { 3 * sizeof(uint32_t), wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc); } +static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context webgpu_ctx) { + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline, wgsl_mul_mat, "mul_mat"); + ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->mul_mat_params_dev_buf, + 3 * sizeof(uint32_t), // 3 parameters: M, N, K + wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst); + 
ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->mul_mat_params_host_buf, + 3 * sizeof(uint32_t), wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc); +} + // TODO: Does this need to be thread safe? Is it only called once? static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) { GGML_UNUSED(params); @@ -485,6 +572,7 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co webgpu_ctx->queue = webgpu_ctx->device.GetQueue(); ggml_webgpu_init_memset_pipeline(webgpu_ctx); + ggml_webgpu_init_mul_mat_pipeline(webgpu_ctx); static ggml_backend_webgpu_context backend_ctx; backend_ctx.name = GGML_WEBGPU_NAME + std::string(": ") + dev_ctx->device_name; @@ -534,6 +622,7 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const case GGML_OP_VIEW: case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: + case GGML_OP_MUL_MAT: return true; default: @@ -654,4 +743,10 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() { return &reg; } +ggml_backend_t ggml_backend_webgpu_init(void) { + ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_webgpu_reg(), 0); + + return ggml_backend_webgpu_device_init(dev, nullptr); +} + GGML_BACKEND_DL_IMPL(ggml_backend_webgpu_reg) \ No newline at end of file diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl new file mode 100644 index 0000000000000..fd041f67cc3d8 --- /dev/null +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl @@ -0,0 +1,25 @@ +struct MulMatParams { + m: u32, + n: u32, + k: u32 +}; + +@group(0) @binding(0) var<storage, read_write> src0: array<f32>; +@group(0) @binding(1) var<storage, read_write> src1: array<f32>; +@group(0) @binding(2) var<storage, read_write> dst: array<f32>; + +@group(0) @binding(3) var<uniform> params: MulMatParams; + +@compute @workgroup_size(64) +fn main(@builtin(global_invocation_id) global_id: vec3<u32>) { + if (global_id.x >= params.m * params.n) { + return; + } + let row = global_id.x / params.n; + let col = global_id.x % params.n; + var sum = 0.0; + for (var i: u32 = 0u; i < params.k; i = i + 1u) { + sum = sum + src0[col * params.k + i] * src1[row * params.k + i]; + } + dst[row * params.n + col] = sum; +} \ No newline at end of file From b8a220761c52250fdbfcaeb36ffcb4b76d5f1921 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 17 Jun 2025 13:31:07 -0700 Subject: [PATCH 14/25] Work on emscripten build --- ggml/src/ggml-webgpu/CMakeLists.txt | 13 +++++++++++-- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 1 - 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-webgpu/CMakeLists.txt b/ggml/src/ggml-webgpu/CMakeLists.txt index 1ce496b391afe..db53384e9b442 100644 --- a/ggml/src/ggml-webgpu/CMakeLists.txt +++ b/ggml/src/ggml-webgpu/CMakeLists.txt @@ -1,6 +1,5 @@ cmake_minimum_required(VERSION 3.13) -find_package(Dawn REQUIRED) find_package(Python3 REQUIRED) # Shader locations @@ -37,9 +36,19 @@ ggml_add_backend_library(ggml-webgpu add_dependencies(ggml-webgpu generate_shaders) +if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + set(EMDAWNWEBGPU_DIR "" CACHE PATH "Path to emdawnwebgpu_pkg") + + target_compile_options(ggml-webgpu PRIVATE "--use-port=${EMDAWNWEBGPU_DIR}/emdawnwebgpu.port.py") + target_link_options(ggml-webgpu PRIVATE "--use-port=${EMDAWNWEBGPU_DIR}/emdawnwebgpu.port.py") +else() + find_package(Dawn REQUIRED) + set(DawnWebGPU_TARGET dawn::webgpu_dawn) +endif() + if (GGML_WEBGPU_DEBUG) target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_DEBUG=1) endif() target_include_directories(ggml-webgpu PRIVATE ${SHADER_OUTPUT_DIR})
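+# Note: on Emscripten builds ${DawnWebGPU_TARGET} is intentionally left unset; the emdawnwebgpu port pulled in via --use-port supplies the WebGPU bindings instead.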
-target_link_libraries(ggml-webgpu PRIVATE dawn::webgpu_dawn) +target_link_libraries(ggml-webgpu PRIVATE ${DawnWebGPU_TARGET}) \ No newline at end of file diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index d0e146385b545..8ddbb3fdf8dec 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -1,7 +1,6 @@ #include "ggml-webgpu.h" #include -#include #include "ggml-impl.h" #include "ggml-backend-impl.h" From c09bfc50a5b3b288c13062e0a3922201fae98225 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 17 Jun 2025 14:19:38 -0700 Subject: [PATCH 15/25] Basic WebGPU backend instructions --- docs/build.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docs/build.md b/docs/build.md index 680b0d8398741..5f735de9ed966 100644 --- a/docs/build.md +++ b/docs/build.md @@ -557,6 +557,23 @@ ninja To read documentation for how to build on Android, [click here](./android.md) +## WebGPU [In Progress] + +The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. + +In the llama.cpp directory, build with CMake: + +``` +cmake -B build -DGGML_WEBGPU=ON +cmake --build build --config Release +``` + +### Browser Support + +WebGPU allows cross-platform access to the GPU from supported browsers. We utilize [Emscripten](https://emscripten.org/) to compile ggml's WebGPU backend to WebAssembly. Emscripten does not officially support WebGPU bindings yet, but Dawn currently maintains its own WebGPU bindings called emdawnwebgpu. + +Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/src/emdawnwebgpu/) to download or build the emdawnwebgpu package (Note that it might be safer to build them locally, so that they stay in sync with the version of Dawn you have installed above). When building using CMake, the path to the emdawnwebgpu port file needs to be set with the flag `EMDAWNWEBGPU_DIR`. + ## Notes about GPU-accelerated backends The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`. 
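As a quick sanity check of a native build, the backend can also be initialized directly through the public header. A minimal sketch (it assumes nothing beyond the `ggml_backend_webgpu_init()` entry point added in this series and the standard ggml-backend helpers `ggml_backend_name()`/`ggml_backend_free()`):

```
#include <cstdio>

#include "ggml-backend.h"
#include "ggml-webgpu.h"

int main() {
    // Initializes the backend through the registration path (instance, adapter, device).
    ggml_backend_t backend = ggml_backend_webgpu_init();
    if (backend == nullptr) {
        fprintf(stderr, "failed to initialize the WebGPU backend\n");
        return 1;
    }
    printf("initialized backend: %s\n", ggml_backend_name(backend));
    ggml_backend_free(backend);
    return 0;
}
```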
From daa58e21fb56683d83fdb7134d75fb84586c9691 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 17 Jun 2025 15:24:04 -0700 Subject: [PATCH 16/25] Use EMSCRIPTEN flag --- ggml/src/ggml-webgpu/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-webgpu/CMakeLists.txt b/ggml/src/ggml-webgpu/CMakeLists.txt index db53384e9b442..0d67f11ba5520 100644 --- a/ggml/src/ggml-webgpu/CMakeLists.txt +++ b/ggml/src/ggml-webgpu/CMakeLists.txt @@ -36,7 +36,7 @@ ggml_add_backend_library(ggml-webgpu add_dependencies(ggml-webgpu generate_shaders) -if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") +if(EMSCRIPTEN) set(EMDAWNWEBGPU_DIR "" CACHE PATH "Path to emdawnwebgpu_pkg") target_compile_options(ggml-webgpu PRIVATE "--use-port=${EMDAWNWEBGPU_DIR}/emdawnwebgpu.port.py") From 1c396a263fd41a934d1d541c736591be46759a23 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Mon, 23 Jun 2025 16:42:02 -0700 Subject: [PATCH 17/25] Work on passing ci, implement 4d tensor multiplication --- README.md | 1 + ci/run.sh | 7 ++ ggml/src/ggml-webgpu/ggml-webgpu.cpp | 82 +++++++++++-------- ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl | 2 +- .../src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl | 47 +++++++++-- 5 files changed, 96 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 90c7364dfcba0..8aa254ef6b5fc 100644 --- a/README.md +++ b/README.md @@ -269,6 +269,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo | [Vulkan](docs/build.md#vulkan) | GPU | | [CANN](docs/build.md#cann) | Ascend NPU | | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU | +| [WebGPU [In Progress]](docs/build.md#webgpu) | All | | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All | ## Obtaining and quantizing models diff --git a/ci/run.sh b/ci/run.sh index 94005570511b6..8117dc152c2a0 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -16,6 +16,9 @@ # # with VULKAN support # GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # +# # with WebGPU support +# GG_BUILD_WEBGPU=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt +# # # with MUSA support # GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # @@ -81,6 +84,10 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1" fi +if [ ! -z ${GG_BUILD_WEBGPU} ]; then + CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1" +fi + if [ ! -z ${GG_BUILD_MUSA} ]; then # Use qy1 by default (MTT S80) MUSA_ARCH=${MUSA_ARCH:-21} diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 8ddbb3fdf8dec..c1e08fd0e137a 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -16,9 +16,16 @@ #define WEBGPU_LOG_DEBUG(msg) ((void) 0) #endif // GGML_WEBGPU_DEBUG +/* Constants */ + // TODO: find a better way to get the memory available #define WEBGPU_MAX_BUFFERS 32 +#define WEBGPU_MUL_MAT_WG_SIZE 64 +#define WEBGPU_MUL_MAT_PARAMS_SIZE (7 * sizeof(uint32_t)) // M, N, K, batch sizes, broadcasts + +/* End Constants */ + // This is a "fake" base pointer, since WebGPU buffers do not have pointers to their locations.
static void * const webgpu_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT @@ -138,18 +145,16 @@ static void ggml_backend_webgpu_map_buffer(webgpu_context ctx, wgpu::Buffer buff ); } -static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer buf, uint8_t value, size_t offset, size_t size) { +static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer buf, uint32_t value, size_t offset, size_t size) { wgpu::Device device = ctx->device; // map the host parameters buffer ggml_backend_webgpu_map_buffer(ctx, ctx->memset_params_host_buf, wgpu::MapMode::Write, 0, ctx->memset_params_host_buf.GetSize()); uint32_t * params = (uint32_t *) ctx->memset_params_host_buf.GetMappedRange(); - // This is a trick to set all bytes of a u32 to the same 1 byte value. - uint32_t val32 = (uint32_t)value * 0x01010101; params[0] = (uint32_t)offset; params[1] = (uint32_t)size; - params[2] = val32; + params[2] = value; ctx->memset_params_host_buf.Unmap(); wgpu::BindGroupEntry entries[2]; @@ -191,7 +196,6 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer b /** GGML Backend Interface */ static const char * ggml_backend_webgpu_name(ggml_backend_t backend) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_name()"); ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *)backend->context; return ctx->name.c_str(); } @@ -201,6 +205,7 @@ static void ggml_backend_webgpu_free(ggml_backend_t backend) { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_free(" << ctx->name << ")"); // TODO: cleanup + GGML_UNUSED(ctx); } // Returns true if node has enqueued work into the queue, false otherwise @@ -244,6 +249,11 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ params[0] = (uint32_t)node->ne[1]; // number of rows in result (M) params[1] = (uint32_t)node->ne[0]; // number of columns in result (N) params[2] = (uint32_t)src0->ne[0]; // number of columns in src0/src1 (K) + params[3] = (uint32_t)src0->ne[2]; // batch size in dimension 2 + params[4] = (uint32_t)src0->ne[3]; // batch size in dimension 3 + params[5] = (uint32_t)(src1->ne[2]/src0->ne[2]); // broadcast in dimension 2 + params[6] = (uint32_t)(src1->ne[3]/src0->ne[3]); // broadcast in dimension 3 + ctx->mul_mat_params_host_buf.Unmap(); wgpu::BindGroupEntry entries[4]; @@ -282,7 +292,7 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ wgpu::ComputePassEncoder pass = encoder.BeginComputePass(); pass.SetPipeline(ctx->mul_mat_pipeline); pass.SetBindGroup(0, bind_group); - pass.DispatchWorkgroups(node->ne[0] * node->ne[1]); + pass.DispatchWorkgroups((node->ne[0] * node->ne[1] * node->ne[2] * node->ne[3] + WEBGPU_MUL_MAT_WG_SIZE - 1) / WEBGPU_MUL_MAT_WG_SIZE); pass.End(); wgpu::CommandBuffer commands = encoder.Finish(); @@ -352,7 +362,9 @@ static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffe ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; - ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, value, total_offset, size); + // This is a trick to set all bytes of a u32 to the same 1 byte value. 
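+ // e.g. value = 0xAB gives val32 = 0xABABABAB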
+ uint32_t val32 = (uint32_t)value * 0x01010101; + ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, val32, total_offset, size); } static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { @@ -363,10 +375,21 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; // TODO: wait on this? - webgpu_ctx->queue.WriteBuffer(buf_ctx->buffer, total_offset, data, size); + webgpu_ctx->queue.WriteBuffer(buf_ctx->buffer, total_offset, data, (size/4)*4); + + if (size % 4 != 0) { + // If size is not a multiple of 4, we need to memset the remaining bytes + size_t remaining_size = size % 4; + // pack the remaining bytes into a uint32_t + uint32_t val32 = 0; + for (size_t i = 0; i < remaining_size; i++) { + ((uint8_t *)&val32)[i] = ((const uint8_t *)data)[size - remaining_size + i]; + } + // memset the remaining bytes + ggml_backend_webgpu_buffer_memset(webgpu_ctx, buf_ctx->buffer, val32, total_offset + (size - remaining_size), remaining_size); + } } -// TODO: we need a staging buffer for this, since WebGPU does not allow reading from storage buffers directly. static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); @@ -376,33 +399,39 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; + size_t final_size = size; + if (size % 4 != 0) { + // If size is not a multiple of 4, we need to round it up to the next multiple of 4 + final_size = size + (4 - (size % 4)); + } + if (webgpu_ctx->get_tensor_staging_buf == nullptr || - webgpu_ctx->get_tensor_staging_buf.GetSize() < size) { + webgpu_ctx->get_tensor_staging_buf.GetSize() < final_size) { // Create a new staging buffer if it doesn't exist or is too small if (webgpu_ctx->get_tensor_staging_buf) { webgpu_ctx->get_tensor_staging_buf.Destroy(); } - ggml_webgpu_create_buffer(device, webgpu_ctx->get_tensor_staging_buf, size, + ggml_webgpu_create_buffer(device, webgpu_ctx->get_tensor_staging_buf, final_size, wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead); } // Copy the data from the buffer to the staging buffer wgpu::CommandEncoder encoder = device.CreateCommandEncoder(); - encoder.CopyBufferToBuffer(buf_ctx->buffer, total_offset, webgpu_ctx->get_tensor_staging_buf, 0, size); + encoder.CopyBufferToBuffer(buf_ctx->buffer, total_offset, webgpu_ctx->get_tensor_staging_buf, 0, final_size); wgpu::CommandBuffer commands = encoder.Finish(); // Submit the command buffer to the queue webgpu_ctx->queue.Submit(1, &commands); // Map the staging buffer to read the data - ggml_backend_webgpu_map_buffer(webgpu_ctx, webgpu_ctx->get_tensor_staging_buf, wgpu::MapMode::Read, 0, size); - const void * mapped_range = webgpu_ctx->get_tensor_staging_buf.GetConstMappedRange(); + ggml_backend_webgpu_map_buffer(webgpu_ctx, webgpu_ctx->get_tensor_staging_buf, wgpu::MapMode::Read, 0, final_size); + // Must specify size here since the staging buffer might be larger than the tensor size + const void * mapped_range = webgpu_ctx->get_tensor_staging_buf.GetConstMappedRange(0, final_size); // Copy the data from the mapped range 
to the output buffer std::memcpy(data, mapped_range, size); webgpu_ctx->get_tensor_staging_buf.Unmap(); } -// TODO static void ggml_backend_webgpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_clear(" << buffer << ", " << value << ")"); @@ -427,7 +456,6 @@ static ggml_backend_buffer_i ggml_backend_webgpu_buffer_interface = { /* GGML Backend Buffer Type Interface */ static const char * ggml_backend_webgpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_get_name()"); ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); return ctx->device_name.c_str(); } @@ -446,14 +474,12 @@ static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_b } static size_t ggml_backend_webgpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_get_alignment()"); ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); return ctx->webgpu_ctx->limits.minStorageBufferOffsetAlignment; } // maxBufferSize might be larger, but you can't bind more than maxStorageBufferBindingSize to a single binding. static size_t ggml_backend_webgpu_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_get_max_size()"); ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); return ctx->webgpu_ctx->limits.maxStorageBufferBindingSize; } @@ -473,8 +499,6 @@ static const char * ggml_backend_webgpu_device_get_description(ggml_backend_dev_ } static void ggml_backend_webgpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_device_get_memory()"); - ggml_backend_webgpu_device_context * ctx = static_cast(dev->context); // TODO: what do we actually want to return here? *free = ctx->webgpu_ctx->limits.maxBufferSize * WEBGPU_MAX_BUFFERS; @@ -482,7 +506,6 @@ static void ggml_backend_webgpu_device_get_memory(ggml_backend_dev_t dev, size_t } static enum ggml_backend_dev_type ggml_backend_webgpu_device_get_type(ggml_backend_dev_t dev) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_device_get_type()"); GGML_UNUSED(dev); return GGML_BACKEND_DEVICE_TYPE_GPU; } @@ -526,11 +549,10 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context webgpu_ctx) { static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context webgpu_ctx) { ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline, wgsl_mul_mat, "mul_mat"); - ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->mul_mat_params_dev_buf, - 3 * sizeof(uint32_t), // 3 parameters: M, N, K + ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->mul_mat_params_dev_buf, WEBGPU_MUL_MAT_PARAMS_SIZE, wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst); - ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->mul_mat_params_host_buf, - 3 * sizeof(uint32_t), wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc); + ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->mul_mat_params_host_buf,WEBGPU_MUL_MAT_PARAMS_SIZE, + wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc); } // TODO: Does this need to be thread safe? Is it only called once? @@ -617,13 +639,9 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const // what should we support first? 
switch (op->op) { case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - case GGML_OP_MUL_MAT: return true; - + case GGML_OP_MUL_MAT: + return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; default: return false; } @@ -652,13 +670,11 @@ /* GGML Backend Registration Interface */ static const char * ggml_backend_webgpu_reg_get_name(ggml_backend_reg_t reg) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_reg_get_name()"); ggml_backend_webgpu_reg_context * ctx = static_cast<ggml_backend_webgpu_reg_context *>(reg->context); return ctx->name; } static size_t ggml_backend_webgpu_reg_get_device_count(ggml_backend_reg_t reg) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_reg_get_device_count()"); ggml_backend_webgpu_reg_context * ctx = static_cast<ggml_backend_webgpu_reg_context *>(reg->context); return ctx->device_count; } diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl index 32c0b2b67d5a2..60142c4fa0839 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl @@ -5,7 +5,7 @@ var<storage, read_write> output_buffer: array<u32>; struct Params { offset: u32, // in bytes size: u32, // in bytes - value: u32, // four identical values + value: u32, // 4 8-bit values, which are either repeating (memset_tensor) or may be separate (cleaning up unaligned set_tensor operations) }; @group(0) @binding(1) diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl index fd041f67cc3d8..31973278ee18f 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl @@ -1,25 +1,54 @@ struct MulMatParams { m: u32, n: u32, - k: u32 + k: u32, + bs02: u32, + bs03: u32, + broadcast2: u32, + broadcast3: u32 }; -@group(0) @binding(0) var<storage, read_write> src0: array<f32>; -@group(0) @binding(1) var<storage, read_write> src1: array<f32>; -@group(0) @binding(2) var<storage, read_write> dst: array<f32>; +@group(0) @binding(0) var<storage, read_write> src0: array<f32>; // N rows, K columns +@group(0) @binding(1) var<storage, read_write> src1: array<f32>; // M rows, K columns +@group(0) @binding(2) var<storage, read_write> dst: array<f32>; // M rows, N columns @group(0) @binding(3) var<uniform> params: MulMatParams; @compute @workgroup_size(64) fn main(@builtin(global_invocation_id) global_id: vec3<u32>) { - if (global_id.x >= params.m * params.n) { + let total = params.m * params.n * params.bs02 * params.broadcast2 * params.bs03 * params.broadcast3; + if (global_id.x >= total) { return; } - let row = global_id.x / params.n; - let col = global_id.x % params.n; + + let src02_stride = params.n * params.k; + let src03_stride = src02_stride * params.bs02; + + let src12_stride = params.m * params.k; + let src13_stride = src12_stride * params.bs02 * params.broadcast2; + + let dst2_stride = params.m * params.n; + let dst3_stride = dst2_stride * params.bs02 * params.broadcast2; + + let dst3_idx = global_id.x / dst3_stride; + let src03_idx = dst3_idx / params.broadcast3; // src0 may be broadcast along the third dimension + let src13_idx = dst3_idx; // src1 is not broadcast + let dst3_rem = global_id.x % dst3_stride; + + let dst2_idx = dst3_rem / dst2_stride; + let src02_idx = dst2_idx / params.broadcast2; // src0 may also be broadcast along the second dimension + let src12_idx = dst2_idx; + + let dst2_rem = dst3_rem % dst2_stride; + + let row = dst2_rem / params.n; // output row + let col = dst2_rem % params.n; // output column + var sum = 0.0; for (var i: u32 = 0u; i < params.k; i = i + 1u) { - sum = sum + src0[col * params.k + i] * src1[row * params.k + i]; + let src0_idx = src03_idx * src03_stride + src02_idx * src02_stride + col * params.k + i; + let src1_idx = src13_idx * src13_stride + src12_idx * src12_stride + row * params.k + i; + sum = sum + src0[src0_idx] * src1[src1_idx]; } - dst[row * params.n + col] = sum; + dst[dst3_idx * dst3_stride + dst2_idx * dst2_stride + row * params.n + col] = sum; } \ No newline at end of file From ecb945ebbd36bf4a2e8fd5db59acfc5419a10f72 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 24 Jun 2025 17:16:37 -0700 Subject: [PATCH 18/25] Pass thread safety test --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 236 +++++++++++++----- ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl | 32 +++ ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl | 7 +- 3 files changed, 212 insertions(+), 63 deletions(-) create mode 100644 ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index c1e08fd0e137a..d8aa0206bbab8 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -23,6 +23,8 @@ #define WEBGPU_MUL_MAT_WG_SIZE 64 #define WEBGPU_MUL_MAT_PARAMS_SIZE (7 * sizeof(uint32_t)) // M, N, K, batch sizes, broadcasts +#define WEBGPU_CPY_PARAMS_SIZE (3 * sizeof(uint32_t)) // number of elements to copy, alignments +#define WEBGPU_STORAGE_BUF_BINDING_MULT 4 // a storage buffer binding size must be a multiple of 4 /* End Constants */ @@ -46,6 +48,10 @@ struct webgpu_context_struct { wgpu::Device device; wgpu::Queue queue; wgpu::Limits limits; + wgpu::SupportedFeatures features; + + std::mutex mutex; + bool device_initialized = false; // pipelines and parameter buffers // TODO: reuse params buffers for different pipelines when possible @@ -55,8 +61,11 @@ struct webgpu_context_struct { wgpu::ComputePipeline mul_mat_pipeline; wgpu::Buffer mul_mat_params_dev_buf; wgpu::Buffer mul_mat_params_host_buf; + wgpu::ComputePipeline cpy_pipeline; + wgpu::Buffer cpy_params_dev_buf; + wgpu::Buffer cpy_params_host_buf; - size_t memset_elems_per_thread; + size_t memset_bytes_per_thread; // Staging buffer for reading data from the GPU wgpu::Buffer get_tensor_staging_buf; @@ -118,12 +127,13 @@ static void ggml_webgpu_create_pipeline(wgpu::Device &device, wgpu::ComputePipel pipeline = device.CreateComputePipeline(&pipeline_desc); } -static void ggml_webgpu_create_buffer(wgpu::Device &device, wgpu::Buffer &buffer, size_t size, wgpu::BufferUsage usage) { +static void ggml_webgpu_create_buffer(wgpu::Device &device, wgpu::Buffer &buffer, size_t size, wgpu::BufferUsage usage, const char* label) { WEBGPU_LOG_DEBUG("ggml_webgpu_create_buffer()"); wgpu::BufferDescriptor buffer_desc; buffer_desc.size = size; buffer_desc.usage = usage; + buffer_desc.label = label; buffer_desc.mappedAtCreation = false; // TODO: error handling buffer = device.CreateBuffer(&buffer_desc); @@ -146,6 +156,7 @@ static void ggml_backend_webgpu_map_buffer(webgpu_context ctx, wgpu::Buffer buff } static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer buf, uint32_t value, size_t offset, size_t size) { + std::lock_guard<std::mutex> lock(ctx->mutex); wgpu::Device device = ctx->device; // map the host parameters buffer @@ -170,6 +181,7 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer b wgpu::BindGroupDescriptor bind_group_desc; bind_group_desc.layout = ctx->memset_pipeline.GetBindGroupLayout(0); bind_group_desc.entryCount = 2; + bind_group_desc.label = "ggml_memset"; bind_group_desc.entries = entries; wgpu::BindGroup bind_group
= device.CreateBindGroup(&bind_group_desc); @@ -182,8 +194,8 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer b wgpu::ComputePassEncoder pass = encoder.BeginComputePass(); pass.SetPipeline(ctx->memset_pipeline); pass.SetBindGroup(0, bind_group); - size_t elems_per_wg = ctx->limits.maxComputeWorkgroupSizeX * ctx->memset_elems_per_thread; - pass.DispatchWorkgroups((((size + 3)/4) + elems_per_wg - 1) / elems_per_wg, 1, 1); + size_t bytes_per_wg = ctx->limits.maxComputeWorkgroupSizeX * ctx->memset_bytes_per_thread; + pass.DispatchWorkgroups(((size + 3) + bytes_per_wg - 1) / bytes_per_wg, 1, 1); pass.End(); wgpu::CommandBuffer commands = encoder.Finish(); @@ -191,6 +203,18 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer b ctx->queue.Submit(1, &commands); } +static void ggml_backend_webgpu_wait_on_submission(webgpu_context ctx) { + // Wait for the queue to finish processing all commands + ctx->instance.WaitAny(ctx->queue.OnSubmittedWorkDone(wgpu::CallbackMode::WaitAnyOnly, + [](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) { + if (status != wgpu::QueueWorkDoneStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to wait on queue: %s\n", message.data); + } + }), + UINT64_MAX + ); +} + /** End WebGPU Actions */ /** GGML Backend Interface */ @@ -218,17 +242,82 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ switch (node->op) { - // no-op + // no-ops case GGML_OP_NONE: - // these next four ops modify the logical view of the tensor, but do not change its data - case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: return false; + + case GGML_OP_CPY: { + std::lock_guard lock(ctx->mutex); + const ggml_tensor * src = node->src[0]; + ggml_backend_webgpu_buffer_context * src_ctx = (ggml_backend_webgpu_buffer_context *) src->buffer->context; + size_t src_offset = webgpu_tensor_offset(src) + src->view_offs; + // assumes power of 2 offset alignment + size_t src_misalignment = src_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); + // align to minimum offset alignment + src_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1); + ggml_backend_webgpu_buffer_context * dst_ctx = (ggml_backend_webgpu_buffer_context *) node->buffer->context; + size_t dst_offset = webgpu_tensor_offset(node) + node->view_offs; + size_t dst_misalignment = dst_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); + dst_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1); + + wgpu::Device device = ctx->device; + ggml_backend_webgpu_map_buffer(ctx, ctx->cpy_params_host_buf, + wgpu::MapMode::Write, 0, ctx->cpy_params_host_buf.GetSize()); + uint32_t * params = (uint32_t *) ctx->cpy_params_host_buf.GetMappedRange(); + uint32_t ne = (uint32_t)ggml_nelements(node); // number of elements to copy + params[0] = ne; + params[1] = src_misalignment; + params[2] = dst_misalignment; + ctx->cpy_params_host_buf.Unmap(); + + wgpu::BindGroupEntry entries[3]; + entries[0].binding = 0; + entries[0].buffer = src_ctx->buffer; + entries[0].offset = src_offset; + entries[0].size = (ggml_nbytes(src) + src_misalignment + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1); + + entries[1].binding = 1; + entries[1].buffer = dst_ctx->buffer; + entries[1].offset = dst_offset; + entries[1].size = (ggml_nbytes(node) + dst_misalignment + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1); + + entries[2].binding = 2; + entries[2].buffer 
= ctx->cpy_params_dev_buf; + entries[2].offset = 0; + entries[2].size = ctx->cpy_params_dev_buf.GetSize(); + + wgpu::BindGroupDescriptor bind_group_desc; + bind_group_desc.layout = ctx->cpy_pipeline.GetBindGroupLayout(0); + bind_group_desc.label = "ggml_op_cpy"; + bind_group_desc.entryCount = 3; + bind_group_desc.entries = entries; + wgpu::BindGroup bind_group = device.CreateBindGroup(&bind_group_desc); + + wgpu::CommandEncoder encoder = device.CreateCommandEncoder(); + encoder.CopyBufferToBuffer( + ctx->cpy_params_host_buf, 0, + ctx->cpy_params_dev_buf, 0, + ctx->cpy_params_dev_buf.GetSize() + ); + wgpu::ComputePassEncoder pass = encoder.BeginComputePass(); + pass.SetPipeline(ctx->cpy_pipeline); + pass.SetBindGroup(0, bind_group); + size_t max_wg_size = ctx->limits.maxComputeWorkgroupSizeX; + pass.DispatchWorkgroups((ne + max_wg_size - 1) / max_wg_size); + pass.End(); + wgpu::CommandBuffer commands = encoder.Finish(); + + // TODO, don't submit here, batch submissions + ctx->queue.Submit(1, &commands); + // TODO, don't wait on submission here + ggml_backend_webgpu_wait_on_submission(ctx); + return true; + } - // basic matrix multiplication for now, 2d tensors only - case GGML_OP_MUL_MAT: { + case GGML_OP_MUL_MAT: + { const ggml_tensor * src0 = node->src[0]; ggml_backend_webgpu_buffer_context * src0_ctx = (ggml_backend_webgpu_buffer_context *) src0->buffer->context; size_t src0_offset = webgpu_tensor_offset(src0) + src0->view_offs; @@ -257,22 +346,22 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ ctx->mul_mat_params_host_buf.Unmap(); wgpu::BindGroupEntry entries[4]; - entries[0].binding = 0; // binding for the buffer to memset + entries[0].binding = 0; entries[0].buffer = src0_ctx->buffer; entries[0].offset = src0_offset; entries[0].size = ggml_nbytes(src0); - entries[1].binding = 1; // binding for the buffer to memset + entries[1].binding = 1; entries[1].buffer = src1_ctx->buffer; entries[1].offset = src1_offset; entries[1].size = ggml_nbytes(src1); - entries[2].binding = 2; // binding for the buffer to memset + entries[2].binding = 2; entries[2].buffer = dst_ctx->buffer; entries[2].offset = dst_offset; entries[2].size = ggml_nbytes(node); - entries[3].binding = 3; // binding for the parameters + entries[3].binding = 3; entries[3].buffer = ctx->mul_mat_params_dev_buf; entries[3].offset = 0; entries[3].size = ctx->mul_mat_params_dev_buf.GetSize(); @@ -280,6 +369,7 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ wgpu::BindGroupDescriptor bind_group_desc; bind_group_desc.layout = ctx->mul_mat_pipeline.GetBindGroupLayout(0); bind_group_desc.entryCount = 4; + bind_group_desc.label = "ggml_op_mul_mat"; bind_group_desc.entries = entries; wgpu::BindGroup bind_group = device.CreateBindGroup(&bind_group_desc); @@ -298,6 +388,8 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ // TODO, don't submit here, batch submissions ctx->queue.Submit(1, &commands); + // TODO, don't wait on submission here + ggml_backend_webgpu_wait_on_submission(ctx); return true; } @@ -343,7 +435,6 @@ static void ggml_backend_webgpu_buffer_free_buffer(ggml_backend_buffer_t buffer) WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_free_buffer()"); ggml_backend_webgpu_buffer_context * ctx = static_cast(buffer->context); ctx->buffer.Destroy(); - delete ctx; } // Returns the "fake" base pointer. 
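The GGML_OP_CPY encoding above splits each tensor offset into an aligned base plus a misalignment so that storage-buffer bindings start on a legal boundary; this works because the alignment limit is a power of two. A standalone sketch of the arithmetic (256 is WebGPU's default minStorageBufferOffsetAlignment; the names and values here are illustrative):

```
#include <cassert>
#include <cstddef>
#include <cstdio>

int main() {
    const size_t alignment = 256;  // minStorageBufferOffsetAlignment; always a power of two
    const size_t offset    = 1000; // arbitrary tensor offset in bytes

    const size_t misalignment = offset & (alignment - 1);  // bytes past the aligned boundary
    const size_t aligned      = offset & ~(alignment - 1); // offset rounded down to the boundary

    assert(aligned % alignment == 0);
    assert(aligned + misalignment == offset);
    printf("offset=%zu -> base=%zu + misalignment=%zu\n", offset, aligned, misalignment);
    return 0;
}
```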
@@ -405,6 +496,8 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, final_size = size + (4 - (size % 4)); } + std::lock_guard lock(webgpu_ctx->mutex); + if (webgpu_ctx->get_tensor_staging_buf == nullptr || webgpu_ctx->get_tensor_staging_buf.GetSize() < final_size) { // Create a new staging buffer if it doesn't exist or is too small @@ -412,7 +505,7 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, webgpu_ctx->get_tensor_staging_buf.Destroy(); } ggml_webgpu_create_buffer(device, webgpu_ctx->get_tensor_staging_buf, final_size, - wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead); + wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "get_tensor_staging_buf"); } // Copy the data from the buffer to the staging buffer @@ -433,10 +526,10 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, } static void ggml_backend_webgpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_clear(" << buffer << ", " << value << ")"); + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_clear(" << buffer << ", " << (uint32_t) value << ")"); ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; - ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, value, 0, buf_ctx->buffer.GetSize()); + ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, value, 0, buffer->size); } static ggml_backend_buffer_i ggml_backend_webgpu_buffer_interface = { @@ -466,7 +559,7 @@ static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_b wgpu::Buffer buf; ggml_webgpu_create_buffer(ctx->webgpu_ctx->device, buf, size, - wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst); + wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst, "allocated_buffer"); ggml_backend_webgpu_buffer_context * buf_ctx = new ggml_backend_webgpu_buffer_context(ctx->webgpu_ctx, buf); @@ -532,27 +625,39 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context webgpu_ctx) { // we use the maximum workgroup size for the memset pipeline size_t max_wg_size = webgpu_ctx->limits.maxComputeWorkgroupSizeX; size_t max_threads = max_wg_size * webgpu_ctx->limits.maxComputeWorkgroupsPerDimension; - // Size the elems_per_thread so that the largest buffer size can be handled - webgpu_ctx->memset_elems_per_thread = (webgpu_ctx->limits.maxStorageBufferBindingSize / sizeof(uint32_t) + max_threads - 1) / max_threads; + // Size the bytes_per_thread so that the largest buffer size can be handled + webgpu_ctx->memset_bytes_per_thread = (webgpu_ctx->limits.maxStorageBufferBindingSize + max_threads - 1) / max_threads; std::vector constants(2); constants[0].key = "wg_size"; constants[0].value = max_wg_size; - constants[1].key = "elems_per_thread"; - constants[1].value = webgpu_ctx->memset_elems_per_thread; + constants[1].key = "bytes_per_thread"; + constants[1].value = webgpu_ctx->memset_bytes_per_thread; ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->memset_pipeline, wgsl_memset, "memset", constants); ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->memset_params_dev_buf, 3 * sizeof(uint32_t), // 3 parameters: buffer size, offset, value - wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst); + wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst, "memset_params_dev_buf"); ggml_webgpu_create_buffer(webgpu_ctx->device, 
webgpu_ctx->memset_params_host_buf, - 3 * sizeof(uint32_t), wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc); + 3 * sizeof(uint32_t), wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc, "memset_params_host_buf"); } static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context webgpu_ctx) { ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline, wgsl_mul_mat, "mul_mat"); ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->mul_mat_params_dev_buf, WEBGPU_MUL_MAT_PARAMS_SIZE, - wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst); - ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->mul_mat_params_host_buf,WEBGPU_MUL_MAT_PARAMS_SIZE, - wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc); + wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst, "mul_mat_params_dev_buf"); + ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->mul_mat_params_host_buf, WEBGPU_MUL_MAT_PARAMS_SIZE, + wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc, "mul_mat_params_host_buf"); +} + +static void ggml_webgpu_init_cpy_pipeline(webgpu_context webgpu_ctx) { + std::vector constants(1); + constants[0].key = "wg_size"; + constants[0].value = webgpu_ctx->limits.maxComputeWorkgroupSizeX; + + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->cpy_pipeline, wgsl_cpy, "cpy", constants); + ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->cpy_params_dev_buf, WEBGPU_CPY_PARAMS_SIZE, + wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst, "cpy_params_dev_buf"); + ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->cpy_params_host_buf, WEBGPU_CPY_PARAMS_SIZE, + wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc, "cpy_params_host_buf"); } // TODO: Does this need to be thread safe? Is it only called once? @@ -564,36 +669,44 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co ggml_backend_webgpu_device_context * dev_ctx = static_cast(dev->context); webgpu_context webgpu_ctx = dev_ctx->webgpu_ctx; - // Initialize device - wgpu::DeviceDescriptor dev_desc; - dev_desc.requiredLimits = &webgpu_ctx->limits; - dev_desc.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, - [](const wgpu::Device& device, wgpu::DeviceLostReason reason, wgpu::StringView message) { - GGML_UNUSED(device); - GGML_LOG_ERROR("ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast(reason), message.data); - }); - dev_desc.SetUncapturedErrorCallback( - [](const wgpu::Device& device, wgpu::ErrorType reason, wgpu::StringView message) { - GGML_UNUSED(device); - GGML_LOG_ERROR("ggml_webgpu: Device error! 
Reason: %d, Message: %s\n", static_cast(reason), message.data); - }); - webgpu_ctx->instance.WaitAny(webgpu_ctx->adapter.RequestDevice(&dev_desc, wgpu::CallbackMode::WaitAnyOnly, - [webgpu_ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) { - if (status != wgpu::RequestDeviceStatus::Success) { - GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", message.data); - return; - } - webgpu_ctx->device = device; - }), - UINT64_MAX - ); - GGML_ASSERT(webgpu_ctx->device != nullptr); - - // Initialize (compute) queue - webgpu_ctx->queue = webgpu_ctx->device.GetQueue(); - - ggml_webgpu_init_memset_pipeline(webgpu_ctx); - ggml_webgpu_init_mul_mat_pipeline(webgpu_ctx); + std::lock_guard lock(webgpu_ctx->mutex); + + if (!webgpu_ctx->device_initialized) { + // Initialize device + wgpu::DeviceDescriptor dev_desc; + dev_desc.requiredLimits = &webgpu_ctx->limits; + dev_desc.requiredFeatures = webgpu_ctx->features.features; + dev_desc.requiredFeatureCount = webgpu_ctx->features.featureCount; + dev_desc.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, + [](const wgpu::Device& device, wgpu::DeviceLostReason reason, wgpu::StringView message) { + GGML_UNUSED(device); + GGML_LOG_ERROR("ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast(reason), message.data); + }); + dev_desc.SetUncapturedErrorCallback( + [](const wgpu::Device& device, wgpu::ErrorType reason, wgpu::StringView message) { + GGML_UNUSED(device); + GGML_LOG_ERROR("ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast(reason), message.data); + }); + webgpu_ctx->instance.WaitAny(webgpu_ctx->adapter.RequestDevice(&dev_desc, wgpu::CallbackMode::WaitAnyOnly, + [webgpu_ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) { + if (status != wgpu::RequestDeviceStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", message.data); + return; + } + webgpu_ctx->device = device; + }), + UINT64_MAX + ); + GGML_ASSERT(webgpu_ctx->device != nullptr); + + // Initialize (compute) queue + webgpu_ctx->queue = webgpu_ctx->device.GetQueue(); + + ggml_webgpu_init_memset_pipeline(webgpu_ctx); + ggml_webgpu_init_mul_mat_pipeline(webgpu_ctx); + ggml_webgpu_init_cpy_pipeline(webgpu_ctx); + webgpu_ctx->device_initialized = true; + } static ggml_backend_webgpu_context backend_ctx; backend_ctx.name = GGML_WEBGPU_NAME + std::string(": ") + dev_ctx->device_name; @@ -636,10 +749,13 @@ static bool ggml_backend_webgpu_device_supports_buft(ggml_backend_dev_t dev, ggm static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { GGML_UNUSED(dev); - // what should we support first? 
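+ // Advertise only ops that are true no-ops or have a kernel; CPY is limited to F32->F16 and MUL_MAT to F32 inputs for now.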
switch (op->op) { case GGML_OP_NONE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: return true; + case GGML_OP_CPY: + return op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32; case GGML_OP_MUL_MAT: return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; default: @@ -702,6 +818,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t GGML_ASSERT(ctx->adapter != nullptr); ctx->adapter.GetLimits(&ctx->limits); + ctx->adapter.GetFeatures(&ctx->features); wgpu::AdapterInfo info{}; ctx->adapter.GetInfo(&info); @@ -738,6 +855,7 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_reg()"); webgpu_context webgpu_ctx = std::make_shared<webgpu_context_struct>(); + webgpu_ctx->device_initialized = false; static ggml_backend_webgpu_reg_context ctx; ctx.webgpu_ctx = webgpu_ctx; diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl new file mode 100644 index 0000000000000..fb14916578f57 --- /dev/null +++ b/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl @@ -0,0 +1,32 @@ +enable f16; + +@group(0) @binding(0) +var<storage, read_write> src: array<f32>; + +@group(0) @binding(1) +var<storage, read_write> dst: array<f16>; + +struct Params { + ne: u32, // number of elements + src_offset: u32, // src offset in bytes + dst_offset: u32 // dst offset in bytes +}; + +@group(0) @binding(2) +var<uniform> params: Params; + +override wg_size: u32; +const elems_per_thread: u32 = 4; + +@compute @workgroup_size(wg_size) +fn main(@builtin(global_invocation_id) gid: vec3<u32>) { + let idx = gid.x * elems_per_thread; + // chunked loop + for (var j: u32 = 0u; j < elems_per_thread; j = j + 1u) { + let i = idx + j; + if (i < params.ne) { + // Convert f32 to f16 + dst[params.dst_offset/2 + i] = f16(src[params.src_offset/4 + i]); + } + } +} diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl index 60142c4fa0839..cb7c8c3e09e91 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl @@ -1,4 +1,3 @@ -// memset.wgsl @group(0) @binding(0) var<storage, read_write> output_buffer: array<u32>; @@ -12,15 +11,15 @@ struct Params { var<uniform> params: Params; override wg_size: u32; -override elems_per_thread: u32; +override bytes_per_thread: u32; @compute @workgroup_size(wg_size) fn main(@builtin(global_invocation_id) gid: vec3<u32>) { - let i = gid.x * 4u; + let i = gid.x * bytes_per_thread; let start = params.offset; let end = params.offset + params.size; - for (var j: u32 = 0u; j < elems_per_thread; j = j + 1u) { + for (var j: u32 = 0u; j < bytes_per_thread; j = j + 1u) { let byte_index = start + i + j; if (byte_index + 4u <= end) { output_buffer[(byte_index >> 2u)] = params.value; From 0f0543b9f95afaea8c16a78b6a4eed0a8df421d5 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 2 Jul 2025 17:01:22 -0700 Subject: [PATCH 19/25] Implement permuting for mul_mat and cpy --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 38 ++++++++++--- ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl | 54 ++++++++++++++----- .../src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl | 19 +++---- 3 files changed, 82 insertions(+), 29 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index d8aa0206bbab8..8602baf9c15d6 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -22,8 +22,8 @@ #define WEBGPU_MAX_BUFFERS 32 #define WEBGPU_MUL_MAT_WG_SIZE 64 -#define WEBGPU_MUL_MAT_PARAMS_SIZE (7 * sizeof(uint32_t)) // M, N, K, batch sizes, broadcasts +#define WEBGPU_MUL_MAT_PARAMS_SIZE (13 * sizeof(uint32_t)) // M, N, K, batch sizes, broadcasts -#define WEBGPU_CPY_PARAMS_SIZE (3 * sizeof(uint32_t)) // number of elements to copy, alignments +#define WEBGPU_CPY_PARAMS_SIZE (15 * sizeof(uint32_t)) // strides and offsets #define WEBGPU_STORAGE_BUF_BINDING_MULT 4 // a storage buffer binding size must be a multiple of 4 /* End Constants */ @@ -266,10 +266,26 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ ggml_backend_webgpu_map_buffer(ctx, ctx->cpy_params_host_buf, wgpu::MapMode::Write, 0, ctx->cpy_params_host_buf.GetSize()); uint32_t * params = (uint32_t *) ctx->cpy_params_host_buf.GetMappedRange(); - uint32_t ne = (uint32_t)ggml_nelements(node); // number of elements to copy + uint32_t ne = (uint32_t)ggml_nelements(node); params[0] = ne; params[1] = src_misalignment; params[2] = dst_misalignment; + + // Convert byte-strides to element-strides + params[3] = (uint32_t)src->nb[0]/ggml_type_size(src->type); + params[4] = (uint32_t)src->nb[1]/ggml_type_size(src->type); + params[5] = (uint32_t)src->nb[2]/ggml_type_size(src->type); + params[6] = (uint32_t)src->nb[3]/ggml_type_size(src->type); + params[7] = (uint32_t)node->nb[0]/ggml_type_size(node->type); + params[8] = (uint32_t)node->nb[1]/ggml_type_size(node->type); + params[9] = (uint32_t)node->nb[2]/ggml_type_size(node->type); + params[10] = (uint32_t)node->nb[3]/ggml_type_size(node->type); + // Logical shape — same for both tensors even if permuted + params[11] = (uint32_t)(src->ne[0]); + params[12] = (uint32_t)(src->ne[1]); + params[13] = (uint32_t)(src->ne[2]); + params[14] = (uint32_t)(src->ne[3]); + ctx->cpy_params_host_buf.Unmap(); wgpu::BindGroupEntry entries[3]; @@ -338,10 +354,18 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ params[0] = (uint32_t)node->ne[1]; // number of rows in result (M) params[1] = (uint32_t)node->ne[0]; // number of columns in result (N) params[2] = (uint32_t)src0->ne[0]; // number of columns in src0/src1 (K) - params[3] = (uint32_t)src0->ne[2]; // batch size in dimension 2 - params[4] = (uint32_t)src0->ne[3]; // batch size in dimension 3 - params[5] = (uint32_t)(src1->ne[2]/src0->ne[2]); // broadcast in dimension 2 - params[6] = (uint32_t)(src1->ne[3]/src0->ne[3]); // broadcast in dimension 3 + + params[3] = (uint32_t)src0->nb[1]/ggml_type_size(src0->type); // stride (elements) of src0 in dimension 1 + params[4] = (uint32_t)src1->nb[1]/ggml_type_size(src1->type); // stride (elements) of src1 in dimension 1 + params[5] = (uint32_t)src0->nb[2]/ggml_type_size(src0->type); // stride (elements) of src0 in dimension 2 + params[6] = (uint32_t)src1->nb[2]/ggml_type_size(src1->type); // stride (elements) of src1 in dimension 2 + params[7] = (uint32_t)src0->nb[3]/ggml_type_size(src0->type); // stride (elements) of src0 in dimension 3 + params[8] = (uint32_t)src1->nb[3]/ggml_type_size(src1->type); // stride (elements) of src1 in dimension 3 + + params[9] = (uint32_t)src0->ne[2]; // batch size in dimension 2 + params[10] = (uint32_t)src0->ne[3]; // batch size in dimension 3 + params[11] = (uint32_t)(src1->ne[2]/src0->ne[2]); // broadcast in dimension 2 + params[12] = (uint32_t)(src1->ne[3]/src0->ne[3]); // broadcast in dimension 3 ctx->mul_mat_params_host_buf.Unmap(); diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl index fb14916578f57..6b18d68094831 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl @@ -7,26 +7,54 @@ var<storage, read_write> src: array<f32>; var<storage, read_write> dst: array<f16>; struct Params { - ne: u32, // number of elements - src_offset: u32, // src offset in bytes - dst_offset: u32 // dst offset in bytes + ne: u32, // total number of elements + src_offset: u32, // in bytes + dst_offset: u32, // in bytes + + // Strides (in elements) — may be permuted + stride_src0: u32, + stride_src1: u32, + stride_src2: u32, + stride_src3: u32, + + stride_dst0: u32, + stride_dst1: u32, + stride_dst2: u32, + stride_dst3: u32, + + // Logical shape (same for both tensors) + ne0: u32, + ne1: u32, + ne2: u32, + ne3: u32, }; @group(0) @binding(2) var<uniform> params: Params; override wg_size: u32; -const elems_per_thread: u32 = 4; - @compute @workgroup_size(wg_size) fn main(@builtin(global_invocation_id) gid: vec3<u32>) { - let idx = gid.x * elems_per_thread; - // chunked loop - for (var j: u32 = 0u; j < elems_per_thread; j = j + 1u) { - let i = idx + j; - if (i < params.ne) { - // Convert f32 to f16 - dst[params.dst_offset/2 + i] = f16(src[params.src_offset/4 + i]); - } + if (gid.x >= params.ne) { + return; } + + var i = gid.x; + + let i3 = i / (params.ne2 * params.ne1 * params.ne0); + i = i % (params.ne2 * params.ne1 * params.ne0); + + let i2 = i / (params.ne1 * params.ne0); + i = i % (params.ne1 * params.ne0); + + let i1 = i / params.ne0; + let i0 = i % params.ne0; + + let src_idx = i0 * params.stride_src0 + i1 * params.stride_src1 + + i2 * params.stride_src2 + i3 * params.stride_src3; + + let dst_idx = i0 * params.stride_dst0 + i1 * params.stride_dst1 + + i2 * params.stride_dst2 + i3 * params.stride_dst3; + + dst[params.dst_offset / 2 + dst_idx] = f16(src[params.src_offset / 4 + src_idx]); } diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl index 31973278ee18f..3b8b31474ca90 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl @@ -2,6 +2,13 @@ struct MulMatParams { m: u32, n: u32, k: u32, + stride_01: u32, + stride_11: u32, + stride_02: u32, + stride_12: u32, + stride_03: u32, + stride_13: u32, + bs02: u32, bs03: u32, broadcast2: u32, @@ -21,12 +28,6 @@ fn main(@builtin(global_invocation_id) global_id: vec3<u32>) { return; } - let src02_stride = params.n * params.k; - let src03_stride = src02_stride * params.bs02; - - let src12_stride = params.m * params.k; - let src13_stride = src12_stride * params.bs02 * params.broadcast2; - let dst2_stride = params.m * params.n; let dst3_stride = dst2_stride * params.bs02 * params.broadcast2; @@ -37,7 +38,7 @@ fn main(@builtin(global_invocation_id) global_id: vec3<u32>) { let dst2_idx = dst3_rem / dst2_stride; let src02_idx = dst2_idx / params.broadcast2; // src0 may also be broadcast along the second dimension - let src12_idx = dst2_idx; + let src12_idx = dst2_idx; // src1 is not broadcast let dst2_rem = dst3_rem % dst2_stride; @@ -46,8 +47,8 @@ fn main(@builtin(global_invocation_id) global_id: vec3<u32>) { var sum = 0.0; for (var i: u32 = 0u; i < params.k; i = i + 1u) { - let src0_idx = src03_idx * src03_stride + src02_idx * src02_stride + col * params.k + i; - let src1_idx = src13_idx * src13_stride + src12_idx * src12_stride + row * params.k + i; + let src0_idx = src03_idx * params.stride_03 + src02_idx * params.stride_02 + col * params.stride_01 + i; + let src1_idx = src13_idx * params.stride_13 + src12_idx * params.stride_12 + row * params.stride_11 + i; sum = sum + src0[src0_idx] * src1[src1_idx]; } dst[dst3_idx * dst3_stride + dst2_idx * dst2_stride + row * params.n + col] = sum; From 2eb7626dbbe4dff7c5c59f95ebbe7b9b918191bf Mon Sep 17 00:00:00
2001 From: Reese Levine Date: Thu, 3 Jul 2025 11:48:22 -0700 Subject: [PATCH 20/25] minor cleanups --- docs/build.md | 2 +- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 13 ++++--------- ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl | 6 +++--- 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/docs/build.md b/docs/build.md index 5f735de9ed966..1b6fa8733c93c 100644 --- a/docs/build.md +++ b/docs/build.md @@ -572,7 +572,7 @@ cmake --build build --config Release WebGPU allows cross-platform access to the GPU from supported browsers. We utilize [Emscripten](https://emscripten.org/) to compile ggml's WebGPU backend to WebAssembly. Emscripten does not officially support WebGPU bindings yet, but Dawn currently maintains its own WebGPU bindings called emdawnwebgpu. -Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/src/emdawnwebgpu/) to download or build the emdawnwebgpu package (Note that it might be safer to build them locally, so that they stay in sync with the version of Dawn you have installed above). When building using CMake, the path to the emdawnwebgpu port file needs to be set with the flag `EMDAWNWEBGPU_DIR`. +Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/src/emdawnwebgpu/) to download or build the emdawnwebgpu package (Note that it might be safer to build the emdawnwebgpu package locally, so that it stays in sync with the version of Dawn you have installed above). When building using CMake, the path to the emdawnwebgpu port file needs to be set with the flag `EMDAWNWEBGPU_DIR`. ## Notes about GPU-accelerated backends diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 8602baf9c15d6..c072f279dcb63 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -18,9 +18,6 @@ /* Constants */ -// TODO: find a better way to get the memory available -#define WEBGPU_MAX_BUFFERS 32 - #define WEBGPU_MUL_MAT_WG_SIZE 64 #define WEBGPU_MUL_MAT_PARAMS_SIZE (13 * sizeof(uint32_t)) // M, N, K, batch sizes, broadcasts #define WEBGPU_CPY_PARAMS_SIZE (15 * sizeof(uint32_t)) // strides and offsets @@ -119,7 +116,7 @@ static void ggml_webgpu_create_pipeline(wgpu::Device &device, wgpu::ComputePipel pipeline_desc.label = label; pipeline_desc.compute.module = shader_module; pipeline_desc.compute.entryPoint = "main"; // Entry point in the WGSL code - pipeline_desc.layout = nullptr; // Guessing that nullptr means auto layout + pipeline_desc.layout = nullptr; // nullptr means auto layout if (constants.size() > 0) { pipeline_desc.compute.constants = constants.data(); pipeline_desc.compute.constantCount = constants.size(); @@ -199,7 +196,6 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer b pass.End(); wgpu::CommandBuffer commands = encoder.Finish(); - // TODO, async, do we need to wait on this? ctx->queue.Submit(1, &commands); } @@ -489,7 +485,6 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; - // TODO: wait on this?
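+ // Note: WriteBuffer is ordered with respect to later submissions on the same queue, so no explicit wait is needed here.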
webgpu_ctx->queue.WriteBuffer(buf_ctx->buffer, total_offset, data, (size/4)*4); if (size % 4 != 0) { @@ -617,9 +612,9 @@ static const char * ggml_backend_webgpu_device_get_description(ggml_backend_dev_ static void ggml_backend_webgpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { ggml_backend_webgpu_device_context * ctx = static_cast<ggml_backend_webgpu_device_context *>(dev->context); - // TODO: what do we actually want to return here? - *free = ctx->webgpu_ctx->limits.maxBufferSize * WEBGPU_MAX_BUFFERS; - *total = ctx->webgpu_ctx->limits.maxBufferSize * WEBGPU_MAX_BUFFERS; + // TODO: what do we actually want to return here? maxBufferSize might not be the full available memory. + *free = ctx->webgpu_ctx->limits.maxBufferSize; + *total = ctx->webgpu_ctx->limits.maxBufferSize; } static enum ggml_backend_dev_type ggml_backend_webgpu_device_get_type(ggml_backend_dev_t dev) { diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl index 6b18d68094831..8c05312425c0b 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl @@ -8,8 +8,8 @@ var<storage, read_write> dst: array<f16>; struct Params { ne: u32, // total number of elements - src_offset: u32, // in bytes - dst_offset: u32, // in bytes + offset_src: u32, // in bytes + offset_dst: u32, // in bytes // Strides (in elements) — may be permuted stride_src0: u32, @@ -56,5 +56,5 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) { let dst_idx = i0 * params.stride_dst0 + i1 * params.stride_dst1 + i2 * params.stride_dst2 + i3 * params.stride_dst3; - dst[params.dst_offset / 2 + dst_idx] = f16(src[params.src_offset / 4 + src_idx]); + dst[params.offset_dst / 2 + dst_idx] = f16(src[params.offset_src / 4 + src_idx]); } From cbf4b96924bc2f24ef37560b1561b69f91c855d7 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Sun, 6 Jul 2025 08:40:28 -0700 Subject: [PATCH 21/25] Address feedback --- docs/build.md | 2 +- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 13 +++++++------ ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl | 4 ++-- ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl | 3 ++- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/docs/build.md b/docs/build.md index d15cb19b61d37..eae142dba003a 100644 --- a/docs/build.md +++ b/docs/build.md @@ -559,7 +559,7 @@ To read documentation for how to build on Android, [click here](./android.md) ## WebGPU [In Progress] -The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. +The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The current implementation is up-to-date with Dawn commit `bed1a61`.
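+If CMake cannot locate the Dawn package automatically, passing its install prefix explicitly is one option, e.g. `-DCMAKE_PREFIX_PATH=/path/to/dawn/install` (illustrative path).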
In the llama.cpp directory, build with CMake: diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index c072f279dcb63..ac792521054b3 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -264,8 +264,8 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ uint32_t * params = (uint32_t *) ctx->cpy_params_host_buf.GetMappedRange(); uint32_t ne = (uint32_t)ggml_nelements(node); params[0] = ne; - params[1] = src_misalignment; - params[2] = dst_misalignment; + params[1] = src_misalignment/ggml_type_size(src->type); + params[2] = dst_misalignment/ggml_type_size(node->type); // Convert byte-strides to element-strides params[3] = (uint32_t)src->nb[0]/ggml_type_size(src->type); @@ -881,10 +881,11 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() { ctx.name = GGML_WEBGPU_NAME; ctx.device_count = 1; - - wgpu::InstanceDescriptor instanceDescriptor{}; - instanceDescriptor.capabilities.timedWaitAnyEnable = true; - webgpu_ctx->instance = wgpu::CreateInstance(&instanceDescriptor); + wgpu::InstanceDescriptor instance_descriptor{}; + std::vector instance_features = {wgpu::InstanceFeatureName::TimedWaitAny}; + instance_descriptor.requiredFeatures = instance_features.data(); + instance_descriptor.requiredFeatureCount = instance_features.size(); + webgpu_ctx->instance = wgpu::CreateInstance(&instance_descriptor); GGML_ASSERT(webgpu_ctx->instance != nullptr); static ggml_backend_reg reg = { diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl index 8c05312425c0b..5c2e2cfdd6d17 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl @@ -8,8 +8,8 @@ var dst: array; struct Params { ne: u32, // total number of elements - offset_src: u32, // in bytes - offset_dst: u32, // in bytes + offset_src: u32, // in elements + offset_dst: u32, // in elements // Strides (in elements) — may be permuted stride_src0: u32, diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl index 3b8b31474ca90..7a7a42f23d9ae 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl @@ -2,6 +2,7 @@ struct MulMatParams { m: u32, n: u32, k: u32, + // all strides are in elements stride_01: u32, stride_11: u32, stride_02: u32, @@ -16,7 +17,7 @@ struct MulMatParams { }; @group(0) @binding(0) var src0: array; // N rows, K columns -@group(0) @binding(1) var src1: array; // M rows, K columns +@group(0) @binding(1) var src1: array; // M rows, K columns (transposed) @group(0) @binding(2) var dst: array; // M rows, N columns @group(0) @binding(3) var params: MulMatParams; From e0d8a713bdb008cf02301e3e3b89737570380b4a Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Sun, 6 Jul 2025 11:04:10 -0700 Subject: [PATCH 22/25] Remove division by type size in cpy op --- ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl index 5c2e2cfdd6d17..6fe924c554cc3 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl @@ -56,5 +56,5 @@ fn main(@builtin(global_invocation_id) gid: vec3) { let dst_idx = i0 * params.stride_dst0 + i1 * params.stride_dst1 + i2 * params.stride_dst2 + i3 * params.stride_dst3; - dst[params.offset_dst / 2 + dst_idx] = f16(src[params.offset_src / 4 + 
src_idx]); + dst[params.offset_dst + dst_idx] = f16(src[params.offset_src + src_idx]); } From 40dd1f06b32bf6bb43ccc2f5faa007fc0391660e Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 15 Jul 2025 16:57:46 -0700 Subject: [PATCH 23/25] Fix formatting and add github action workflows for vulkan and metal (m-series) webgpu backends --- .github/workflows/build.yml | 130 ++++++++++++++++++ docs/build.md | 4 +- ggml/src/ggml-webgpu/CMakeLists.txt | 2 +- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 46 ++++--- .../ggml-webgpu/wgsl-shaders/embed_wgsl.py | 4 + .../src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl | 4 +- 6 files changed, 163 insertions(+), 27 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 42d63b7c5444c..506a1b12f4d28 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -135,6 +135,70 @@ jobs: cd build ctest -L main --verbose --timeout 900 + macOS-latest-cmake-arm64-webgpu: + runs-on: macos-14 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2.16 + with: + key: macOS-latest-cmake-arm64-webgpu + evict-old-files: 1d + + - name: Dependencies + id: depends + continue-on-error: true + run: | + brew update + brew install curl + + - name: Dawn Dependency + id: dawn-depends + run: | + ARTIFACTS_JSON=$(curl -s -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "https://api.github.com/repos/google/dawn/actions/artifacts") + echo "Finding latest macos-latest-Release artifact..." + DOWNLOAD_URL=$(echo "$ARTIFACTS_JSON" | jq -r '.artifacts + | sort_by(.created_at) + | reverse + | map(select(.name | test("macos-latest-Release$"))) + | .[0].archive_download_url') + if [ "$DOWNLOAD_URL" = "null" ] || [ -z "$DOWNLOAD_URL" ]; then + echo "No suitable Dawn artifact found!" + exit 1 + fi + echo "Downloading from: $DOWNLOAD_URL" + curl -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ + -o artifact.zip "$DOWNLOAD_URL" + unzip artifact.zip + mkdir dawn + tar_file=$(find . 
-name '*.tar.gz' | head -n 1) + echo "Extracting: $tar_file" + tar -xvf "$tar_file" -C dawn --strip-components=1 + + - name: Build + id: cmake_build + run: | + sysctl -a + export Dawn_DIR=dawn/lib64/cmake/Dawn + cmake -B build -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF + cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) + + - name: Test + id: cmake_test + run: | + cd build + ctest -L main --verbose --timeout 900 + ubuntu-cpu-cmake: strategy: matrix: @@ -344,6 +408,72 @@ jobs: # This is using llvmpipe and runs slower than other backends ctest -L main --verbose --timeout 3600 + ubuntu-22-cmake-webgpu: + runs-on: ubuntu-22.04 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2.16 + with: + key: ubuntu-22-cmake-webgpu + evict-old-files: 1d + + - name: Vulkan SDK Dependencies + id: vulkan-depends + run: | + wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add - + sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list + sudo apt-get update -y + sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev + + - name: Dawn Dependency + id: dawn-depends + run: | + sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev + ARTIFACTS_JSON=$(curl -s -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "https://api.github.com/repos/google/dawn/actions/artifacts") + echo "Finding latest ubuntu-latest-Release artifact..." + DOWNLOAD_URL=$(echo "$ARTIFACTS_JSON" | jq -r '.artifacts + | sort_by(.created_at) + | reverse + | map(select(.name | test("ubuntu-latest-Release$"))) + | .[0].archive_download_url') + if [ "$DOWNLOAD_URL" = "null" ] || [ -z "$DOWNLOAD_URL" ]; then + echo "No suitable Dawn artifact found!" + exit 1 + fi + echo "Downloading from: $DOWNLOAD_URL" + curl -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ + -o artifact.zip "$DOWNLOAD_URL" + unzip artifact.zip + mkdir dawn + tar_file=$(find . -name '*.tar.gz' | head -n 1) + echo "Extracting: $tar_file" + tar -xvf "$tar_file" -C dawn --strip-components=1 + + - name: Build + id: cmake_build + run: | + export Dawn_DIR=dawn/lib64/cmake/Dawn + cmake -B build -DGGML_WEBGPU=ON + cmake --build build --config Release -j $(nproc) + + - name: Test + id: cmake_test + run: | + cd build + # This is using llvmpipe and runs slower than other backends + ctest -L main --verbose --timeout 3600 + ubuntu-22-cmake-hip: runs-on: ubuntu-22.04 container: rocm/dev-ubuntu-22.04:6.0.2 diff --git a/docs/build.md b/docs/build.md index eae142dba003a..70767ad91c056 100644 --- a/docs/build.md +++ b/docs/build.md @@ -568,9 +568,9 @@ cmake -B build -DGGML_WEBGPU=ON cmake --build build --config Release ``` -### Browser Support +### Browser Support -WebGPU allows cross-platform access to the GPU from supported browsers. We utilize [Emscripten](https://emscripten.org/) to compile ggml's WebGPU backend to WebAssembly. Emscripten does not officially support WebGPU bindings yet, but Dawn currently maintains its own WebGPU bindings called emdawnwebgpu. +WebGPU allows cross-platform access to the GPU from supported browsers. We utilize [Emscripten](https://emscripten.org/) to compile ggml's WebGPU backend to WebAssembly. 
Emscripten does not officially support WebGPU bindings yet, but Dawn currently maintains its own WebGPU bindings called emdawnwebgpu. Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/src/emdawnwebgpu/) to download or build the emdawnwebgpu package (Note that it might be safer to build the emdawnwebgpu package locally, so that it stays in sync with the version of Dawn you have installed above). When building using CMake, the path to the emdawnwebgpu port file needs to be set with the flag `EMDAWNWEBGPU_DIR`. diff --git a/ggml/src/ggml-webgpu/CMakeLists.txt b/ggml/src/ggml-webgpu/CMakeLists.txt index 0d67f11ba5520..79ef68b85a477 100644 --- a/ggml/src/ggml-webgpu/CMakeLists.txt +++ b/ggml/src/ggml-webgpu/CMakeLists.txt @@ -51,4 +51,4 @@ if (GGML_WEBGPU_DEBUG) endif() target_include_directories(ggml-webgpu PRIVATE ${SHADER_OUTPUT_DIR}) -target_link_libraries(ggml-webgpu PRIVATE ${DawnWebGPU_TARGET}) \ No newline at end of file +target_link_libraries(ggml-webgpu PRIVATE ${DawnWebGPU_TARGET}) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index ac792521054b3..5366fd6a2ec34 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -7,7 +7,9 @@ #include "ggml-wgsl-shaders.hpp" +#include #include +#include #include #ifdef GGML_WEBGPU_DEBUG @@ -131,7 +133,7 @@ static void ggml_webgpu_create_buffer(wgpu::Device &device, wgpu::Buffer &buffer buffer_desc.size = size; buffer_desc.usage = usage; buffer_desc.label = label; - buffer_desc.mappedAtCreation = false; + buffer_desc.mappedAtCreation = false; // TODO: error handling buffer = device.CreateBuffer(&buffer_desc); } @@ -161,7 +163,7 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer b uint32_t * params = (uint32_t *) ctx->memset_params_host_buf.GetMappedRange(); params[0] = (uint32_t)offset; - params[1] = (uint32_t)size; + params[1] = (uint32_t)size; params[2] = value; ctx->memset_params_host_buf.Unmap(); @@ -184,8 +186,8 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer b wgpu::CommandEncoder encoder = device.CreateCommandEncoder(); encoder.CopyBufferToBuffer( - ctx->memset_params_host_buf, 0, - ctx->memset_params_dev_buf, 0, + ctx->memset_params_host_buf, 0, + ctx->memset_params_dev_buf, 0, ctx->memset_params_dev_buf.GetSize() ); wgpu::ComputePassEncoder pass = encoder.BeginComputePass(); @@ -206,7 +208,7 @@ static void ggml_backend_webgpu_wait_on_submission(webgpu_context ctx) { if (status != wgpu::QueueWorkDoneStatus::Success) { GGML_LOG_ERROR("ggml_webgpu: Failed to wait on queue: %s\n", message.data); } - }), + }), UINT64_MAX ); } @@ -243,7 +245,7 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ case GGML_OP_VIEW: case GGML_OP_PERMUTE: return false; - + case GGML_OP_CPY: { std::lock_guard lock(ctx->mutex); const ggml_tensor * src = node->src[0]; @@ -259,7 +261,7 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ dst_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1); wgpu::Device device = ctx->device; - ggml_backend_webgpu_map_buffer(ctx, ctx->cpy_params_host_buf, + ggml_backend_webgpu_map_buffer(ctx, ctx->cpy_params_host_buf, wgpu::MapMode::Write, 0, ctx->cpy_params_host_buf.GetSize()); uint32_t * params = (uint32_t *) ctx->cpy_params_host_buf.GetMappedRange(); uint32_t ne = (uint32_t)ggml_nelements(node); @@ -309,8 +311,8 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){
wgpu::CommandEncoder encoder = device.CreateCommandEncoder(); encoder.CopyBufferToBuffer( - ctx->cpy_params_host_buf, 0, - ctx->cpy_params_dev_buf, 0, + ctx->cpy_params_host_buf, 0, + ctx->cpy_params_dev_buf, 0, ctx->cpy_params_dev_buf.GetSize() ); wgpu::ComputePassEncoder pass = encoder.BeginComputePass(); @@ -343,7 +345,7 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ wgpu::Device device = ctx->device; // map the host parameters buffer - ggml_backend_webgpu_map_buffer(ctx, ctx->mul_mat_params_host_buf, + ggml_backend_webgpu_map_buffer(ctx, ctx->mul_mat_params_host_buf, wgpu::MapMode::Write, 0, ctx->mul_mat_params_host_buf.GetSize()); uint32_t * params = (uint32_t *) ctx->mul_mat_params_host_buf.GetMappedRange(); @@ -371,7 +373,7 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ entries[0].offset = src0_offset; entries[0].size = ggml_nbytes(src0); - entries[1].binding = 1; + entries[1].binding = 1; entries[1].buffer = src1_ctx->buffer; entries[1].offset = src1_offset; entries[1].size = ggml_nbytes(src1); @@ -395,8 +397,8 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ wgpu::CommandEncoder encoder = device.CreateCommandEncoder(); encoder.CopyBufferToBuffer( - ctx->mul_mat_params_host_buf, 0, - ctx->mul_mat_params_dev_buf, 0, + ctx->mul_mat_params_host_buf, 0, + ctx->mul_mat_params_dev_buf, 0, ctx->mul_mat_params_dev_buf.GetSize() ); wgpu::ComputePassEncoder pass = encoder.BeginComputePass(); @@ -417,7 +419,7 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ return false; } } - + static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_graph_compute(" << cgraph->n_nodes << " nodes)"); @@ -517,13 +519,13 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, std::lock_guard lock(webgpu_ctx->mutex); - if (webgpu_ctx->get_tensor_staging_buf == nullptr || + if (webgpu_ctx->get_tensor_staging_buf == nullptr || webgpu_ctx->get_tensor_staging_buf.GetSize() < final_size) { // Create a new staging buffer if it doesn't exist or is too small if (webgpu_ctx->get_tensor_staging_buf) { webgpu_ctx->get_tensor_staging_buf.Destroy(); } - ggml_webgpu_create_buffer(device, webgpu_ctx->get_tensor_staging_buf, final_size, + ggml_webgpu_create_buffer(device, webgpu_ctx->get_tensor_staging_buf, final_size, wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "get_tensor_staging_buf"); } @@ -577,7 +579,7 @@ static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_b ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); wgpu::Buffer buf; - ggml_webgpu_create_buffer(ctx->webgpu_ctx->device, buf, size, + ggml_webgpu_create_buffer(ctx->webgpu_ctx->device, buf, size, wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst, "allocated_buffer"); ggml_backend_webgpu_buffer_context * buf_ctx = new ggml_backend_webgpu_buffer_context(ctx->webgpu_ctx, buf); @@ -652,7 +654,7 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context webgpu_ctx) { constants[1].key = "bytes_per_thread"; constants[1].value = webgpu_ctx->memset_bytes_per_thread; ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->memset_pipeline, wgsl_memset, "memset", constants); - ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->memset_params_dev_buf, + ggml_webgpu_create_buffer(webgpu_ctx->device, 
webgpu_ctx->memset_params_dev_buf, 3 * sizeof(uint32_t), // 3 parameters: buffer size, offset, value wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst, "memset_params_dev_buf"); ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->memset_params_host_buf, @@ -679,7 +681,7 @@ static void ggml_webgpu_init_cpy_pipeline(webgpu_context webgpu_ctx) { wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc, "cpy_params_host_buf"); } -// TODO: Does this need to be thread safe? Is it only called once? +// TODO: Make thread safe if multiple devices are used static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) { GGML_UNUSED(params); @@ -696,7 +698,7 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co dev_desc.requiredLimits = &webgpu_ctx->limits; dev_desc.requiredFeatures = webgpu_ctx->features.features; dev_desc.requiredFeatureCount = webgpu_ctx->features.featureCount; - dev_desc.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, + dev_desc.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, [](const wgpu::Device& device, wgpu::DeviceLostReason reason, wgpu::StringView message) { GGML_UNUSED(device); GGML_LOG_ERROR("ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast(reason), message.data); @@ -847,7 +849,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t device_ctx.device_name = std::string(info.device.data); device_ctx.device_desc = std::string(info.description.data); - GGML_LOG_INFO("ggml_webgpu: adapter_info: vendor_id: %u | vendor: %s | architecture: %s | device_id: %u | name: %s | device_desc: %s\n", + GGML_LOG_INFO("ggml_webgpu: adapter_info: vendor_id: %u | vendor: %s | architecture: %s | device_id: %u | name: %s | device_desc: %s\n", info.vendorID, info.vendor.data, info.architecture.data, info.deviceID, info.device.data, info.description.data); // See GGML Backend Device Interface section @@ -902,4 +904,4 @@ ggml_backend_t ggml_backend_webgpu_init(void) { return ggml_backend_webgpu_device_init(dev, nullptr); } -GGML_BACKEND_DL_IMPL(ggml_backend_webgpu_reg) \ No newline at end of file +GGML_BACKEND_DL_IMPL(ggml_backend_webgpu_reg) diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py index daec8fe87dfda..962dcd6b170ed 100755 --- a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +++ b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py @@ -1,13 +1,16 @@ import os import argparse + def escape_triple_quotes(wgsl): # Simple defense in case of embedded """ return wgsl.replace('"""', '\\"""') + def to_cpp_string_literal(varname, content): return f'const char* wgsl_{varname} = R"({content})";\n' + def main(): parser = argparse.ArgumentParser() parser.add_argument('--input', required=True) @@ -27,5 +30,6 @@ def main(): out.write(to_cpp_string_literal(varname, content)) out.write('\n') + if __name__ == '__main__': main() diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl index 7a7a42f23d9ae..054aab566f96b 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl @@ -41,7 +41,7 @@ fn main(@builtin(global_invocation_id) global_id: vec3) { let src02_idx = dst2_idx / params.broadcast2; // src0 may also be broadcast along the second dimension let src12_idx = dst2_idx; // src1 is not broadcast - let dst2_rem = dst3_rem % dst2_stride; + let dst2_rem = dst3_rem % dst2_stride; let row = 
dst2_rem / params.n; // output row let col = dst2_rem % params.n; // output column @@ -53,4 +53,4 @@ fn main(@builtin(global_invocation_id) global_id: vec3) { sum = sum + src0[src0_idx] * src1[src1_idx]; } dst[dst3_idx * dst3_stride + dst2_idx * dst2_stride + row * params.n + col] = sum; -} \ No newline at end of file +} From 8b31513f4b9ba3010c567e844da1d52657507309 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 15 Jul 2025 16:59:10 -0700 Subject: [PATCH 24/25] Fix name --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 5366fd6a2ec34..c5abc69343357 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -846,7 +846,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t static ggml_backend_webgpu_device_context device_ctx; device_ctx.webgpu_ctx = ctx; - device_ctx.device_name = std::string(info.device.data); + device_ctx.device_name = GGML_WEBGPU_NAME; device_ctx.device_desc = std::string(info.description.data); GGML_LOG_INFO("ggml_webgpu: adapter_info: vendor_id: %u | vendor: %s | architecture: %s | device_id: %u | name: %s | device_desc: %s\n", From 22f8dd4ea5b49373a9332173c53e1a8e65f3e9a4 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 16 Jul 2025 07:44:05 -0700 Subject: [PATCH 25/25] Fix macos dawn prefix path --- .github/workflows/build.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 506a1b12f4d28..7de6eebe50d39 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -188,8 +188,7 @@ jobs: - name: Build id: cmake_build run: | - sysctl -a - export Dawn_DIR=dawn/lib64/cmake/Dawn + export CMAKE_PREFIX_PATH=dawn cmake -B build -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
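With patch 24's rename, the device registered by this backend reports `GGML_WEBGPU_NAME` ("WebGPU") as its name rather than the raw adapter string. A quick way to confirm the backend shows up after building with `-DGGML_WEBGPU=ON` is to walk ggml's backend registry; the sketch below assumes the standard public API in `ggml-backend.h` (`ggml_backend_dev_count`, `ggml_backend_dev_get`, `ggml_backend_dev_name`, `ggml_backend_dev_description`).

```cpp
#include <cstddef>
#include <cstdio>
#include "ggml-backend.h"

int main() {
    // Walk the backend registry; one device should now report the name
    // "WebGPU", with the adapter's description string next to it.
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        std::printf("device %zu: %s (%s)\n", i,
                    ggml_backend_dev_name(dev),
                    ggml_backend_dev_description(dev));
    }
    return 0;
}
```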