From 63eb6462975618ae0d47cc5d1ff4be6942f95b2b Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Mon, 19 May 2025 17:13:42 -0700 Subject: [PATCH 01/25] Minimal setup of webgpu backend with dawn. Just prints out the adapter and segfaults --- ggml/CMakeLists.txt | 3 + ggml/include/ggml-webgpu.h | 18 ++++ ggml/src/CMakeLists.txt | 1 + ggml/src/ggml-backend-reg.cpp | 7 ++ ggml/src/ggml-webgpu/CMakeLists.txt | 10 +++ ggml/src/ggml-webgpu/ggml-webgpu.cpp | 118 +++++++++++++++++++++++++++ 6 files changed, 157 insertions(+) create mode 100644 ggml/include/ggml-webgpu.h create mode 100644 ggml/src/ggml-webgpu/CMakeLists.txt create mode 100644 ggml/src/ggml-webgpu/ggml-webgpu.cpp diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 4746d5cb76c08..e8a2a8ad77c86 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -180,6 +180,8 @@ option(GGML_VULKAN_PERF "ggml: enable Vulkan perf output" option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF) option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF) option(GGML_KOMPUTE "ggml: use Kompute" OFF) +option(GGML_WEBGPU "ggml: use WebGPU" OFF) +option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF) option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT}) option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF) option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF) @@ -270,6 +272,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-rpc.h include/ggml-sycl.h include/ggml-vulkan.h + include/ggml-webgpu.h include/gguf.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") diff --git a/ggml/include/ggml-webgpu.h b/ggml/include/ggml-webgpu.h new file mode 100644 index 0000000000000..93d5329adeb9b --- /dev/null +++ b/ggml/include/ggml-webgpu.h @@ -0,0 +1,18 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_WEBGPU_NAME "WebGPU" + +GGML_BACKEND_API ggml_backend_t ggml_backend_webgpu_init(void); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_webgpu_reg(void); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index ddea5ad3891e5..0a8f4dca518e0 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -312,6 +312,7 @@ ggml_add_backend(MUSA) ggml_add_backend(RPC) ggml_add_backend(SYCL) ggml_add_backend(Vulkan) +ggml_add_backend(WebGPU) ggml_add_backend(OpenCL) foreach (target ggml-base ggml) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 405d8e31514b5..da8d70dbaf2ee 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -45,6 +45,10 @@ #include "ggml-vulkan.h" #endif +#ifdef GGML_USE_WEBGPU +#include "ggml-webgpu.h" +#endif + #ifdef GGML_USE_OPENCL #include "ggml-opencl.h" #endif @@ -172,6 +176,9 @@ struct ggml_backend_registry { #ifdef GGML_USE_VULKAN register_backend(ggml_backend_vk_reg()); #endif +#ifdef GGML_USE_WEBGPU + register_backend(ggml_backend_webgpu_reg()); +#endif #ifdef GGML_USE_OPENCL register_backend(ggml_backend_opencl_reg()); #endif diff --git a/ggml/src/ggml-webgpu/CMakeLists.txt b/ggml/src/ggml-webgpu/CMakeLists.txt new file mode 100644 index 0000000000000..c8536cc7e7845 --- /dev/null +++ b/ggml/src/ggml-webgpu/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 3.13) + +find_package(Dawn REQUIRED) + +ggml_add_backend_library(ggml-webgpu + ggml-webgpu.cpp + ../../include/ggml-webgpu.h + ) + +target_link_libraries(ggml-webgpu PRIVATE dawn::webgpu_dawn) \ No newline at end of 
file diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp new file mode 100644 index 0000000000000..c109f3889b6ee --- /dev/null +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -0,0 +1,118 @@ +#include "ggml-webgpu.h" + +#include +#include + +#include "ggml-impl.h" +#include "ggml-backend-impl.h" + +#include +#include + +static ggml_backend_i ggml_backend_webgpu_interface = { + /* .get_name = */ NULL, + /* .free = */ NULL, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ NULL, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, +}; + +static ggml_guid_t ggml_backend_webgpu_guid(void) { + static ggml_guid guid = { 0x9a, 0x5f, 0x3c, 0x2d, 0xb7, 0x1e, 0x47, 0xa1, 0x92, 0xcf, 0x16, 0x44, 0x58, 0xee, 0x90, 0x2b }; + return &guid; +} + +// necessary?? +ggml_backend_t ggml_backend_webgpu_init() { + ggml_backend_t webgpu_backend = new ggml_backend { + /* .guid = */ ggml_backend_webgpu_guid(), + /* .interface = */ ggml_backend_webgpu_interface, + /* .device = */ NULL, + /* .context = */ NULL, + }; + + return webgpu_backend; +} + +static const char * ggml_backend_webgpu_reg_get_name(ggml_backend_reg_t reg) { + GGML_UNUSED(reg); + return GGML_WEBGPU_NAME; +} + +// Stub for now +static size_t ggml_backend_webgpu_reg_get_device_count(ggml_backend_reg_t reg) { + GGML_UNUSED(reg); + return 1; +} + +// Stub for now +static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t reg, size_t device) { + static std::vector devices; + return devices[device]; + +} + +static const struct ggml_backend_reg_i ggml_backend_webgpu_reg_i = { + /* .get_name = */ ggml_backend_webgpu_reg_get_name, + /* .get_device_count = */ ggml_backend_webgpu_reg_get_device_count, + /* .get_device = */ ggml_backend_webgpu_reg_get_device, + /* .get_proc_address = */ NULL, +}; + +ggml_backend_reg_t ggml_backend_webgpu_reg() { + static ggml_backend_reg reg = { + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_webgpu_reg_i, + /* .context = */ nullptr, + }; + // need to init webgpu here + wgpu::InstanceDescriptor instanceDescriptor{}; + instanceDescriptor.capabilities.timedWaitAnyEnable = true; + wgpu::Instance instance = wgpu::CreateInstance(&instanceDescriptor); + if (instance == nullptr) { + std::cerr << "Instance creation failed!\n"; + return nullptr; + } + // Synchronously request the adapter. 
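Dawn's RequestAdapter is callback-based; what makes the call below synchronous is creating the instance with timedWaitAnyEnable set and then blocking on the returned future with WaitAny. A minimal standalone sketch of the same pattern, assuming only Dawn's webgpu_cpp.h wrapper (the same calls this patch uses):

    wgpu::InstanceDescriptor desc{};
    desc.capabilities.timedWaitAnyEnable = true;   // required to pass a timeout to WaitAny
    wgpu::Instance inst = wgpu::CreateInstance(&desc);

    wgpu::RequestAdapterOptions options = {};
    wgpu::Adapter result;
    auto future = inst.RequestAdapter(&options, wgpu::CallbackMode::WaitAnyOnly,
        [](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message, void * userdata) {
            if (status == wgpu::RequestAdapterStatus::Success) {
                *static_cast<wgpu::Adapter *>(userdata) = adapter;
            }
        },
        static_cast<void *>(&result));
    inst.WaitAny(future, UINT64_MAX);              // returns once the callback has run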
+ wgpu::RequestAdapterOptions options = {}; + wgpu::Adapter adapter; + + auto callback = [](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char *message, void *userdata) { + if (status != wgpu::RequestAdapterStatus::Success) { + std::cerr << "Failed to get an adapter:" << message; + return; + } + *static_cast(userdata) = adapter; + }; + + + auto callbackMode = wgpu::CallbackMode::WaitAnyOnly; + void *userdata = &adapter; + instance.WaitAny(instance.RequestAdapter(&options, callbackMode, callback, userdata), UINT64_MAX); + if (adapter == nullptr) { + std::cerr << "RequestAdapter failed!\n"; + return nullptr; + } + + wgpu::DawnAdapterPropertiesPowerPreference power_props{}; + + wgpu::AdapterInfo info{}; + info.nextInChain = &power_props; + + adapter.GetInfo(&info); + std::cout << "VendorID: " << std::hex << info.vendorID << std::dec << "\n"; + std::cout << "Vendor: " << info.vendor << "\n"; + std::cout << "Architecture: " << info.architecture << "\n"; + std::cout << "DeviceID: " << std::hex << info.deviceID << std::dec << "\n"; + std::cout << "Name: " << info.device << "\n"; + std::cout << "Driver description: " << info.description << "\n"; + return ® +} From c0a810e816b79ac25d28583547e0a436dc3fbc19 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 20 May 2025 12:18:10 -0700 Subject: [PATCH 02/25] Initialize webgpu device --- ggml/src/ggml-webgpu/CMakeLists.txt | 8 +- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 119 ++++++++++++++++++--------- 2 files changed, 86 insertions(+), 41 deletions(-) diff --git a/ggml/src/ggml-webgpu/CMakeLists.txt b/ggml/src/ggml-webgpu/CMakeLists.txt index c8536cc7e7845..c72b17c82a828 100644 --- a/ggml/src/ggml-webgpu/CMakeLists.txt +++ b/ggml/src/ggml-webgpu/CMakeLists.txt @@ -7,4 +7,10 @@ ggml_add_backend_library(ggml-webgpu ../../include/ggml-webgpu.h ) -target_link_libraries(ggml-webgpu PRIVATE dawn::webgpu_dawn) \ No newline at end of file +if (GGML_WEBGPU_DEBUG) + message(STATUS "GGML_WEBGPU_DEBUG is ON") + target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_DEBUG=1) +endif() + +target_link_libraries(ggml-webgpu PRIVATE dawn::webgpu_dawn) + diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index c109f3889b6ee..f340f56940af2 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -9,6 +9,84 @@ #include #include +#ifdef GGML_WEBGPU_DEBUG +#define WEBGPU_LOG_DEBUG(msg) std::cout << msg << std::endl +#else +#define WEBGPU_LOG_DEBUG(msg) ((void) 0) +#endif // GGML_WEBGPU_DEBUG + + +struct webgpu_context { + wgpu::Instance instance; + // an adapter can only be used to create one device + wgpu::Adapter adapter; + // we only support one device for now + wgpu::Device device; +}; + +static bool webgpu_context_initialized = false; +static webgpu_context webgpu_ctx; + +static void ggml_webgpu_context_init() { + if (webgpu_context_initialized) { + return; + } + WEBGPU_LOG_DEBUG("ggml_webgpu_context_init()"); + + wgpu::InstanceDescriptor instanceDescriptor{}; + instanceDescriptor.capabilities.timedWaitAnyEnable = true; + webgpu_ctx.instance = wgpu::CreateInstance(&instanceDescriptor); + GGML_ASSERT(webgpu_ctx.instance != nullptr); + + wgpu::RequestAdapterOptions options = {}; + wgpu::Adapter adapter; + + auto callback = [](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char *message, void *userdata) { + if (status != wgpu::RequestAdapterStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message); + return; + } + 
*static_cast(userdata) = adapter; + }; + + auto callbackMode = wgpu::CallbackMode::WaitAnyOnly; + void *userdata = &webgpu_ctx.adapter; + webgpu_ctx.instance.WaitAny(webgpu_ctx.instance.RequestAdapter(&options, callbackMode, callback, userdata), UINT64_MAX); + GGML_ASSERT(webgpu_ctx.adapter != nullptr); + + wgpu::DeviceDescriptor deviceDescriptor; + deviceDescriptor.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, + [](const wgpu::Device& device, wgpu::DeviceLostReason reason, wgpu::StringView message) { + GGML_UNUSED(device); + GGML_LOG_ERROR("ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast(reason), message.data); + }); + deviceDescriptor.SetUncapturedErrorCallback( + [](const wgpu::Device& device, wgpu::ErrorType reason, wgpu::StringView message) { + GGML_UNUSED(device); + GGML_LOG_ERROR("ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast(reason), message.data); + }); + webgpu_ctx.instance.WaitAny(webgpu_ctx.adapter.RequestDevice(&deviceDescriptor, callbackMode, + [](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) { + if (status != wgpu::RequestDeviceStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", message.data); + return; + } + webgpu_ctx.device = std::move(device); + }), + UINT64_MAX + ); + GGML_ASSERT(webgpu_ctx.device != nullptr); + + wgpu::DawnAdapterPropertiesPowerPreference power_props{}; + wgpu::AdapterInfo info{}; + info.nextInChain = &power_props; + webgpu_ctx.adapter.GetInfo(&info); + GGML_LOG_INFO("ggml_webgpu: adapter_info: vendor_id: %u | vendor: %s | architecture: %s | device_id: %u | name: %s | device_desc: %s\n", + info.vendorID, info.vendor.data, info.architecture.data, info.deviceID, info.device.data, info.description.data); + + webgpu_context_initialized = true; +} + static ggml_backend_i ggml_backend_webgpu_interface = { /* .get_name = */ NULL, /* .free = */ NULL, @@ -74,45 +152,6 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() { /* .context = */ nullptr, }; // need to init webgpu here - wgpu::InstanceDescriptor instanceDescriptor{}; - instanceDescriptor.capabilities.timedWaitAnyEnable = true; - wgpu::Instance instance = wgpu::CreateInstance(&instanceDescriptor); - if (instance == nullptr) { - std::cerr << "Instance creation failed!\n"; - return nullptr; - } - // Synchronously request the adapter. 
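One caveat about the logging in the callbacks added above: wgpu::StringView carries an explicit length and is not guaranteed to be NUL-terminated, so passing message.data straight to a %s format specifier can read past the end of the string. A defensive helper, shown only as an illustration (this patch does not add it):

    #include <string>

    // copy exactly sv.length bytes so the result is safely NUL-terminated
    static std::string ggml_webgpu_string(wgpu::StringView sv) {
        return std::string(sv.data, sv.length);
    }

Logging would then pass ggml_webgpu_string(message).c_str() instead of message.data.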
- wgpu::RequestAdapterOptions options = {}; - wgpu::Adapter adapter; - - auto callback = [](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char *message, void *userdata) { - if (status != wgpu::RequestAdapterStatus::Success) { - std::cerr << "Failed to get an adapter:" << message; - return; - } - *static_cast(userdata) = adapter; - }; - - - auto callbackMode = wgpu::CallbackMode::WaitAnyOnly; - void *userdata = &adapter; - instance.WaitAny(instance.RequestAdapter(&options, callbackMode, callback, userdata), UINT64_MAX); - if (adapter == nullptr) { - std::cerr << "RequestAdapter failed!\n"; - return nullptr; - } - - wgpu::DawnAdapterPropertiesPowerPreference power_props{}; - - wgpu::AdapterInfo info{}; - info.nextInChain = &power_props; - - adapter.GetInfo(&info); - std::cout << "VendorID: " << std::hex << info.vendorID << std::dec << "\n"; - std::cout << "Vendor: " << info.vendor << "\n"; - std::cout << "Architecture: " << info.architecture << "\n"; - std::cout << "DeviceID: " << std::hex << info.deviceID << std::dec << "\n"; - std::cout << "Name: " << info.device << "\n"; - std::cout << "Driver description: " << info.description << "\n"; + ggml_webgpu_context_init(); return ® } From e50335c4c5a718b802545c557b0c1d9ac3977b22 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 20 May 2025 18:03:45 -0700 Subject: [PATCH 03/25] Making progress on setting up the backend --- ggml/include/ggml-webgpu.h | 2 - ggml/src/ggml-webgpu/CMakeLists.txt | 1 - ggml/src/ggml-webgpu/ggml-webgpu.cpp | 252 ++++++++++++++++++--------- 3 files changed, 172 insertions(+), 83 deletions(-) diff --git a/ggml/include/ggml-webgpu.h b/ggml/include/ggml-webgpu.h index 93d5329adeb9b..cf6cb451648e9 100644 --- a/ggml/include/ggml-webgpu.h +++ b/ggml/include/ggml-webgpu.h @@ -9,8 +9,6 @@ extern "C" { #define GGML_WEBGPU_NAME "WebGPU" -GGML_BACKEND_API ggml_backend_t ggml_backend_webgpu_init(void); - GGML_BACKEND_API ggml_backend_reg_t ggml_backend_webgpu_reg(void); #ifdef __cplusplus diff --git a/ggml/src/ggml-webgpu/CMakeLists.txt b/ggml/src/ggml-webgpu/CMakeLists.txt index c72b17c82a828..27d4472ea65fd 100644 --- a/ggml/src/ggml-webgpu/CMakeLists.txt +++ b/ggml/src/ggml-webgpu/CMakeLists.txt @@ -8,7 +8,6 @@ ggml_add_backend_library(ggml-webgpu ) if (GGML_WEBGPU_DEBUG) - message(STATUS "GGML_WEBGPU_DEBUG is ON") target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_DEBUG=1) endif() diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index f340f56940af2..3518dff0ca6a9 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -15,32 +15,159 @@ #define WEBGPU_LOG_DEBUG(msg) ((void) 0) #endif // GGML_WEBGPU_DEBUG +// TODO: find a better way to get the memory available +#define WEBGPU_MAX_BUFFERS 32 -struct webgpu_context { +// When registering the backend, we initialize the WebGPU instance. +struct webgpu_reg_context { wgpu::Instance instance; - // an adapter can only be used to create one device + size_t device_count; + const char * name; +}; + +// When getting the (ggml) device, we create a WebGPU adapter and its associated WebGPU device. 
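A lifetime note on the struct introduced just below: wgpu::StringView does not own its bytes, it borrows them from whatever produced it (here, the wgpu::AdapterInfo queried at registration time), so storing views in a long-lived context is fragile. A later patch in this series switches these fields to std::string; the owning shape looks roughly like this (hypothetical name, shown for contrast):

    struct webgpu_device_context_owning {
        wgpu::Adapter adapter;
        wgpu::Device  device;
        std::string   device_name;   // owns its bytes, valid after AdapterInfo is gone
        std::string   device_desc;
    };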
+struct webgpu_device_context { + // An adapter can only be used to create one device wgpu::Adapter adapter; - // we only support one device for now wgpu::Device device; + wgpu::StringView device_name; + wgpu::StringView device_desc; }; -static bool webgpu_context_initialized = false; -static webgpu_context webgpu_ctx; +struct webgpu_backend_context { + wgpu::Device device; +}; -static void ggml_webgpu_context_init() { - if (webgpu_context_initialized) { - return; - } - WEBGPU_LOG_DEBUG("ggml_webgpu_context_init()"); +static ggml_backend_i ggml_backend_webgpu_i = { + /* .get_name = */ NULL, + /* .free = */ NULL, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ NULL, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, +}; - wgpu::InstanceDescriptor instanceDescriptor{}; - instanceDescriptor.capabilities.timedWaitAnyEnable = true; - webgpu_ctx.instance = wgpu::CreateInstance(&instanceDescriptor); - GGML_ASSERT(webgpu_ctx.instance != nullptr); +static ggml_guid_t ggml_backend_webgpu_guid(void) { + static const char * guid_str = "__ggml_webgpu :)"; + return reinterpret_cast((void *)guid_str); +} - wgpu::RequestAdapterOptions options = {}; - wgpu::Adapter adapter; +static const char * ggml_backend_webgpu_device_get_name(ggml_backend_dev_t dev) { + webgpu_device_context * ctx = static_cast(dev->context); + return ctx->device_name.data; +} + +static const char * ggml_backend_webgpu_device_get_description(ggml_backend_dev_t dev) { + webgpu_device_context * ctx = static_cast(dev->context); + return ctx->device_desc.data; +} + +static void ggml_backend_webgpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + webgpu_device_context * ctx = static_cast(dev->context); + wgpu::Limits limits; + ctx->device.GetLimits(&limits); + // TODO: what do we actually want to return here? + *free = limits.maxBufferSize * WEBGPU_MAX_BUFFERS; + *total = limits.maxBufferSize * WEBGPU_MAX_BUFFERS; +} + +static enum ggml_backend_dev_type ggml_backend_webgpu_device_get_type(ggml_backend_dev_t dev) { + GGML_UNUSED(dev); + return GGML_BACKEND_DEVICE_TYPE_GPU; +} + +static void ggml_backend_webgpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + props->name = ggml_backend_webgpu_device_get_name(dev); + props->description = ggml_backend_webgpu_device_get_description(dev); + props->type = ggml_backend_webgpu_device_get_type(dev); + ggml_backend_webgpu_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* .async = */ false, + /* .host_buffer = */ true, // maybe? not sure what this means yet + /* .buffer_from_host_ptr = */ false, + /* .events = */ false, + }; +} + +// TODO: Does this need to be thread safe? Is it only called once? 
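On the thread-safety TODO just above: ggml does not promise these entry points are called exactly once, so if it ever matters, the portable answer is to guard the one-time setup explicitly. A minimal sketch, an assumption rather than something this patch adds:

    #include <mutex>

    static std::once_flag webgpu_init_flag;

    // run the one-time adapter/device setup exactly once, even with concurrent callers
    static void ggml_webgpu_ensure_initialized() {
        std::call_once(webgpu_init_flag, []() {
            // ... setup performed by the functions below ...
        });
    }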
+static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) { + GGML_UNUSED(params); + + webgpu_device_context * dev_ctx = static_cast(dev->context); + static webgpu_backend_context backend_ctx; + backend_ctx.device = dev_ctx->device; + + static ggml_backend backend = { + /* .guid = */ ggml_backend_webgpu_guid(), + /* .interface = */ ggml_backend_webgpu_i, + /* .device = */ dev, + /* .context = */ &backend_ctx, + }; + return &backend; +} + +static ggml_backend_buffer_type_t ggml_backend_webgpu_device_get_buffer_type(ggml_backend_dev_t dev) { + static struct ggml_backend_buffer_type ggml_backend_buffer_type_webgpu = { + /* .iface = */ { + /* .get_name = */ NULL, + /* .alloc_buffer = */ NULL, + /* .get_alignment = */ NULL, + /* .get_max_size = */ NULL, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .is_host = */ NULL, + }, + /* .device = */ dev, + /* .context = */ NULL, + }; + return &ggml_backend_buffer_type_webgpu; +} + +static struct ggml_backend_device_i ggml_backend_webgpu_device_i = { + /* .get_name = */ ggml_backend_webgpu_device_get_name, + /* .get_description = */ ggml_backend_webgpu_device_get_description, + /* .get_memory = */ ggml_backend_webgpu_device_get_memory, + /* .get_type = */ ggml_backend_webgpu_device_get_type, + /* .get_props = */ ggml_backend_webgpu_device_get_props, + /* .init_backend = */ ggml_backend_webgpu_device_init, + /* .get_buffer_type = */ ggml_backend_webgpu_device_get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ NULL, + /* .supports_op = */ NULL, + /* .supports_buft = */ NULL, + /* .offload_op = */ NULL, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, +}; + +static const char * ggml_backend_webgpu_reg_get_name(ggml_backend_reg_t reg) { + webgpu_reg_context * ctx = static_cast(reg->context); + return ctx->name; +} + +static size_t ggml_backend_webgpu_reg_get_device_count(ggml_backend_reg_t reg) { + webgpu_reg_context * ctx = static_cast(reg->context); + return ctx->device_count; +} + +// TODO: Does this need to be thread safe? Is it only called once? 
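An aside on the GUID defined earlier in this hunk: ggml_guid is a plain 16-byte blob that ggml compares with memcmp (via ggml_guid_matches), so any stable 16-byte buffer works, and the string trick relies on "__ggml_webgpu :)" being exactly 16 characters long (the NUL terminator is unused). A sketch of how a caller would check it, assuming the usual ggml_guid_matches helper:

    static bool is_webgpu_backend(ggml_backend_t backend) {
        // memcmp of 16 bytes under the hood
        return ggml_guid_matches(backend->guid, ggml_backend_webgpu_guid());
    }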
+// Only one device is supported for now +static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t reg, size_t index) { + GGML_ASSERT(index == 0); + WEBGPU_LOG_DEBUG("ggml_backend_reg_get_device()"); + + webgpu_reg_context * reg_ctx = static_cast(reg->context); + static webgpu_device_context device_ctx; + + wgpu::RequestAdapterOptions options = {}; auto callback = [](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char *message, void *userdata) { if (status != wgpu::RequestAdapterStatus::Success) { GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message); @@ -48,11 +175,10 @@ static void ggml_webgpu_context_init() { } *static_cast(userdata) = adapter; }; - auto callbackMode = wgpu::CallbackMode::WaitAnyOnly; - void *userdata = &webgpu_ctx.adapter; - webgpu_ctx.instance.WaitAny(webgpu_ctx.instance.RequestAdapter(&options, callbackMode, callback, userdata), UINT64_MAX); - GGML_ASSERT(webgpu_ctx.adapter != nullptr); + void *userdata = &device_ctx.adapter; + reg_ctx->instance.WaitAny(reg_ctx->instance.RequestAdapter(&options, callbackMode, callback, userdata), UINT64_MAX); + GGML_ASSERT(device_ctx.adapter != nullptr); wgpu::DeviceDescriptor deviceDescriptor; deviceDescriptor.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, @@ -65,77 +191,31 @@ static void ggml_webgpu_context_init() { GGML_UNUSED(device); GGML_LOG_ERROR("ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast(reason), message.data); }); - webgpu_ctx.instance.WaitAny(webgpu_ctx.adapter.RequestDevice(&deviceDescriptor, callbackMode, + reg_ctx->instance.WaitAny(device_ctx.adapter.RequestDevice(&deviceDescriptor, callbackMode, [](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) { if (status != wgpu::RequestDeviceStatus::Success) { GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", message.data); return; } - webgpu_ctx.device = std::move(device); + device_ctx.device = std::move(device); }), UINT64_MAX ); - GGML_ASSERT(webgpu_ctx.device != nullptr); + GGML_ASSERT(device_ctx.device != nullptr); - wgpu::DawnAdapterPropertiesPowerPreference power_props{}; wgpu::AdapterInfo info{}; - info.nextInChain = &power_props; - webgpu_ctx.adapter.GetInfo(&info); + device_ctx.adapter.GetInfo(&info); + device_ctx.device_name = info.device; + device_ctx.device_desc = info.description; GGML_LOG_INFO("ggml_webgpu: adapter_info: vendor_id: %u | vendor: %s | architecture: %s | device_id: %u | name: %s | device_desc: %s\n", info.vendorID, info.vendor.data, info.architecture.data, info.deviceID, info.device.data, info.description.data); - webgpu_context_initialized = true; -} - -static ggml_backend_i ggml_backend_webgpu_interface = { - /* .get_name = */ NULL, - /* .free = */ NULL, - /* .set_tensor_async = */ NULL, - /* .get_tensor_async = */ NULL, - /* .cpy_tensor_async = */ NULL, - /* .synchronize = */ NULL, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, - /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, - /* .graph_compute = */ NULL, - /* .event_record = */ NULL, - /* .event_wait = */ NULL, -}; - -static ggml_guid_t ggml_backend_webgpu_guid(void) { - static ggml_guid guid = { 0x9a, 0x5f, 0x3c, 0x2d, 0xb7, 0x1e, 0x47, 0xa1, 0x92, 0xcf, 0x16, 0x44, 0x58, 0xee, 0x90, 0x2b }; - return &guid; -} - -// necessary?? 
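The deletion that follows resolves the "// necessary??" question from patch 01: with a registry in place, callers are expected to discover the backend through it rather than through a bespoke init function. A hypothetical smoke test of that path (not part of the patch; assumes the public ggml-backend registry API):

    ggml_backend_reg_t reg = ggml_backend_webgpu_reg();
    GGML_ASSERT(ggml_backend_reg_dev_count(reg) == 1);
    ggml_backend_dev_t dev     = ggml_backend_reg_dev_get(reg, 0);
    ggml_backend_t     backend = ggml_backend_dev_init(dev, /* params = */ nullptr);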
-ggml_backend_t ggml_backend_webgpu_init() { - ggml_backend_t webgpu_backend = new ggml_backend { - /* .guid = */ ggml_backend_webgpu_guid(), - /* .interface = */ ggml_backend_webgpu_interface, - /* .device = */ NULL, - /* .context = */ NULL, + static ggml_backend_device device = { + /* .iface = */ ggml_backend_webgpu_device_i, + /* .reg = */ reg, + /* .context = */ &device_ctx, }; - - return webgpu_backend; -} - -static const char * ggml_backend_webgpu_reg_get_name(ggml_backend_reg_t reg) { - GGML_UNUSED(reg); - return GGML_WEBGPU_NAME; -} - -// Stub for now -static size_t ggml_backend_webgpu_reg_get_device_count(ggml_backend_reg_t reg) { - GGML_UNUSED(reg); - return 1; -} - -// Stub for now -static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t reg, size_t device) { - static std::vector devices; - return devices[device]; - + return &device; } static const struct ggml_backend_reg_i ggml_backend_webgpu_reg_i = { @@ -145,13 +225,25 @@ static const struct ggml_backend_reg_i ggml_backend_webgpu_reg_i = { /* .get_proc_address = */ NULL, }; +// TODO: Does this need to be thread safe? Is it only called once? ggml_backend_reg_t ggml_backend_webgpu_reg() { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_reg()"); + + static webgpu_reg_context ctx; + ctx.name = GGML_WEBGPU_NAME; + ctx.device_count = 1; + + wgpu::InstanceDescriptor instanceDescriptor{}; + instanceDescriptor.capabilities.timedWaitAnyEnable = true; + ctx.instance = wgpu::CreateInstance(&instanceDescriptor); + GGML_ASSERT(ctx.instance != nullptr); + static ggml_backend_reg reg = { /* .api_version = */ GGML_BACKEND_API_VERSION, /* .iface = */ ggml_backend_webgpu_reg_i, - /* .context = */ nullptr, + /* .context = */ &ctx, }; - // need to init webgpu here - ggml_webgpu_context_init(); return ® } + +GGML_BACKEND_DL_IMPL(ggml_backend_webgpu_reg) \ No newline at end of file From b17b1645c9de7af1cc7abccf0afac707d2ef6ed4 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 21 May 2025 10:47:44 -0700 Subject: [PATCH 04/25] Finish more boilerplate/utility functions --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 159 ++++++++++++++++++++------- 1 file changed, 120 insertions(+), 39 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 3518dff0ca6a9..aa56e26a4ae2d 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -19,14 +19,14 @@ #define WEBGPU_MAX_BUFFERS 32 // When registering the backend, we initialize the WebGPU instance. -struct webgpu_reg_context { +struct ggml_backend_webgpu_reg_context { wgpu::Instance instance; size_t device_count; const char * name; }; // When getting the (ggml) device, we create a WebGPU adapter and its associated WebGPU device. 
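Stepping back briefly: the GGML_BACKEND_DL_IMPL(ggml_backend_webgpu_reg) line that closed the previous patch is what lets this backend be built as a dynamically loadable module. Based on how the other ggml backends use the macro, it expands to roughly the following exported entry point, which ggml_backend_load() resolves at runtime (a sketch, not the verbatim expansion):

    extern "C" GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void) {
        return ggml_backend_webgpu_reg();
    }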
-struct webgpu_device_context { +struct ggml_backend_webgpu_device_context { // An adapter can only be used to create one device wgpu::Adapter adapter; wgpu::Device device; @@ -34,43 +34,28 @@ struct webgpu_device_context { wgpu::StringView device_desc; }; -struct webgpu_backend_context { +struct ggml_backend_webgpu_context { + std::string name; wgpu::Device device; }; -static ggml_backend_i ggml_backend_webgpu_i = { - /* .get_name = */ NULL, - /* .free = */ NULL, - /* .set_tensor_async = */ NULL, - /* .get_tensor_async = */ NULL, - /* .cpy_tensor_async = */ NULL, - /* .synchronize = */ NULL, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, - /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, - /* .graph_compute = */ NULL, - /* .event_record = */ NULL, - /* .event_wait = */ NULL, -}; - static ggml_guid_t ggml_backend_webgpu_guid(void) { static const char * guid_str = "__ggml_webgpu :)"; return reinterpret_cast((void *)guid_str); } static const char * ggml_backend_webgpu_device_get_name(ggml_backend_dev_t dev) { - webgpu_device_context * ctx = static_cast(dev->context); + ggml_backend_webgpu_device_context * ctx = static_cast(dev->context); return ctx->device_name.data; } static const char * ggml_backend_webgpu_device_get_description(ggml_backend_dev_t dev) { - webgpu_device_context * ctx = static_cast(dev->context); + ggml_backend_webgpu_device_context * ctx = static_cast(dev->context); return ctx->device_desc.data; } static void ggml_backend_webgpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { - webgpu_device_context * ctx = static_cast(dev->context); + ggml_backend_webgpu_device_context * ctx = static_cast(dev->context); wgpu::Limits limits; ctx->device.GetLimits(&limits); // TODO: what do we actually want to return here? @@ -96,13 +81,44 @@ static void ggml_backend_webgpu_device_get_props(ggml_backend_dev_t dev, struct }; } +static const char * ggml_backend_webgpu_name(ggml_backend_t backend) { + ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *)backend->context; + return ctx->name.c_str(); +} + +static void ggml_backend_webgpu_free(ggml_backend_t backend) { + ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *)backend->context; + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_free(" << ctx->name << ")"); + + // TODO: cleanup +} + +static ggml_backend_i ggml_backend_webgpu_i = { + /* .get_name = */ ggml_backend_webgpu_name, + /* .free = */ ggml_backend_webgpu_free, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, // TODO + /* .graph_compute = */ NULL, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, +}; + // TODO: Does this need to be thread safe? Is it only called once? 
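On the TODO above: C++ guarantees thread-safe initialization of the function-local statics used below, but the per-call assignments into backend_ctx are unguarded, and, more importantly, every call returns the very same handle. That is consistent with the one-device assumption, and easy to verify:

    ggml_backend_t a = ggml_backend_dev_init(dev, nullptr);
    ggml_backend_t b = ggml_backend_dev_init(dev, nullptr);
    GGML_ASSERT(a == b);   // one device, one backend instance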
static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) { GGML_UNUSED(params); - webgpu_device_context * dev_ctx = static_cast(dev->context); + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_device_init()"); - static webgpu_backend_context backend_ctx; + ggml_backend_webgpu_device_context * dev_ctx = static_cast(dev->context); + + static ggml_backend_webgpu_context backend_ctx; + backend_ctx.name = GGML_WEBGPU_NAME + std::string(": ") + std::string(dev_ctx->device_name.data); backend_ctx.device = dev_ctx->device; static ggml_backend backend = { @@ -114,20 +130,85 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co return &backend; } +static const char * ggml_backend_webgpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); + return ctx->device_name.data; +} + +static size_t ggml_backend_webgpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); + wgpu::Limits limits; + ctx->device.GetLimits(&limits); + return limits.minStorageBufferOffsetAlignment; +} + +static size_t ggml_backend_webgpu_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); + wgpu::Limits limits; + ctx->device.GetLimits(&limits); + return limits.maxBufferSize; +} + static ggml_backend_buffer_type_t ggml_backend_webgpu_device_get_buffer_type(ggml_backend_dev_t dev) { - static struct ggml_backend_buffer_type ggml_backend_buffer_type_webgpu = { + static struct ggml_backend_buffer_type ggml_backend_webgpu_buffer_type = { /* .iface = */ { - /* .get_name = */ NULL, - /* .alloc_buffer = */ NULL, - /* .get_alignment = */ NULL, - /* .get_max_size = */ NULL, + /* .get_name = */ ggml_backend_webgpu_buffer_type_get_name, + /* .alloc_buffer = */ NULL, // TODO + /* .get_alignment = */ ggml_backend_webgpu_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_webgpu_buffer_type_get_max_size, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .is_host = */ NULL, + /* .is_host = */ NULL, // defaults to false }, /* .device = */ dev, /* .context = */ NULL, }; - return &ggml_backend_buffer_type_webgpu; + + return &ggml_backend_webgpu_buffer_type; +} + +static const char * ggml_backend_webgpu_host_buffer_type_name(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return GGML_WEBGPU_NAME "_Host"; +} + +// WebGPU doesn't specify a memory map alignment like Vulkan, so we use the same value as the storage buffer alignment +static size_t ggml_backend_webgpu_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); + wgpu::Limits limits; + ctx->device.GetLimits(&limits); + return limits.minStorageBufferOffsetAlignment; +} + +static ggml_backend_buffer_type_t ggml_backend_webgpu_device_get_host_buffer_type(ggml_backend_dev_t dev) { + static struct ggml_backend_buffer_type ggml_backend_webgpu_buffer_type_host = { + /* .iface = */ { + /* .get_name = */ ggml_backend_webgpu_host_buffer_type_name, + /* .alloc_buffer = */ NULL, // TODO + /* .get_alignment = */ ggml_backend_webgpu_host_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, + /* 
.device = */ dev, + /* .context = */ NULL, + }; + + return &ggml_backend_webgpu_buffer_type_host; +} + +static bool ggml_backend_webgpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + GGML_UNUSED(dev); + return buft->iface.get_name == ggml_backend_webgpu_buffer_type_get_name; +} + +static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + GGML_UNUSED(dev); + + // what should we support first? + switch (op->op) { + default: + return false; + } } static struct ggml_backend_device_i ggml_backend_webgpu_device_i = { @@ -138,10 +219,10 @@ static struct ggml_backend_device_i ggml_backend_webgpu_device_i = { /* .get_props = */ ggml_backend_webgpu_device_get_props, /* .init_backend = */ ggml_backend_webgpu_device_init, /* .get_buffer_type = */ ggml_backend_webgpu_device_get_buffer_type, - /* .get_host_buffer_type = */ NULL, + /* .get_host_buffer_type = */ ggml_backend_webgpu_device_get_host_buffer_type, /* .buffer_from_host_ptr = */ NULL, - /* .supports_op = */ NULL, - /* .supports_buft = */ NULL, + /* .supports_op = */ ggml_backend_webgpu_device_supports_op, + /* .supports_buft = */ ggml_backend_webgpu_device_supports_buft, /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, @@ -149,12 +230,12 @@ static struct ggml_backend_device_i ggml_backend_webgpu_device_i = { }; static const char * ggml_backend_webgpu_reg_get_name(ggml_backend_reg_t reg) { - webgpu_reg_context * ctx = static_cast(reg->context); + ggml_backend_webgpu_reg_context * ctx = static_cast(reg->context); return ctx->name; } static size_t ggml_backend_webgpu_reg_get_device_count(ggml_backend_reg_t reg) { - webgpu_reg_context * ctx = static_cast(reg->context); + ggml_backend_webgpu_reg_context * ctx = static_cast(reg->context); return ctx->device_count; } @@ -164,8 +245,8 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t GGML_ASSERT(index == 0); WEBGPU_LOG_DEBUG("ggml_backend_reg_get_device()"); - webgpu_reg_context * reg_ctx = static_cast(reg->context); - static webgpu_device_context device_ctx; + ggml_backend_webgpu_reg_context * reg_ctx = static_cast(reg->context); + static ggml_backend_webgpu_device_context device_ctx; wgpu::RequestAdapterOptions options = {}; auto callback = [](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char *message, void *userdata) { @@ -229,7 +310,7 @@ static const struct ggml_backend_reg_i ggml_backend_webgpu_reg_i = { ggml_backend_reg_t ggml_backend_webgpu_reg() { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_reg()"); - static webgpu_reg_context ctx; + static ggml_backend_webgpu_reg_context ctx; ctx.name = GGML_WEBGPU_NAME; ctx.device_count = 1; From e7071d14f381a172a57eab1f02c455d0c3694cbd Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 21 May 2025 12:13:03 -0700 Subject: [PATCH 05/25] Organize file and work on alloc buffer --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 242 ++++++++++++++++++++------- 1 file changed, 181 insertions(+), 61 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index aa56e26a4ae2d..e675a115fadb8 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -18,6 +18,11 @@ // TODO: find a better way to get the memory available #define WEBGPU_MAX_BUFFERS 32 +// TODO: copied from Vulkan for now, not sure what it's used for +static void * const webgpu_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT + +/* Struct definitions */ + // When registering 
the backend, we initialize the WebGPU instance. struct ggml_backend_webgpu_reg_context { wgpu::Instance instance; @@ -39,11 +44,169 @@ struct ggml_backend_webgpu_context { wgpu::Device device; }; -static ggml_guid_t ggml_backend_webgpu_guid(void) { - static const char * guid_str = "__ggml_webgpu :)"; - return reinterpret_cast((void *)guid_str); +struct ggml_backend_webgpu_buffer_context { + wgpu::Buffer buffer; + + ggml_backend_webgpu_buffer_context(wgpu::Buffer buf) : + buffer(buf) { + } +}; + +/* End struct definitions */ + +/** GGML Backend Interface */ + +static const char * ggml_backend_webgpu_name(ggml_backend_t backend) { + ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *)backend->context; + return ctx->name.c_str(); +} + +static void ggml_backend_webgpu_free(ggml_backend_t backend) { + ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *)backend->context; + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_free(" << ctx->name << ")"); + + // TODO: cleanup +} + +static ggml_backend_i ggml_backend_webgpu_i = { + /* .get_name = */ ggml_backend_webgpu_name, + /* .free = */ ggml_backend_webgpu_free, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, // TODO + /* .graph_compute = */ NULL, + /* .event_record = */ NULL, + /* .event_wait = */ NULL, +}; + +/* End GGML Backend Interface */ + +/* GGML Backend Buffer Interface */ + +// TODO +static void ggml_backend_webgpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { + GGML_UNUSED(buffer); +} + +// TODO: what to return here? +static void * ggml_backend_webgpu_buffer_get_base(ggml_backend_buffer_t buffer) { + GGML_UNUSED(buffer); + return webgpu_ptr_base; +} + +// TODO +static enum ggml_status ggml_backend_webgpu_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")"); + return GGML_STATUS_SUCCESS; +} + +// TODO +static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor(" << buffer << ", " << tensor << ", " << value << ", " << offset << ", " << size << ")"); +} + +// TODO +static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); +} + +// TODO +static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + WEBGPU_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); +} + +// TODO +static bool ggml_backend_webgpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { + GGML_UNUSED(buffer); + GGML_UNUSED(src); + GGML_UNUSED(dst); + + return true; +} + +// TODO +static void ggml_backend_webgpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + GGML_UNUSED(buffer); + GGML_UNUSED(value); +} + +static ggml_backend_buffer_i ggml_backend_webgpu_buffer_interface = { + /* 
.free_buffer = */ ggml_backend_webgpu_buffer_free_buffer, + /* .get_base = */ ggml_backend_webgpu_buffer_get_base, + /* .init_tensor = */ ggml_backend_webgpu_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_webgpu_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_webgpu_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_webgpu_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_webgpu_buffer_cpy_tensor, + /* .clear = */ ggml_backend_webgpu_buffer_clear, + /* .reset = */ NULL, +}; + +/* End GGML Backend Buffer Interface */ + +/* GGML Backend Buffer Type Interface */ + +static const char * ggml_backend_webgpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); + return ctx->device_name.data; } +static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_alloc_buffer(" << size << ")"); + ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); + + wgpu::BufferDescriptor buf_desc; + buf_desc.mappedAtCreation = false; + buf_desc.size = size; + buf_desc.usage = wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst; + // TODO: error handling + wgpu::Buffer buf = ctx->device.CreateBuffer(&buf_desc); + + ggml_backend_webgpu_buffer_context * buf_ctx = new ggml_backend_webgpu_buffer_context(buf); + + return ggml_backend_buffer_init(buft, ggml_backend_webgpu_buffer_interface, buf_ctx, size); +} + +static size_t ggml_backend_webgpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); + wgpu::Limits limits; + ctx->device.GetLimits(&limits); + return limits.minStorageBufferOffsetAlignment; +} + +static size_t ggml_backend_webgpu_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); + wgpu::Limits limits; + ctx->device.GetLimits(&limits); + return limits.maxBufferSize; +} + +/* End GGML Backend Buffer Type Interface */ + +/* GGML Backend Host Buffer Type Interface */ + +static const char * ggml_backend_webgpu_host_buffer_type_name(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return GGML_WEBGPU_NAME "_Host"; +} + +// WebGPU doesn't specify a memory map alignment like Vulkan, so we use the same value as the storage buffer alignment +static size_t ggml_backend_webgpu_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); + wgpu::Limits limits; + ctx->device.GetLimits(&limits); + return limits.minStorageBufferOffsetAlignment; +} + +/* End GGML Backend Host Buffer Type Interface */ + +/* GGML Backend Device Interface */ + static const char * ggml_backend_webgpu_device_get_name(ggml_backend_dev_t dev) { ggml_backend_webgpu_device_context * ctx = static_cast(dev->context); return ctx->device_name.data; @@ -81,35 +244,13 @@ static void ggml_backend_webgpu_device_get_props(ggml_backend_dev_t dev, struct }; } -static const char * ggml_backend_webgpu_name(ggml_backend_t backend) { - ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *)backend->context; - return ctx->name.c_str(); -} - -static void ggml_backend_webgpu_free(ggml_backend_t backend) { - ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *)backend->context; - 
WEBGPU_LOG_DEBUG("ggml_backend_webgpu_free(" << ctx->name << ")"); - - // TODO: cleanup +static ggml_guid_t ggml_backend_webgpu_guid(void) { + static const char * guid_str = "__ggml_webgpu :)"; + return reinterpret_cast((void *)guid_str); } -static ggml_backend_i ggml_backend_webgpu_i = { - /* .get_name = */ ggml_backend_webgpu_name, - /* .free = */ ggml_backend_webgpu_free, - /* .set_tensor_async = */ NULL, - /* .get_tensor_async = */ NULL, - /* .cpy_tensor_async = */ NULL, - /* .synchronize = */ NULL, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, - /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, // TODO - /* .graph_compute = */ NULL, - /* .event_record = */ NULL, - /* .event_wait = */ NULL, -}; - // TODO: Does this need to be thread safe? Is it only called once? +// Implementation in GGML Backend Interface section static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) { GGML_UNUSED(params); @@ -130,30 +271,12 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co return &backend; } -static const char * ggml_backend_webgpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); - return ctx->device_name.data; -} - -static size_t ggml_backend_webgpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); - wgpu::Limits limits; - ctx->device.GetLimits(&limits); - return limits.minStorageBufferOffsetAlignment; -} - -static size_t ggml_backend_webgpu_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { - ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); - wgpu::Limits limits; - ctx->device.GetLimits(&limits); - return limits.maxBufferSize; -} - +// Implementation in GGML Backend Buffer Type Interface section static ggml_backend_buffer_type_t ggml_backend_webgpu_device_get_buffer_type(ggml_backend_dev_t dev) { static struct ggml_backend_buffer_type ggml_backend_webgpu_buffer_type = { /* .iface = */ { /* .get_name = */ ggml_backend_webgpu_buffer_type_get_name, - /* .alloc_buffer = */ NULL, // TODO + /* .alloc_buffer = */ ggml_backend_webgpu_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_webgpu_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_webgpu_buffer_type_get_max_size, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes @@ -166,19 +289,8 @@ static ggml_backend_buffer_type_t ggml_backend_webgpu_device_get_buffer_type(ggm return &ggml_backend_webgpu_buffer_type; } -static const char * ggml_backend_webgpu_host_buffer_type_name(ggml_backend_buffer_type_t buft) { - GGML_UNUSED(buft); - return GGML_WEBGPU_NAME "_Host"; -} - -// WebGPU doesn't specify a memory map alignment like Vulkan, so we use the same value as the storage buffer alignment -static size_t ggml_backend_webgpu_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); - wgpu::Limits limits; - ctx->device.GetLimits(&limits); - return limits.minStorageBufferOffsetAlignment; -} +// Implementation in GGML Backend Host Buffer Type Interface static ggml_backend_buffer_type_t ggml_backend_webgpu_device_get_host_buffer_type(ggml_backend_dev_t dev) { static struct ggml_backend_buffer_type ggml_backend_webgpu_buffer_type_host = { /* .iface = */ { @@ -229,6 +341,10 @@ static struct ggml_backend_device_i 
ggml_backend_webgpu_device_i = { /* .event_synchronize = */ NULL, }; +/* End GGML Backend Device Interface */ + +/* GGML Backend Registration Interface */ + static const char * ggml_backend_webgpu_reg_get_name(ggml_backend_reg_t reg) { ggml_backend_webgpu_reg_context * ctx = static_cast(reg->context); return ctx->name; @@ -241,6 +357,7 @@ static size_t ggml_backend_webgpu_reg_get_device_count(ggml_backend_reg_t reg) { // TODO: Does this need to be thread safe? Is it only called once? // Only one device is supported for now +// Implementation in GGML Backend Device Interface section static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t reg, size_t index) { GGML_ASSERT(index == 0); WEBGPU_LOG_DEBUG("ggml_backend_reg_get_device()"); @@ -299,6 +416,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t return &device; } + static const struct ggml_backend_reg_i ggml_backend_webgpu_reg_i = { /* .get_name = */ ggml_backend_webgpu_reg_get_name, /* .get_device_count = */ ggml_backend_webgpu_reg_get_device_count, @@ -306,6 +424,8 @@ static const struct ggml_backend_reg_i ggml_backend_webgpu_reg_i = { /* .get_proc_address = */ NULL, }; +/* End GGML Backend Registration Interface */ + // TODO: Does this need to be thread safe? Is it only called once? ggml_backend_reg_t ggml_backend_webgpu_reg() { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_reg()"); From c9a53d27550c025249f6122567919a02517816fd Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 21 May 2025 17:42:43 -0700 Subject: [PATCH 06/25] Add webgpu_context to prepare for actually running some shaders --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 159 +++++++++++++++------------ 1 file changed, 86 insertions(+), 73 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index e675a115fadb8..6213e694ed326 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -23,32 +23,47 @@ static void * const webgpu_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT /* Struct definitions */ -// When registering the backend, we initialize the WebGPU instance. -struct ggml_backend_webgpu_reg_context { +// All the base objects needed to run operations on a WebGPU device +struct webgpu_context_struct { wgpu::Instance instance; + wgpu::Adapter adapter; + wgpu::Device device; + // TODO: initialize + wgpu::Queue queue; + wgpu::Limits limits; + // TODO: initialize + wgpu::ComputePipeline memset_pipeline; +}; + +typedef std::shared_ptr webgpu_context; + +struct ggml_backend_webgpu_reg_context { + webgpu_context webgpu_ctx; + size_t device_count; const char * name; }; -// When getting the (ggml) device, we create a WebGPU adapter and its associated WebGPU device. 
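The typedef above aliases std::shared_ptr over webgpu_context_struct. Sharing one reference-counted context across the reg, device, backend, and buffer structs that follow keeps the Dawn instance/adapter/device alive for as long as any ggml object still holds a reference:

    typedef std::shared_ptr<webgpu_context_struct> webgpu_context;

    webgpu_context ctx = std::make_shared<webgpu_context_struct>();
    // every ggml-side context copies `ctx`, bumping the refcount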
struct ggml_backend_webgpu_device_context { - // An adapter can only be used to create one device - wgpu::Adapter adapter; - wgpu::Device device; - wgpu::StringView device_name; - wgpu::StringView device_desc; + webgpu_context webgpu_ctx; + + std::string device_name; + std::string device_desc; }; struct ggml_backend_webgpu_context { + webgpu_context webgpu_ctx; + std::string name; - wgpu::Device device; }; struct ggml_backend_webgpu_buffer_context { + webgpu_context webgpu_ctx; + wgpu::Buffer buffer; - ggml_backend_webgpu_buffer_context(wgpu::Buffer buf) : - buffer(buf) { + ggml_backend_webgpu_buffer_context(webgpu_context ctx, wgpu::Buffer buf) : + webgpu_ctx(ctx), buffer(buf) { } }; @@ -88,9 +103,10 @@ static ggml_backend_i ggml_backend_webgpu_i = { /* GGML Backend Buffer Interface */ -// TODO static void ggml_backend_webgpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { - GGML_UNUSED(buffer); + ggml_backend_webgpu_buffer_context * ctx = static_cast(buffer->context); + ctx->buffer.Destroy(); + delete ctx; } // TODO: what to return here? @@ -99,12 +115,6 @@ static void * ggml_backend_webgpu_buffer_get_base(ggml_backend_buffer_t buffer) return webgpu_ptr_base; } -// TODO -static enum ggml_status ggml_backend_webgpu_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_init_tensor(" << buffer << " (" << buffer->context << "), " << tensor << ")"); - return GGML_STATUS_SUCCESS; -} - // TODO static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor(" << buffer << ", " << tensor << ", " << value << ", " << offset << ", " << size << ")"); @@ -117,7 +127,7 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, // TODO static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { - WEBGPU_LOG_DEBUG("ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); } // TODO @@ -138,7 +148,7 @@ static void ggml_backend_webgpu_buffer_clear(ggml_backend_buffer_t buffer, uint8 static ggml_backend_buffer_i ggml_backend_webgpu_buffer_interface = { /* .free_buffer = */ ggml_backend_webgpu_buffer_free_buffer, /* .get_base = */ ggml_backend_webgpu_buffer_get_base, - /* .init_tensor = */ ggml_backend_webgpu_buffer_init_tensor, + /* .init_tensor = */ NULL, // TODO: should we implement this? 
/* .memset_tensor = */ ggml_backend_webgpu_buffer_memset_tensor, /* .set_tensor = */ ggml_backend_webgpu_buffer_set_tensor, /* .get_tensor = */ ggml_backend_webgpu_buffer_get_tensor, @@ -153,7 +163,7 @@ static ggml_backend_buffer_i ggml_backend_webgpu_buffer_interface = { static const char * ggml_backend_webgpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); - return ctx->device_name.data; + return ctx->device_name.c_str(); } static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { @@ -165,25 +175,21 @@ static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_b buf_desc.size = size; buf_desc.usage = wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst; // TODO: error handling - wgpu::Buffer buf = ctx->device.CreateBuffer(&buf_desc); + wgpu::Buffer buf = ctx->webgpu_ctx->device.CreateBuffer(&buf_desc); - ggml_backend_webgpu_buffer_context * buf_ctx = new ggml_backend_webgpu_buffer_context(buf); + ggml_backend_webgpu_buffer_context * buf_ctx = new ggml_backend_webgpu_buffer_context(ctx->webgpu_ctx, buf); return ggml_backend_buffer_init(buft, ggml_backend_webgpu_buffer_interface, buf_ctx, size); } static size_t ggml_backend_webgpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); - wgpu::Limits limits; - ctx->device.GetLimits(&limits); - return limits.minStorageBufferOffsetAlignment; + return ctx->webgpu_ctx->limits.minStorageBufferOffsetAlignment; } static size_t ggml_backend_webgpu_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); - wgpu::Limits limits; - ctx->device.GetLimits(&limits); - return limits.maxBufferSize; + return ctx->webgpu_ctx->limits.maxBufferSize; } /* End GGML Backend Buffer Type Interface */ @@ -198,9 +204,7 @@ static const char * ggml_backend_webgpu_host_buffer_type_name(ggml_backend_buffe // WebGPU doesn't specify a memory map alignment like Vulkan, so we use the same value as the storage buffer alignment static size_t ggml_backend_webgpu_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); - wgpu::Limits limits; - ctx->device.GetLimits(&limits); - return limits.minStorageBufferOffsetAlignment; + return ctx->webgpu_ctx->limits.minStorageBufferOffsetAlignment; } /* End GGML Backend Host Buffer Type Interface */ @@ -209,21 +213,19 @@ static size_t ggml_backend_webgpu_host_buffer_type_get_alignment(ggml_backend_bu static const char * ggml_backend_webgpu_device_get_name(ggml_backend_dev_t dev) { ggml_backend_webgpu_device_context * ctx = static_cast(dev->context); - return ctx->device_name.data; + return ctx->device_name.c_str(); } static const char * ggml_backend_webgpu_device_get_description(ggml_backend_dev_t dev) { ggml_backend_webgpu_device_context * ctx = static_cast(dev->context); - return ctx->device_desc.data; + return ctx->device_desc.c_str(); } static void ggml_backend_webgpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { ggml_backend_webgpu_device_context * ctx = static_cast(dev->context); - wgpu::Limits limits; - ctx->device.GetLimits(&limits); // TODO: what do we actually want to return here? 
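There is no exact answer to the TODO above, because WebGPU deliberately exposes no physical-memory query; adapter and device limits are the only sizing signals available. The relevant knobs (field names as in webgpu_cpp.h):

    wgpu::Limits limits{};
    device.GetLimits(&limits);
    uint64_t per_buffer_cap  = limits.maxBufferSize;               // hard cap per allocation
    uint64_t per_binding_cap = limits.maxStorageBufferBindingSize; // cap per shader binding

Multiplying maxBufferSize by a fixed WEBGPU_MAX_BUFFERS, as done here, is a placeholder heuristic rather than a measurement.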
- *free = limits.maxBufferSize * WEBGPU_MAX_BUFFERS; - *total = limits.maxBufferSize * WEBGPU_MAX_BUFFERS; + *free = ctx->webgpu_ctx->limits.maxBufferSize * WEBGPU_MAX_BUFFERS; + *total = ctx->webgpu_ctx->limits.maxBufferSize * WEBGPU_MAX_BUFFERS; } static enum ggml_backend_dev_type ggml_backend_webgpu_device_get_type(ggml_backend_dev_t dev) { @@ -258,9 +260,33 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co ggml_backend_webgpu_device_context * dev_ctx = static_cast(dev->context); + wgpu::DeviceDescriptor deviceDescriptor; + deviceDescriptor.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, + [](const wgpu::Device& device, wgpu::DeviceLostReason reason, wgpu::StringView message) { + GGML_UNUSED(device); + GGML_LOG_ERROR("ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast(reason), message.data); + }); + deviceDescriptor.SetUncapturedErrorCallback( + [](const wgpu::Device& device, wgpu::ErrorType reason, wgpu::StringView message) { + GGML_UNUSED(device); + GGML_LOG_ERROR("ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast(reason), message.data); + }); + webgpu_context webgpu_ctx = dev_ctx->webgpu_ctx; + dev_ctx->webgpu_ctx->instance.WaitAny(dev_ctx->webgpu_ctx->adapter.RequestDevice(&deviceDescriptor, wgpu::CallbackMode::WaitAnyOnly, + [webgpu_ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) { + if (status != wgpu::RequestDeviceStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", message.data); + return; + } + webgpu_ctx->device = device; + }), + UINT64_MAX + ); + GGML_ASSERT(dev_ctx->webgpu_ctx->device != nullptr); + static ggml_backend_webgpu_context backend_ctx; - backend_ctx.name = GGML_WEBGPU_NAME + std::string(": ") + std::string(dev_ctx->device_name.data); - backend_ctx.device = dev_ctx->device; + backend_ctx.name = GGML_WEBGPU_NAME + std::string(": ") + dev_ctx->device_name; + backend_ctx.webgpu_ctx = dev_ctx->webgpu_ctx; static ggml_backend backend = { /* .guid = */ ggml_backend_webgpu_guid(), @@ -363,7 +389,8 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t WEBGPU_LOG_DEBUG("ggml_backend_reg_get_device()"); ggml_backend_webgpu_reg_context * reg_ctx = static_cast(reg->context); - static ggml_backend_webgpu_device_context device_ctx; + + webgpu_context ctx = reg_ctx->webgpu_ctx; wgpu::RequestAdapterOptions options = {}; auto callback = [](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char *message, void *userdata) { @@ -373,38 +400,20 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t } *static_cast(userdata) = adapter; }; - auto callbackMode = wgpu::CallbackMode::WaitAnyOnly; - void *userdata = &device_ctx.adapter; - reg_ctx->instance.WaitAny(reg_ctx->instance.RequestAdapter(&options, callbackMode, callback, userdata), UINT64_MAX); - GGML_ASSERT(device_ctx.adapter != nullptr); + void *userdata = &ctx->adapter; + ctx->instance.WaitAny(ctx->instance.RequestAdapter(&options, wgpu::CallbackMode::WaitAnyOnly, callback, userdata), UINT64_MAX); + GGML_ASSERT(ctx->adapter != nullptr); - wgpu::DeviceDescriptor deviceDescriptor; - deviceDescriptor.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, - [](const wgpu::Device& device, wgpu::DeviceLostReason reason, wgpu::StringView message) { - GGML_UNUSED(device); - GGML_LOG_ERROR("ggml_webgpu: Device lost! 
Reason: %d, Message: %s\n", static_cast(reason), message.data); - }); - deviceDescriptor.SetUncapturedErrorCallback( - [](const wgpu::Device& device, wgpu::ErrorType reason, wgpu::StringView message) { - GGML_UNUSED(device); - GGML_LOG_ERROR("ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast(reason), message.data); - }); - reg_ctx->instance.WaitAny(device_ctx.adapter.RequestDevice(&deviceDescriptor, callbackMode, - [](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) { - if (status != wgpu::RequestDeviceStatus::Success) { - GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", message.data); - return; - } - device_ctx.device = std::move(device); - }), - UINT64_MAX - ); - GGML_ASSERT(device_ctx.device != nullptr); + ctx->adapter.GetLimits(&ctx->limits); wgpu::AdapterInfo info{}; - device_ctx.adapter.GetInfo(&info); - device_ctx.device_name = info.device; - device_ctx.device_desc = info.description; + ctx->adapter.GetInfo(&info); + + static ggml_backend_webgpu_device_context device_ctx; + device_ctx.webgpu_ctx = ctx; + device_ctx.device_name = std::string(info.device.data); + device_ctx.device_desc = std::string(info.description.data); + GGML_LOG_INFO("ggml_webgpu: adapter_info: vendor_id: %u | vendor: %s | architecture: %s | device_id: %u | name: %s | device_desc: %s\n", info.vendorID, info.vendor.data, info.architecture.data, info.deviceID, info.device.data, info.description.data); @@ -430,14 +439,18 @@ static const struct ggml_backend_reg_i ggml_backend_webgpu_reg_i = { ggml_backend_reg_t ggml_backend_webgpu_reg() { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_reg()"); + webgpu_context webgpu_ctx = std::make_shared(); + static ggml_backend_webgpu_reg_context ctx; + ctx.webgpu_ctx = webgpu_ctx; ctx.name = GGML_WEBGPU_NAME; ctx.device_count = 1; + wgpu::InstanceDescriptor instanceDescriptor{}; instanceDescriptor.capabilities.timedWaitAnyEnable = true; - ctx.instance = wgpu::CreateInstance(&instanceDescriptor); - GGML_ASSERT(ctx.instance != nullptr); + webgpu_ctx->instance = wgpu::CreateInstance(&instanceDescriptor); + GGML_ASSERT(webgpu_ctx->instance != nullptr); static ggml_backend_reg reg = { /* .api_version = */ GGML_BACKEND_API_VERSION, From 9e0c6111756e55e43137c9998fb0da5300017c3f Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 4 Jun 2025 17:11:37 -0700 Subject: [PATCH 07/25] Work on memset and add shader loading --- ggml/src/ggml-webgpu/CMakeLists.txt | 38 +++++++++++++++++-- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 11 +++++- .../ggml-webgpu/wgsl-shaders/embed_wgsl.py | 32 ++++++++++++++++ ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl | 38 +++++++++++++++++++ 4 files changed, 113 insertions(+), 6 deletions(-) create mode 100755 ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py create mode 100644 ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl diff --git a/ggml/src/ggml-webgpu/CMakeLists.txt b/ggml/src/ggml-webgpu/CMakeLists.txt index 27d4472ea65fd..1ce496b391afe 100644 --- a/ggml/src/ggml-webgpu/CMakeLists.txt +++ b/ggml/src/ggml-webgpu/CMakeLists.txt @@ -1,15 +1,45 @@ cmake_minimum_required(VERSION 3.13) find_package(Dawn REQUIRED) +find_package(Python3 REQUIRED) + +# Shader locations +set(SHADER_DIR "${CMAKE_CURRENT_SOURCE_DIR}/wgsl-shaders") +set(SHADER_OUTPUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/generated") +set(SHADER_HEADER "${SHADER_OUTPUT_DIR}/ggml-wgsl-shaders.hpp") +file(MAKE_DIRECTORY ${SHADER_OUTPUT_DIR}) + +message(STATUS "Shader output dir: ${SHADER_OUTPUT_DIR}") + +# Find all WGSL files +file(GLOB WGSL_SHADER_FILES 
"${SHADER_DIR}/*.wgsl") + +# Generate the header using a Python script +add_custom_command( + OUTPUT ${SHADER_HEADER} + COMMAND ${CMAKE_COMMAND} -E echo "Embedding WGSL shaders to ggml-wgsl-shaders.hpp" + COMMAND ${CMAKE_COMMAND} -E make_directory ${SHADER_OUTPUT_DIR} + COMMAND ${CMAKE_COMMAND} -E env PYTHONIOENCODING=utf-8 + ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/wgsl-shaders/embed_wgsl.py + --input "${SHADER_DIR}" + --output "${SHADER_HEADER}" + DEPENDS ${WGSL_SHADER_FILES} ${CMAKE_CURRENT_SOURCE_DIR}/wgsl-shaders/embed_wgsl.py + VERBATIM +) + +add_custom_target(generate_shaders DEPENDS ${SHADER_HEADER}) ggml_add_backend_library(ggml-webgpu - ggml-webgpu.cpp - ../../include/ggml-webgpu.h - ) + ggml-webgpu.cpp + ${SHADER_HEADER} + ../../include/ggml-webgpu.h +) + +add_dependencies(ggml-webgpu generate_shaders) if (GGML_WEBGPU_DEBUG) target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_DEBUG=1) endif() +target_include_directories(ggml-webgpu PRIVATE ${SHADER_OUTPUT_DIR}) target_link_libraries(ggml-webgpu PRIVATE dawn::webgpu_dawn) - diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 6213e694ed326..2b4286b47f57c 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -6,6 +6,8 @@ #include "ggml-impl.h" #include "ggml-backend-impl.h" +#include "ggml-wgsl-shaders.hpp" + #include #include @@ -28,7 +30,6 @@ struct webgpu_context_struct { wgpu::Instance instance; wgpu::Adapter adapter; wgpu::Device device; - // TODO: initialize wgpu::Queue queue; wgpu::Limits limits; // TODO: initialize @@ -260,6 +261,7 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co ggml_backend_webgpu_device_context * dev_ctx = static_cast(dev->context); + // Initialize device wgpu::DeviceDescriptor deviceDescriptor; deviceDescriptor.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, [](const wgpu::Device& device, wgpu::DeviceLostReason reason, wgpu::StringView message) { @@ -272,7 +274,7 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co GGML_LOG_ERROR("ggml_webgpu: Device error! 
Reason: %d, Message: %s\n", static_cast(reason), message.data); }); webgpu_context webgpu_ctx = dev_ctx->webgpu_ctx; - dev_ctx->webgpu_ctx->instance.WaitAny(dev_ctx->webgpu_ctx->adapter.RequestDevice(&deviceDescriptor, wgpu::CallbackMode::WaitAnyOnly, + webgpu_ctx->instance.WaitAny(webgpu_ctx->adapter.RequestDevice(&deviceDescriptor, wgpu::CallbackMode::WaitAnyOnly, [webgpu_ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) { if (status != wgpu::RequestDeviceStatus::Success) { GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", message.data); @@ -284,6 +286,10 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co ); GGML_ASSERT(dev_ctx->webgpu_ctx->device != nullptr); + // Initialize (compute) queue + dev_ctx->webgpu_ctx->queue = dev_ctx->webgpu_ctx->device.GetQueue(); + + static ggml_backend_webgpu_context backend_ctx; backend_ctx.name = GGML_WEBGPU_NAME + std::string(": ") + dev_ctx->device_name; backend_ctx.webgpu_ctx = dev_ctx->webgpu_ctx; @@ -294,6 +300,7 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co /* .device = */ dev, /* .context = */ &backend_ctx, }; + return &backend; } diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py new file mode 100755 index 0000000000000..b3470fef9483f --- /dev/null +++ b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py @@ -0,0 +1,32 @@ +import os +import argparse + +def to_c_array(name, data): + varname = f"wgsl_{name}" + byte_array = ', '.join(f'0x{b:02x}' for b in data) + return f"""\ +const unsigned char {varname}[] = {{ + {byte_array} +}}; +const unsigned int {varname}_len = sizeof({varname}); +""" + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--input', required=True, help='Input directory with .wgsl files') + parser.add_argument('--output', required=True, help='Output .hpp file path') + args = parser.parse_args() + + with open(args.output, 'w', encoding='utf-8') as out: + out.write("// Auto-generated WGSL header\n\n") + for fname in sorted(os.listdir(args.input)): + if fname.endswith('.wgsl'): + path = os.path.join(args.input, fname) + varname = os.path.splitext(fname)[0] + with open(path, 'rb') as f: + data = f.read() + out.write(to_c_array(varname, data)) + out.write('\n') + +if __name__ == '__main__': + main() diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl new file mode 100644 index 0000000000000..7a76640b1ad0f --- /dev/null +++ b/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl @@ -0,0 +1,38 @@ +// memset.wgsl +@group(0) @binding(0) +var output_buffer: array; + +struct Params { + offset: u32, // in bytes + size: u32, // in bytes + value: u32, // four identical values +}; + +@group(0) @binding(1) +var params: Params; + +// TODO: figure out workgroup size +@compute @workgroup_size(64) +fn main(@builtin(global_invocation_id) gid: vec3) { + let i = gid.x * 4u; + let start = params.offset; + let end = params.offset + params.size; + + // Each thread writes one u32 (4 bytes) + let byte_index = start + i; + if (byte_index + 4u <= end) { + output_buffer[(byte_index >> 2u)] = params.value; + } else { + // Handle tail (unaligned) + for (var j: u32 = 0u; j < 4u; j = j + 1u) { + let idx = byte_index + j; + if (idx < end) { + let word_idx = idx >> 2u; + let byte_offset = (idx & 3u) * 8u; + let mask = ~(0xffu << byte_offset); + let existing = output_buffer[word_idx]; + output_buffer[word_idx] = (existing & 
mask) | (params.value & 0xffu) << byte_offset; + } + } + } +} From 520f59562d1d513d6e7b53e94ef4256c5b886bc7 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Thu, 5 Jun 2025 13:47:02 -0700 Subject: [PATCH 08/25] Work on memset polyfill --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 164 ++++++++++++++++-- .../ggml-webgpu/wgsl-shaders/embed_wgsl.py | 37 ++-- ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl | 35 ++-- 3 files changed, 182 insertions(+), 54 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 2b4286b47f57c..849803a3384cf 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -20,9 +20,17 @@ // TODO: find a better way to get the memory available #define WEBGPU_MAX_BUFFERS 32 -// TODO: copied from Vulkan for now, not sure what it's used for +// This is a "fake" base pointer, since WebGPU buffers do not have pointers to their locations. static void * const webgpu_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT +// Always returns the base offset of a tensor, regardless of views. +static uint64_t webgpu_tensor_offset(const ggml_tensor * tensor) { + if (tensor->view_src) { + return (uint8_t *) tensor->view_src->data - (uint8_t *) webgpu_ptr_base; + } + return (uint8_t *) tensor->data - (uint8_t *) webgpu_ptr_base; +} + /* Struct definitions */ // All the base objects needed to run operations on a WebGPU device @@ -32,8 +40,12 @@ struct webgpu_context_struct { wgpu::Device device; wgpu::Queue queue; wgpu::Limits limits; - // TODO: initialize + + // memset pipeline and parameter buffers wgpu::ComputePipeline memset_pipeline; + wgpu::Buffer memset_params_dev_buf; + wgpu::Buffer memset_params_host_buf; + size_t memset_elems_per_thread; }; typedef std::shared_ptr webgpu_context; @@ -70,6 +82,40 @@ struct ggml_backend_webgpu_buffer_context { /* End struct definitions */ +/* WebGPU object initializations */ + +static void ggml_webgpu_create_pipeline(wgpu::Device &device, wgpu::ComputePipeline &pipeline, const char * shader_code, const std::vector &constants = {}) { + WEBGPU_LOG_DEBUG("ggml_webgpu_create_pipeline()"); + wgpu::ShaderSourceWGSL shader_source; + shader_source.code = shader_code; + wgpu::ShaderModuleDescriptor shader_desc; + shader_desc.nextInChain = &shader_source; + wgpu::ShaderModule shader_module = device.CreateShaderModule(&shader_desc); + + wgpu::ComputePipelineDescriptor pipeline_desc; + pipeline_desc.compute.module = shader_module; + pipeline_desc.compute.entryPoint = "main"; // Entry point in the WGSL code + pipeline_desc.layout = nullptr; // Guessing that nullptr means auto layout + if (constants.size() > 0) { + pipeline_desc.compute.constants = constants.data(); + pipeline_desc.compute.constantCount = constants.size(); + } + pipeline = device.CreateComputePipeline(&pipeline_desc); +} + +static void ggml_webgpu_create_buffer(wgpu::Device &device, wgpu::Buffer &buffer, size_t size, wgpu::BufferUsage usage) { + WEBGPU_LOG_DEBUG("ggml_webgpu_create_buffer()"); + + wgpu::BufferDescriptor buffer_desc; + buffer_desc.size = size; + buffer_desc.usage = usage; + buffer_desc.mappedAtCreation = false; + // TODO: error handling + buffer = device.CreateBuffer(&buffer_desc); +} + +/** End WebGPU object initializations */ + /** GGML Backend Interface */ static const char * ggml_backend_webgpu_name(ggml_backend_t backend) { @@ -110,15 +156,77 @@ static void ggml_backend_webgpu_buffer_free_buffer(ggml_backend_buffer_t buffer) delete ctx; } -// TODO: what to return here? 
+// Returns the "fake" base pointer. static void * ggml_backend_webgpu_buffer_get_base(ggml_backend_buffer_t buffer) { GGML_UNUSED(buffer); return webgpu_ptr_base; } -// TODO static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { + if (size == 0) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor: size is zero, nothing to do."); + return; + } WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor(" << buffer << ", " << tensor << ", " << value << ", " << offset << ", " << size << ")"); + ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; + webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx; + wgpu::Device device = webgpu_ctx->device; + + // map the host parameters buffer + webgpu_ctx->instance.WaitAny(webgpu_ctx->memset_params_host_buf.MapAsync( + wgpu::MapMode::Write, 0, webgpu_ctx->memset_params_host_buf.GetSize(), wgpu::CallbackMode::WaitAnyOnly, + [](wgpu::MapAsyncStatus status, wgpu::StringView message) { + if (status != wgpu::MapAsyncStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to map buffer: %s\n", message.data); + } + }), + UINT64_MAX + ); + + // Set the host parameter buffer + uint32_t * params = (uint32_t *)webgpu_ctx->memset_params_host_buf.GetMappedRange(); + size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; + uint32_t val32 = (uint32_t)value * 0x01010101; + params[0] = (uint32_t)total_offset; + params[1] = (uint32_t)size; + params[2] = val32; + webgpu_ctx->memset_params_host_buf.Unmap(); + + // buffer to memset + wgpu::Buffer buf = buf_ctx->buffer; + + wgpu::BindGroupEntry entries[2]; + entries[0].binding = 0; // binding for the buffer to memset + entries[0].buffer = buf; + entries[0].offset = 0; + entries[0].size = buf.GetSize(); + entries[1].binding = 1; // binding for the parameters + entries[1].buffer = webgpu_ctx->memset_params_dev_buf; + entries[1].offset = 0; + entries[1].size = webgpu_ctx->memset_params_dev_buf.GetSize(); + + wgpu::BindGroupDescriptor bind_group_desc; + bind_group_desc.layout = webgpu_ctx->memset_pipeline.GetBindGroupLayout(0); + bind_group_desc.entryCount = 2; + bind_group_desc.entries = entries; + wgpu::BindGroup bind_group = device.CreateBindGroup(&bind_group_desc); + + wgpu::CommandEncoder encoder = device.CreateCommandEncoder(); + encoder.CopyBufferToBuffer( + webgpu_ctx->memset_params_host_buf, 0, + webgpu_ctx->memset_params_dev_buf, 0, + webgpu_ctx->memset_params_dev_buf.GetSize() + ); + wgpu::ComputePassEncoder pass = encoder.BeginComputePass(); + pass.SetPipeline(webgpu_ctx->memset_pipeline); + pass.SetBindGroup(0, bind_group); + size_t elems_per_wg = webgpu_ctx->limits.maxComputeWorkgroupSizeX * webgpu_ctx->memset_elems_per_thread; + pass.DispatchWorkgroups((((size + 3)/4) + elems_per_wg - 1) / elems_per_wg, 1, 1); + pass.End(); + wgpu::CommandBuffer commands = encoder.Finish(); + + // async, do we need to wait on this? 
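+    // Submit() only enqueues the work; commands on a single WebGPU queue run
+    // in submission order, so later copies or MapAsync calls touching this
+    // buffer will observe the memset. If the host ever needs to block until
+    // completion, one option is to wait on the queue (a sketch; the exact
+    // callback signature depends on the Dawn version in use):
+    //   webgpu_ctx->instance.WaitAny(webgpu_ctx->queue.OnSubmittedWorkDone(
+    //       wgpu::CallbackMode::WaitAnyOnly,
+    //       [](wgpu::QueueWorkDoneStatus status) { /* check status */ }), UINT64_MAX);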
+ webgpu_ctx->queue.Submit(1, &commands); } // TODO @@ -171,12 +279,9 @@ static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_b WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_alloc_buffer(" << size << ")"); ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); - wgpu::BufferDescriptor buf_desc; - buf_desc.mappedAtCreation = false; - buf_desc.size = size; - buf_desc.usage = wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst; - // TODO: error handling - wgpu::Buffer buf = ctx->webgpu_ctx->device.CreateBuffer(&buf_desc); + wgpu::Buffer buf; + ggml_webgpu_create_buffer(ctx->webgpu_ctx->device, buf, size, + wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst); ggml_backend_webgpu_buffer_context * buf_ctx = new ggml_backend_webgpu_buffer_context(ctx->webgpu_ctx, buf); @@ -188,9 +293,10 @@ static size_t ggml_backend_webgpu_buffer_type_get_alignment(ggml_backend_buffer_ return ctx->webgpu_ctx->limits.minStorageBufferOffsetAlignment; } +// maxBufferSize might be larger, but you can't bind more than maxStorageBufferBindingSize to a single binding. static size_t ggml_backend_webgpu_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); - return ctx->webgpu_ctx->limits.maxBufferSize; + return ctx->webgpu_ctx->limits.maxStorageBufferBindingSize; } /* End GGML Backend Buffer Type Interface */ @@ -252,8 +358,26 @@ static ggml_guid_t ggml_backend_webgpu_guid(void) { return reinterpret_cast((void *)guid_str); } +static void ggml_webgpu_init_memset_pipeline(webgpu_context webgpu_ctx) { + // we use the maximum workgroup size for the memset pipeline + size_t max_wg_size = webgpu_ctx->limits.maxComputeWorkgroupSizeX; + size_t max_threads = max_wg_size * webgpu_ctx->limits.maxComputeWorkgroupsPerDimension; + // Size the elems_per_thread so that the largest buffer size can be handled + webgpu_ctx->memset_elems_per_thread = (webgpu_ctx->limits.maxStorageBufferBindingSize / sizeof(uint32_t) + max_threads - 1) / max_threads; + std::vector constants(2); + constants[0].key = "wg_size"; + constants[0].value = max_wg_size; + constants[1].key = "elems_per_thread"; + constants[1].value = webgpu_ctx->memset_elems_per_thread; + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->memset_pipeline, wgsl_memset, constants); + ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->memset_params_dev_buf, + 3 * sizeof(uint32_t), // 3 parameters: buffer size, offset, value + wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst); + ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->memset_params_host_buf, + 3 * sizeof(uint32_t), wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc); +} + // TODO: Does this need to be thread safe? Is it only called once? 
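// On the thread-safety question: ggml normally resolves a device once through
// the registry, but nothing here enforces that. If concurrent init ever
// matters, the one-time device/pipeline setup could be guarded, e.g. (sketch):
//   static std::once_flag once;
//   std::call_once(once, [&] { /* request device, build pipelines */ });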
-// Implementation in GGML Backend Interface section static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) { GGML_UNUSED(params); @@ -284,16 +408,18 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co }), UINT64_MAX ); - GGML_ASSERT(dev_ctx->webgpu_ctx->device != nullptr); + GGML_ASSERT(webgpu_ctx->device != nullptr); // Initialize (compute) queue - dev_ctx->webgpu_ctx->queue = dev_ctx->webgpu_ctx->device.GetQueue(); - + webgpu_ctx->queue = webgpu_ctx->device.GetQueue(); + + ggml_webgpu_init_memset_pipeline(webgpu_ctx); static ggml_backend_webgpu_context backend_ctx; backend_ctx.name = GGML_WEBGPU_NAME + std::string(": ") + dev_ctx->device_name; - backend_ctx.webgpu_ctx = dev_ctx->webgpu_ctx; + backend_ctx.webgpu_ctx = webgpu_ctx; + // See GGML Backend Interface section static ggml_backend backend = { /* .guid = */ ggml_backend_webgpu_guid(), /* .interface = */ ggml_backend_webgpu_i, @@ -304,8 +430,8 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co return &backend; } -// Implementation in GGML Backend Buffer Type Interface section static ggml_backend_buffer_type_t ggml_backend_webgpu_device_get_buffer_type(ggml_backend_dev_t dev) { + // See GGML Backend Buffer Type Interface section static struct ggml_backend_buffer_type ggml_backend_webgpu_buffer_type = { /* .iface = */ { /* .get_name = */ ggml_backend_webgpu_buffer_type_get_name, @@ -323,8 +449,8 @@ static ggml_backend_buffer_type_t ggml_backend_webgpu_device_get_buffer_type(ggm } -// Implementation in GGML Backend Host Buffer Type Interface static ggml_backend_buffer_type_t ggml_backend_webgpu_device_get_host_buffer_type(ggml_backend_dev_t dev) { + // See GGML Backend Host Buffer Type Interface section static struct ggml_backend_buffer_type ggml_backend_webgpu_buffer_type_host = { /* .iface = */ { /* .get_name = */ ggml_backend_webgpu_host_buffer_type_name, @@ -390,7 +516,6 @@ static size_t ggml_backend_webgpu_reg_get_device_count(ggml_backend_reg_t reg) { // TODO: Does this need to be thread safe? Is it only called once? 
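// C++11 guarantees thread-safe initialization of the function-local statics
// used below, but the subsequent writes to their fields are unsynchronized;
// the working assumption is that the registry enumerates devices once.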
// Only one device is supported for now -// Implementation in GGML Backend Device Interface section static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t reg, size_t index) { GGML_ASSERT(index == 0); WEBGPU_LOG_DEBUG("ggml_backend_reg_get_device()"); @@ -424,6 +549,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t GGML_LOG_INFO("ggml_webgpu: adapter_info: vendor_id: %u | vendor: %s | architecture: %s | device_id: %u | name: %s | device_desc: %s\n", info.vendorID, info.vendor.data, info.architecture.data, info.deviceID, info.device.data, info.description.data); + // See GGML Backend Device Interface section static ggml_backend_device device = { /* .iface = */ ggml_backend_webgpu_device_i, /* .reg = */ reg, diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py index b3470fef9483f..daec8fe87dfda 100755 --- a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +++ b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py @@ -1,32 +1,31 @@ import os import argparse -def to_c_array(name, data): - varname = f"wgsl_{name}" - byte_array = ', '.join(f'0x{b:02x}' for b in data) - return f"""\ -const unsigned char {varname}[] = {{ - {byte_array} -}}; -const unsigned int {varname}_len = sizeof({varname}); -""" +def escape_triple_quotes(wgsl): + # Simple defense in case of embedded """ + return wgsl.replace('"""', '\\"""') + +def to_cpp_string_literal(varname, content): + return f'const char* wgsl_{varname} = R"({content})";\n' def main(): parser = argparse.ArgumentParser() - parser.add_argument('--input', required=True, help='Input directory with .wgsl files') - parser.add_argument('--output', required=True, help='Output .hpp file path') + parser.add_argument('--input', required=True) + parser.add_argument('--output', required=True) args = parser.parse_args() with open(args.output, 'w', encoding='utf-8') as out: - out.write("// Auto-generated WGSL header\n\n") + out.write("// Auto-generated shader embedding \n\n") for fname in sorted(os.listdir(args.input)): - if fname.endswith('.wgsl'): - path = os.path.join(args.input, fname) - varname = os.path.splitext(fname)[0] - with open(path, 'rb') as f: - data = f.read() - out.write(to_c_array(varname, data)) - out.write('\n') + if not fname.endswith('.wgsl'): + continue + shader_path = os.path.join(args.input, fname) + varname = os.path.splitext(fname)[0] + with open(shader_path, 'r', encoding='utf-8') as f: + content = f.read() + content = escape_triple_quotes(content) + out.write(to_cpp_string_literal(varname, content)) + out.write('\n') if __name__ == '__main__': main() diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl index 7a76640b1ad0f..32c0b2b67d5a2 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl @@ -11,27 +11,30 @@ struct Params { @group(0) @binding(1) var params: Params; -// TODO: figure out workgroup size -@compute @workgroup_size(64) +override wg_size: u32; +override elems_per_thread: u32; + +@compute @workgroup_size(wg_size) fn main(@builtin(global_invocation_id) gid: vec3) { let i = gid.x * 4u; let start = params.offset; let end = params.offset + params.size; - // Each thread writes one u32 (4 bytes) - let byte_index = start + i; - if (byte_index + 4u <= end) { - output_buffer[(byte_index >> 2u)] = params.value; - } else { - // Handle tail (unaligned) - for (var j: u32 = 0u; j < 4u; j = j + 1u) { - let idx = byte_index + j; 
- if (idx < end) { - let word_idx = idx >> 2u; - let byte_offset = (idx & 3u) * 8u; - let mask = ~(0xffu << byte_offset); - let existing = output_buffer[word_idx]; - output_buffer[word_idx] = (existing & mask) | (params.value & 0xffu) << byte_offset; + for (var j: u32 = 0u; j < elems_per_thread; j = j + 1u) { + let byte_index = start + i + j; + if (byte_index + 4u <= end) { + output_buffer[(byte_index >> 2u)] = params.value; + } else { + // Handle tail (unaligned) + for (var k: u32 = 0u; k < 4u; k = k + 1u) { + let idx = byte_index + k; + if (idx < end) { + let word_idx = idx >> 2u; + let byte_offset = (idx & 3u) * 8u; + let mask = ~(0xffu << byte_offset); + let existing = output_buffer[word_idx]; + output_buffer[word_idx] = (existing & mask) | ((params.value & 0xffu) << byte_offset); + } } } } From 2d24a8ada1b08d1d3aa9548fb1e9f3e2b3c3250f Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Sun, 8 Jun 2025 20:26:37 -0700 Subject: [PATCH 09/25] Implement set_tensor as webgpu WriteBuffer, remove host_buffer stubs since webgpu doesn't support it --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 50 ++++++---------------------- 1 file changed, 11 insertions(+), 39 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 849803a3384cf..885a1ab9bcb98 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -225,16 +225,22 @@ static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffe pass.End(); wgpu::CommandBuffer commands = encoder.Finish(); - // async, do we need to wait on this? + // TODO, async, do we need to wait on this? webgpu_ctx->queue.Submit(1, &commands); } -// TODO static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); + ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; + webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx; + + size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; + + // TODO: wait on this? + webgpu_ctx->queue.WriteBuffer(buf_ctx->buffer, total_offset, data, size); } -// TODO +// TODO: we need a staging buffer for this, since WebGPU does not allow reading from storage buffers directly. 
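+// Readback sketch: create a staging buffer with
+// wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, encode a
+// CopyBufferToBuffer() from the storage buffer into it, Submit(), then
+// MapAsync() with wgpu::MapMode::Read and memcpy out of the mapped range.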
static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); } @@ -301,21 +307,6 @@ static size_t ggml_backend_webgpu_buffer_type_get_max_size(ggml_backend_buffer_t /* End GGML Backend Buffer Type Interface */ -/* GGML Backend Host Buffer Type Interface */ - -static const char * ggml_backend_webgpu_host_buffer_type_name(ggml_backend_buffer_type_t buft) { - GGML_UNUSED(buft); - return GGML_WEBGPU_NAME "_Host"; -} - -// WebGPU doesn't specify a memory map alignment like Vulkan, so we use the same value as the storage buffer alignment -static size_t ggml_backend_webgpu_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); - return ctx->webgpu_ctx->limits.minStorageBufferOffsetAlignment; -} - -/* End GGML Backend Host Buffer Type Interface */ - /* GGML Backend Device Interface */ static const char * ggml_backend_webgpu_device_get_name(ggml_backend_dev_t dev) { @@ -347,7 +338,7 @@ static void ggml_backend_webgpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_webgpu_device_get_memory(dev, &props->memory_free, &props->memory_total); props->caps = { /* .async = */ false, - /* .host_buffer = */ true, // maybe? not sure what this means yet + /* .host_buffer = */ false, /* .buffer_from_host_ptr = */ false, /* .events = */ false, }; @@ -448,25 +439,6 @@ static ggml_backend_buffer_type_t ggml_backend_webgpu_device_get_buffer_type(ggm return &ggml_backend_webgpu_buffer_type; } - -static ggml_backend_buffer_type_t ggml_backend_webgpu_device_get_host_buffer_type(ggml_backend_dev_t dev) { - // See GGML Backend Host Buffer Type Interface section - static struct ggml_backend_buffer_type ggml_backend_webgpu_buffer_type_host = { - /* .iface = */ { - /* .get_name = */ ggml_backend_webgpu_host_buffer_type_name, - /* .alloc_buffer = */ NULL, // TODO - /* .get_alignment = */ ggml_backend_webgpu_host_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, - }, - /* .device = */ dev, - /* .context = */ NULL, - }; - - return &ggml_backend_webgpu_buffer_type_host; -} - static bool ggml_backend_webgpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { GGML_UNUSED(dev); return buft->iface.get_name == ggml_backend_webgpu_buffer_type_get_name; @@ -490,7 +462,7 @@ static struct ggml_backend_device_i ggml_backend_webgpu_device_i = { /* .get_props = */ ggml_backend_webgpu_device_get_props, /* .init_backend = */ ggml_backend_webgpu_device_init, /* .get_buffer_type = */ ggml_backend_webgpu_device_get_buffer_type, - /* .get_host_buffer_type = */ ggml_backend_webgpu_device_get_host_buffer_type, + /* .get_host_buffer_type = */ NULL, /* .buffer_from_host_ptr = */ NULL, /* .supports_op = */ ggml_backend_webgpu_device_supports_op, /* .supports_buft = */ ggml_backend_webgpu_device_supports_buft, From d0480cae1d07469263ab8d3e7d8c258e112c4799 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Mon, 9 Jun 2025 12:55:12 -0700 Subject: [PATCH 10/25] Implement get_tensor and buffer_clear --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 176 ++++++++++++++++----------- 1 file changed, 107 insertions(+), 69 
deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 885a1ab9bcb98..a0e578751cc10 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -46,6 +46,9 @@ struct webgpu_context_struct { wgpu::Buffer memset_params_dev_buf; wgpu::Buffer memset_params_host_buf; size_t memset_elems_per_thread; + + // Staging buffer for reading data from the GPU + wgpu::Buffer get_tensor_staging_buf; }; typedef std::shared_ptr webgpu_context; @@ -116,6 +119,71 @@ static void ggml_webgpu_create_buffer(wgpu::Device &device, wgpu::Buffer &buffer /** End WebGPU object initializations */ +/** WebGPU Actions */ + +static void * ggml_backend_webgpu_map_buffer(webgpu_context ctx, wgpu::Buffer buffer, wgpu::MapMode mode, size_t offset, size_t size) { + ctx->instance.WaitAny(buffer.MapAsync( + mode, offset, size, wgpu::CallbackMode::WaitAnyOnly, + [](wgpu::MapAsyncStatus status, wgpu::StringView message) { + if (status != wgpu::MapAsyncStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to map buffer: %s\n", message.data); + } + }), + UINT64_MAX + ); + return buffer.GetMappedRange(); +} + +static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer buf, uint8_t value, size_t offset, size_t size) { + wgpu::Device device = ctx->device; + + // map the host parameters buffer + uint32_t * params = (uint32_t *)ggml_backend_webgpu_map_buffer(ctx, ctx->memset_params_host_buf, + wgpu::MapMode::Write, 0, ctx->memset_params_host_buf.GetSize()); + + // This is a trick to set all bytes of a u32 to the same 1 byte value. + uint32_t val32 = (uint32_t)value * 0x01010101; + params[0] = (uint32_t)offset; + params[1] = (uint32_t)size; + params[2] = val32; + ctx->memset_params_host_buf.Unmap(); + + wgpu::BindGroupEntry entries[2]; + entries[0].binding = 0; // binding for the buffer to memset + entries[0].buffer = buf; + entries[0].offset = 0; + entries[0].size = buf.GetSize(); + entries[1].binding = 1; // binding for the parameters + entries[1].buffer = ctx->memset_params_dev_buf; + entries[1].offset = 0; + entries[1].size = ctx->memset_params_dev_buf.GetSize(); + + wgpu::BindGroupDescriptor bind_group_desc; + bind_group_desc.layout = ctx->memset_pipeline.GetBindGroupLayout(0); + bind_group_desc.entryCount = 2; + bind_group_desc.entries = entries; + wgpu::BindGroup bind_group = device.CreateBindGroup(&bind_group_desc); + + wgpu::CommandEncoder encoder = device.CreateCommandEncoder(); + encoder.CopyBufferToBuffer( + ctx->memset_params_host_buf, 0, + ctx->memset_params_dev_buf, 0, + ctx->memset_params_dev_buf.GetSize() + ); + wgpu::ComputePassEncoder pass = encoder.BeginComputePass(); + pass.SetPipeline(ctx->memset_pipeline); + pass.SetBindGroup(0, bind_group); + size_t elems_per_wg = ctx->limits.maxComputeWorkgroupSizeX * ctx->memset_elems_per_thread; + pass.DispatchWorkgroups((((size + 3)/4) + elems_per_wg - 1) / elems_per_wg, 1, 1); + pass.End(); + wgpu::CommandBuffer commands = encoder.Finish(); + + // TODO, async, do we need to wait on this? 
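+    // Reusing the single params host buffer across calls is safe without an
+    // explicit wait here: the next MapAsync() on it cannot complete until the
+    // submitted CopyBufferToBuffer that reads from it has finished.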
+ ctx->queue.Submit(1, &commands); +} + +/** End WebGPU Actions */ + /** GGML Backend Interface */ static const char * ggml_backend_webgpu_name(ggml_backend_t backend) { @@ -167,66 +235,12 @@ static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffe WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor: size is zero, nothing to do."); return; } - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor(" << buffer << ", " << tensor << ", " << value << ", " << offset << ", " << size << ")"); - ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; - webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx; - wgpu::Device device = webgpu_ctx->device; - // map the host parameters buffer - webgpu_ctx->instance.WaitAny(webgpu_ctx->memset_params_host_buf.MapAsync( - wgpu::MapMode::Write, 0, webgpu_ctx->memset_params_host_buf.GetSize(), wgpu::CallbackMode::WaitAnyOnly, - [](wgpu::MapAsyncStatus status, wgpu::StringView message) { - if (status != wgpu::MapAsyncStatus::Success) { - GGML_LOG_ERROR("ggml_webgpu: Failed to map buffer: %s\n", message.data); - } - }), - UINT64_MAX - ); + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor(" << buffer << ", " << tensor << ", " << value << ", " << offset << ", " << size << ")"); - // Set the host parameter buffer - uint32_t * params = (uint32_t *)webgpu_ctx->memset_params_host_buf.GetMappedRange(); + ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; - uint32_t val32 = (uint32_t)value * 0x01010101; - params[0] = (uint32_t)total_offset; - params[1] = (uint32_t)size; - params[2] = val32; - webgpu_ctx->memset_params_host_buf.Unmap(); - - // buffer to memset - wgpu::Buffer buf = buf_ctx->buffer; - - wgpu::BindGroupEntry entries[2]; - entries[0].binding = 0; // binding for the buffer to memset - entries[0].buffer = buf; - entries[0].offset = 0; - entries[0].size = buf.GetSize(); - entries[1].binding = 1; // binding for the parameters - entries[1].buffer = webgpu_ctx->memset_params_dev_buf; - entries[1].offset = 0; - entries[1].size = webgpu_ctx->memset_params_dev_buf.GetSize(); - - wgpu::BindGroupDescriptor bind_group_desc; - bind_group_desc.layout = webgpu_ctx->memset_pipeline.GetBindGroupLayout(0); - bind_group_desc.entryCount = 2; - bind_group_desc.entries = entries; - wgpu::BindGroup bind_group = device.CreateBindGroup(&bind_group_desc); - - wgpu::CommandEncoder encoder = device.CreateCommandEncoder(); - encoder.CopyBufferToBuffer( - webgpu_ctx->memset_params_host_buf, 0, - webgpu_ctx->memset_params_dev_buf, 0, - webgpu_ctx->memset_params_dev_buf.GetSize() - ); - wgpu::ComputePassEncoder pass = encoder.BeginComputePass(); - pass.SetPipeline(webgpu_ctx->memset_pipeline); - pass.SetBindGroup(0, bind_group); - size_t elems_per_wg = webgpu_ctx->limits.maxComputeWorkgroupSizeX * webgpu_ctx->memset_elems_per_thread; - pass.DispatchWorkgroups((((size + 3)/4) + elems_per_wg - 1) / elems_per_wg, 1, 1); - pass.End(); - wgpu::CommandBuffer commands = encoder.Finish(); - - // TODO, async, do we need to wait on this? 
- webgpu_ctx->queue.Submit(1, &commands); + ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, value, total_offset, size); } static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { @@ -243,33 +257,57 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, // TODO: we need a staging buffer for this, since WebGPU does not allow reading from storage buffers directly. static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); -} -// TODO -static bool ggml_backend_webgpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { - GGML_UNUSED(buffer); - GGML_UNUSED(src); - GGML_UNUSED(dst); + ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; + webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx; + wgpu::Device device = webgpu_ctx->device; - return true; + size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; + + if (webgpu_ctx->get_tensor_staging_buf == nullptr || + webgpu_ctx->get_tensor_staging_buf.GetSize() < size) { + // Create a new staging buffer if it doesn't exist or is too small + if (webgpu_ctx->get_tensor_staging_buf) { + webgpu_ctx->get_tensor_staging_buf.Destroy(); + } + ggml_webgpu_create_buffer(device, webgpu_ctx->get_tensor_staging_buf, size, + wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead); + } + + // Copy the data from the buffer to the staging buffer + wgpu::CommandEncoder encoder = device.CreateCommandEncoder(); + encoder.CopyBufferToBuffer(buf_ctx->buffer, total_offset, webgpu_ctx->get_tensor_staging_buf, 0, size); + wgpu::CommandBuffer commands = encoder.Finish(); + // Submit the command buffer to the queue + webgpu_ctx->queue.Submit(1, &commands); + + // Map the staging buffer to read the data + const void * mapped_range = ggml_backend_webgpu_map_buffer(webgpu_ctx, webgpu_ctx->get_tensor_staging_buf, + wgpu::MapMode::Read, 0, size); + + // Copy the data from the mapped range to the output buffer + std::memcpy(data, mapped_range, size); + webgpu_ctx->get_tensor_staging_buf.Unmap(); } // TODO static void ggml_backend_webgpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - GGML_UNUSED(buffer); - GGML_UNUSED(value); + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_clear(" << buffer << ", " << value << ")"); + + ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; + ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, value, 0, buf_ctx->buffer.GetSize()); } static ggml_backend_buffer_i ggml_backend_webgpu_buffer_interface = { /* .free_buffer = */ ggml_backend_webgpu_buffer_free_buffer, /* .get_base = */ ggml_backend_webgpu_buffer_get_base, - /* .init_tensor = */ NULL, // TODO: should we implement this? + /* .init_tensor = */ NULL, // TODO: optional, needed? 
/* .memset_tensor = */ ggml_backend_webgpu_buffer_memset_tensor, /* .set_tensor = */ ggml_backend_webgpu_buffer_set_tensor, /* .get_tensor = */ ggml_backend_webgpu_buffer_get_tensor, - /* .cpy_tensor = */ ggml_backend_webgpu_buffer_cpy_tensor, + /* .cpy_tensor = */ NULL, // TODO: optional, implement this /* .clear = */ ggml_backend_webgpu_buffer_clear, - /* .reset = */ NULL, + /* .reset = */ NULL, // TODO: optional, think it coordinates with .init_tensor }; /* End GGML Backend Buffer Interface */ From f8a53eee3b45dacd8a1de74845edde99b57dd929 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 10 Jun 2025 17:18:11 -0700 Subject: [PATCH 11/25] Finish rest of setup --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index a0e578751cc10..ef3349d515171 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -187,6 +187,7 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer b /** GGML Backend Interface */ static const char * ggml_backend_webgpu_name(ggml_backend_t backend) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_name()"); ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *)backend->context; return ctx->name.c_str(); } @@ -219,6 +220,7 @@ static ggml_backend_i ggml_backend_webgpu_i = { /* GGML Backend Buffer Interface */ static void ggml_backend_webgpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_free_buffer()"); ggml_backend_webgpu_buffer_context * ctx = static_cast(buffer->context); ctx->buffer.Destroy(); delete ctx; @@ -226,6 +228,7 @@ static void ggml_backend_webgpu_buffer_free_buffer(ggml_backend_buffer_t buffer) // Returns the "fake" base pointer. static void * ggml_backend_webgpu_buffer_get_base(ggml_backend_buffer_t buffer) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_get_base()"); GGML_UNUSED(buffer); return webgpu_ptr_base; } @@ -315,6 +318,7 @@ static ggml_backend_buffer_i ggml_backend_webgpu_buffer_interface = { /* GGML Backend Buffer Type Interface */ static const char * ggml_backend_webgpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_get_name()"); ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); return ctx->device_name.c_str(); } @@ -333,12 +337,14 @@ static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_b } static size_t ggml_backend_webgpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_get_alignment()"); ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); return ctx->webgpu_ctx->limits.minStorageBufferOffsetAlignment; } // maxBufferSize might be larger, but you can't bind more than maxStorageBufferBindingSize to a single binding. 
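// For scale: the WebGPU spec's baseline limits are 256 MiB for maxBufferSize
// and 128 MiB for maxStorageBufferBindingSize, though adapters may raise both.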
static size_t ggml_backend_webgpu_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_get_max_size()"); ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); return ctx->webgpu_ctx->limits.maxStorageBufferBindingSize; } @@ -348,16 +354,20 @@ static size_t ggml_backend_webgpu_buffer_type_get_max_size(ggml_backend_buffer_t /* GGML Backend Device Interface */ static const char * ggml_backend_webgpu_device_get_name(ggml_backend_dev_t dev) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_device_get_name()"); ggml_backend_webgpu_device_context * ctx = static_cast(dev->context); return ctx->device_name.c_str(); } static const char * ggml_backend_webgpu_device_get_description(ggml_backend_dev_t dev) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_device_get_description()"); ggml_backend_webgpu_device_context * ctx = static_cast(dev->context); return ctx->device_desc.c_str(); } static void ggml_backend_webgpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_device_get_memory()"); + ggml_backend_webgpu_device_context * ctx = static_cast(dev->context); // TODO: what do we actually want to return here? *free = ctx->webgpu_ctx->limits.maxBufferSize * WEBGPU_MAX_BUFFERS; @@ -365,6 +375,7 @@ static void ggml_backend_webgpu_device_get_memory(ggml_backend_dev_t dev, size_t } static enum ggml_backend_dev_type ggml_backend_webgpu_device_get_type(ggml_backend_dev_t dev) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_device_get_type()"); GGML_UNUSED(dev); return GGML_BACKEND_DEVICE_TYPE_GPU; } @@ -413,21 +424,22 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co WEBGPU_LOG_DEBUG("ggml_backend_webgpu_device_init()"); ggml_backend_webgpu_device_context * dev_ctx = static_cast(dev->context); + webgpu_context webgpu_ctx = dev_ctx->webgpu_ctx; // Initialize device - wgpu::DeviceDescriptor deviceDescriptor; - deviceDescriptor.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, + wgpu::DeviceDescriptor dev_desc; + dev_desc.requiredLimits = &webgpu_ctx->limits; + dev_desc.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, [](const wgpu::Device& device, wgpu::DeviceLostReason reason, wgpu::StringView message) { GGML_UNUSED(device); GGML_LOG_ERROR("ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast(reason), message.data); }); - deviceDescriptor.SetUncapturedErrorCallback( + dev_desc.SetUncapturedErrorCallback( [](const wgpu::Device& device, wgpu::ErrorType reason, wgpu::StringView message) { GGML_UNUSED(device); GGML_LOG_ERROR("ggml_webgpu: Device error! 
Reason: %d, Message: %s\n", static_cast(reason), message.data); }); - webgpu_context webgpu_ctx = dev_ctx->webgpu_ctx; - webgpu_ctx->instance.WaitAny(webgpu_ctx->adapter.RequestDevice(&deviceDescriptor, wgpu::CallbackMode::WaitAnyOnly, + webgpu_ctx->instance.WaitAny(webgpu_ctx->adapter.RequestDevice(&dev_desc, wgpu::CallbackMode::WaitAnyOnly, [webgpu_ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) { if (status != wgpu::RequestDeviceStatus::Success) { GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", message.data); @@ -515,11 +527,13 @@ static struct ggml_backend_device_i ggml_backend_webgpu_device_i = { /* GGML Backend Registration Interface */ static const char * ggml_backend_webgpu_reg_get_name(ggml_backend_reg_t reg) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_reg_get_name()"); ggml_backend_webgpu_reg_context * ctx = static_cast(reg->context); return ctx->name; } static size_t ggml_backend_webgpu_reg_get_device_count(ggml_backend_reg_t reg) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_reg_get_device_count()"); ggml_backend_webgpu_reg_context * ctx = static_cast(reg->context); return ctx->device_count; } From 39d956d1eaf1cc025467278983cfea518627f3a8 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 11 Jun 2025 11:14:57 -0700 Subject: [PATCH 12/25] Start work on compute graph --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 47 +++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index ef3349d515171..3e0dde29ad379 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -199,6 +199,39 @@ static void ggml_backend_webgpu_free(ggml_backend_t backend) { // TODO: cleanup } +// Returns true if node has enqueued work into the queue, false otherwise +static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ + if (ggml_is_empty(node)) { + return false; + } + + WEBGPU_LOG_DEBUG("ggml_webgpu_encode_node(" << node << ", " << ggml_op_name(node->op) << ")"); + + switch (node->op) { + // no-op + case GGML_OP_NONE: + // these next four ops modify the logical view of the tensor, but do not change its data + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + return false; + default: + return false; + } +} + +static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_graph_compute(" << cgraph->n_nodes << " nodes)"); + + ggml_backend_webgpu_context * backend_ctx = static_cast(backend->context); + webgpu_context ctx = backend_ctx->webgpu_ctx; + + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_webgpu_encode_node(ctx, cgraph->nodes[i]); + } +} + static ggml_backend_i ggml_backend_webgpu_i = { /* .get_name = */ ggml_backend_webgpu_name, /* .free = */ ggml_backend_webgpu_free, @@ -209,8 +242,8 @@ static ggml_backend_i ggml_backend_webgpu_i = { /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, // TODO - /* .graph_compute = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_webgpu_graph_compute, /* .event_record = */ NULL, /* .event_wait = */ NULL, }; @@ -228,7 +261,6 @@ static void ggml_backend_webgpu_buffer_free_buffer(ggml_backend_buffer_t buffer) // Returns the "fake" base pointer. 
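// ggml computes tensor addresses as base + offset, so with the fake base at
// 0x1000 a tensor 64 bytes into its buffer reports data == (void *)0x1040, and
// webgpu_tensor_offset() recovers the 64 by subtracting webgpu_ptr_base.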
static void * ggml_backend_webgpu_buffer_get_base(ggml_backend_buffer_t buffer) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_get_base()"); GGML_UNUSED(buffer); return webgpu_ptr_base; } @@ -354,13 +386,11 @@ static size_t ggml_backend_webgpu_buffer_type_get_max_size(ggml_backend_buffer_t /* GGML Backend Device Interface */ static const char * ggml_backend_webgpu_device_get_name(ggml_backend_dev_t dev) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_device_get_name()"); ggml_backend_webgpu_device_context * ctx = static_cast(dev->context); return ctx->device_name.c_str(); } static const char * ggml_backend_webgpu_device_get_description(ggml_backend_dev_t dev) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_device_get_description()"); ggml_backend_webgpu_device_context * ctx = static_cast(dev->context); return ctx->device_desc.c_str(); } @@ -499,6 +529,13 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const // what should we support first? switch (op->op) { + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + return true; + default: return false; } From d036f10016e6f8ade8f72bb02c2e8bc6793c3321 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Thu, 12 Jun 2025 14:33:58 -0700 Subject: [PATCH 13/25] Basic mat mul working --- ggml/include/ggml-webgpu.h | 3 + ggml/src/ggml-webgpu/ggml-webgpu.cpp | 113 ++++++++++++++++-- .../src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl | 25 ++++ 3 files changed, 132 insertions(+), 9 deletions(-) create mode 100644 ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl diff --git a/ggml/include/ggml-webgpu.h b/ggml/include/ggml-webgpu.h index cf6cb451648e9..65b8ed9bb6644 100644 --- a/ggml/include/ggml-webgpu.h +++ b/ggml/include/ggml-webgpu.h @@ -9,6 +9,9 @@ extern "C" { #define GGML_WEBGPU_NAME "WebGPU" +// Needed for examples in ggml +GGML_BACKEND_API ggml_backend_t ggml_backend_webgpu_init(void); + GGML_BACKEND_API ggml_backend_reg_t ggml_backend_webgpu_reg(void); #ifdef __cplusplus diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 3e0dde29ad379..d0e146385b545 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -41,10 +41,15 @@ struct webgpu_context_struct { wgpu::Queue queue; wgpu::Limits limits; - // memset pipeline and parameter buffers + // pipelines and parameter buffers + // TODO: reuse params buffers for different pipelines when possible wgpu::ComputePipeline memset_pipeline; wgpu::Buffer memset_params_dev_buf; wgpu::Buffer memset_params_host_buf; + wgpu::ComputePipeline mul_mat_pipeline; + wgpu::Buffer mul_mat_params_dev_buf; + wgpu::Buffer mul_mat_params_host_buf; + size_t memset_elems_per_thread; // Staging buffer for reading data from the GPU @@ -87,7 +92,7 @@ struct ggml_backend_webgpu_buffer_context { /* WebGPU object initializations */ -static void ggml_webgpu_create_pipeline(wgpu::Device &device, wgpu::ComputePipeline &pipeline, const char * shader_code, const std::vector &constants = {}) { +static void ggml_webgpu_create_pipeline(wgpu::Device &device, wgpu::ComputePipeline &pipeline, const char * shader_code, const char * label, const std::vector &constants = {}) { WEBGPU_LOG_DEBUG("ggml_webgpu_create_pipeline()"); wgpu::ShaderSourceWGSL shader_source; shader_source.code = shader_code; @@ -96,6 +101,7 @@ static void ggml_webgpu_create_pipeline(wgpu::Device &device, wgpu::ComputePipel wgpu::ShaderModule shader_module = device.CreateShaderModule(&shader_desc); 
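    // Note: CreateShaderModule() does not fail synchronously on bad WGSL;
    // compile and validation errors surface asynchronously, e.g. through the
    // uncaptured-error callback installed on the device.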
wgpu::ComputePipelineDescriptor pipeline_desc; + pipeline_desc.label = label; pipeline_desc.compute.module = shader_module; pipeline_desc.compute.entryPoint = "main"; // Entry point in the WGSL code pipeline_desc.layout = nullptr; // Guessing that nullptr means auto layout @@ -121,7 +127,7 @@ static void ggml_webgpu_create_buffer(wgpu::Device &device, wgpu::Buffer &buffer /** WebGPU Actions */ -static void * ggml_backend_webgpu_map_buffer(webgpu_context ctx, wgpu::Buffer buffer, wgpu::MapMode mode, size_t offset, size_t size) { +static void ggml_backend_webgpu_map_buffer(webgpu_context ctx, wgpu::Buffer buffer, wgpu::MapMode mode, size_t offset, size_t size) { ctx->instance.WaitAny(buffer.MapAsync( mode, offset, size, wgpu::CallbackMode::WaitAnyOnly, [](wgpu::MapAsyncStatus status, wgpu::StringView message) { @@ -131,15 +137,14 @@ static void * ggml_backend_webgpu_map_buffer(webgpu_context ctx, wgpu::Buffer bu }), UINT64_MAX ); - return buffer.GetMappedRange(); } static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer buf, uint8_t value, size_t offset, size_t size) { wgpu::Device device = ctx->device; // map the host parameters buffer - uint32_t * params = (uint32_t *)ggml_backend_webgpu_map_buffer(ctx, ctx->memset_params_host_buf, - wgpu::MapMode::Write, 0, ctx->memset_params_host_buf.GetSize()); + ggml_backend_webgpu_map_buffer(ctx, ctx->memset_params_host_buf, wgpu::MapMode::Write, 0, ctx->memset_params_host_buf.GetSize()); + uint32_t * params = (uint32_t *) ctx->memset_params_host_buf.GetMappedRange(); // This is a trick to set all bytes of a u32 to the same 1 byte value. uint32_t val32 = (uint32_t)value * 0x01010101; @@ -207,6 +212,7 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ WEBGPU_LOG_DEBUG("ggml_webgpu_encode_node(" << node << ", " << ggml_op_name(node->op) << ")"); + switch (node->op) { // no-op case GGML_OP_NONE: @@ -216,6 +222,76 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: return false; + + // basic matrix multiplication for now, 2d tensors only + case GGML_OP_MUL_MAT: { + const ggml_tensor * src0 = node->src[0]; + ggml_backend_webgpu_buffer_context * src0_ctx = (ggml_backend_webgpu_buffer_context *) src0->buffer->context; + size_t src0_offset = webgpu_tensor_offset(src0) + src0->view_offs; + const ggml_tensor * src1 = node->src[1]; + ggml_backend_webgpu_buffer_context * src1_ctx = (ggml_backend_webgpu_buffer_context *) src1->buffer->context; + size_t src1_offset = webgpu_tensor_offset(src1) + src1->view_offs; + ggml_backend_webgpu_buffer_context * dst_ctx = (ggml_backend_webgpu_buffer_context *) node->buffer->context; + + size_t dst_offset = webgpu_tensor_offset(node) + node->view_offs; + + wgpu::Device device = ctx->device; + + // map the host parameters buffer + ggml_backend_webgpu_map_buffer(ctx, ctx->mul_mat_params_host_buf, + wgpu::MapMode::Write, 0, ctx->mul_mat_params_host_buf.GetSize()); + uint32_t * params = (uint32_t *) ctx->mul_mat_params_host_buf.GetMappedRange(); + + params[0] = (uint32_t)node->ne[1]; // number of rows in result (M) + params[1] = (uint32_t)node->ne[0]; // number of columns in result (N) + params[2] = (uint32_t)src0->ne[0]; // number of columns in src0/src1 (K) + ctx->mul_mat_params_host_buf.Unmap(); + + wgpu::BindGroupEntry entries[4]; + entries[0].binding = 0; // binding for the buffer to memset + entries[0].buffer = src0_ctx->buffer; + entries[0].offset = src0_offset; + entries[0].size = ggml_nbytes(src0); 
+ + entries[1].binding = 1; // binding for the buffer to memset + entries[1].buffer = src1_ctx->buffer; + entries[1].offset = src1_offset; + entries[1].size = ggml_nbytes(src1); + + entries[2].binding = 2; // binding for the buffer to memset + entries[2].buffer = dst_ctx->buffer; + entries[2].offset = dst_offset; + entries[2].size = ggml_nbytes(node); + + entries[3].binding = 3; // binding for the parameters + entries[3].buffer = ctx->mul_mat_params_dev_buf; + entries[3].offset = 0; + entries[3].size = ctx->mul_mat_params_dev_buf.GetSize(); + + wgpu::BindGroupDescriptor bind_group_desc; + bind_group_desc.layout = ctx->mul_mat_pipeline.GetBindGroupLayout(0); + bind_group_desc.entryCount = 4; + bind_group_desc.entries = entries; + wgpu::BindGroup bind_group = device.CreateBindGroup(&bind_group_desc); + + wgpu::CommandEncoder encoder = device.CreateCommandEncoder(); + encoder.CopyBufferToBuffer( + ctx->mul_mat_params_host_buf, 0, + ctx->mul_mat_params_dev_buf, 0, + ctx->mul_mat_params_dev_buf.GetSize() + ); + wgpu::ComputePassEncoder pass = encoder.BeginComputePass(); + pass.SetPipeline(ctx->mul_mat_pipeline); + pass.SetBindGroup(0, bind_group); + pass.DispatchWorkgroups(node->ne[0] * node->ne[1]); + pass.End(); + wgpu::CommandBuffer commands = encoder.Finish(); + + // TODO, don't submit here, batch submissions + ctx->queue.Submit(1, &commands); + return true; + } + default: return false; } @@ -230,6 +306,8 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str for (int i = 0; i < cgraph->n_nodes; i++) { ggml_webgpu_encode_node(ctx, cgraph->nodes[i]); } + + return GGML_STATUS_SUCCESS; } static ggml_backend_i ggml_backend_webgpu_i = { @@ -317,8 +395,8 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, webgpu_ctx->queue.Submit(1, &commands); // Map the staging buffer to read the data - const void * mapped_range = ggml_backend_webgpu_map_buffer(webgpu_ctx, webgpu_ctx->get_tensor_staging_buf, - wgpu::MapMode::Read, 0, size); + ggml_backend_webgpu_map_buffer(webgpu_ctx, webgpu_ctx->get_tensor_staging_buf, wgpu::MapMode::Read, 0, size); + const void * mapped_range = webgpu_ctx->get_tensor_staging_buf.GetConstMappedRange(); // Copy the data from the mapped range to the output buffer std::memcpy(data, mapped_range, size); @@ -439,7 +517,7 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context webgpu_ctx) { constants[0].value = max_wg_size; constants[1].key = "elems_per_thread"; constants[1].value = webgpu_ctx->memset_elems_per_thread; - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->memset_pipeline, wgsl_memset, constants); + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->memset_pipeline, wgsl_memset, "memset", constants); ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->memset_params_dev_buf, 3 * sizeof(uint32_t), // 3 parameters: buffer size, offset, value wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst); @@ -447,6 +525,15 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context webgpu_ctx) { 3 * sizeof(uint32_t), wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc); } +static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context webgpu_ctx) { + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline, wgsl_mul_mat, "mul_mat"); + ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->mul_mat_params_dev_buf, + 3 * sizeof(uint32_t), // 3 parameters: M, N, K + wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst); + 
ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->mul_mat_params_host_buf, + 3 * sizeof(uint32_t), wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc); +} + // TODO: Does this need to be thread safe? Is it only called once? static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) { GGML_UNUSED(params); @@ -485,6 +572,7 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co webgpu_ctx->queue = webgpu_ctx->device.GetQueue(); ggml_webgpu_init_memset_pipeline(webgpu_ctx); + ggml_webgpu_init_mul_mat_pipeline(webgpu_ctx); static ggml_backend_webgpu_context backend_ctx; backend_ctx.name = GGML_WEBGPU_NAME + std::string(": ") + dev_ctx->device_name; @@ -534,6 +622,7 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const case GGML_OP_VIEW: case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: + case GGML_OP_MUL_MAT: return true; default: @@ -654,4 +743,10 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() { return &reg; } +ggml_backend_t ggml_backend_webgpu_init(void) { + ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_webgpu_reg(), 0); + + return ggml_backend_webgpu_device_init(dev, nullptr); +} + GGML_BACKEND_DL_IMPL(ggml_backend_webgpu_reg) \ No newline at end of file diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl new file mode 100644 index 0000000000000..fd041f67cc3d8 --- /dev/null +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl @@ -0,0 +1,25 @@ +struct MulMatParams { + m: u32, + n: u32, + k: u32 +}; + +@group(0) @binding(0) var<storage, read_write> src0: array<f32>; +@group(0) @binding(1) var<storage, read_write> src1: array<f32>; +@group(0) @binding(2) var<storage, read_write> dst: array<f32>; + +@group(0) @binding(3) var<uniform> params: MulMatParams; + +@compute @workgroup_size(64) +fn main(@builtin(global_invocation_id) global_id: vec3<u32>) { + if (global_id.x >= params.m * params.n) { + return; + } + let row = global_id.x / params.n; + let col = global_id.x % params.n; + var sum = 0.0; + for (var i: u32 = 0u; i < params.k; i = i + 1u) { + sum = sum + src0[col * params.k + i] * src1[row * params.k + i]; + } + dst[row * params.n + col] = sum; +} \ No newline at end of file From b8a220761c52250fdbfcaeb36ffcb4b76d5f1921 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 17 Jun 2025 13:31:07 -0700 Subject: [PATCH 14/25] Work on emscripten build --- ggml/src/ggml-webgpu/CMakeLists.txt | 13 +++++++++++-- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 1 - 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-webgpu/CMakeLists.txt b/ggml/src/ggml-webgpu/CMakeLists.txt index 1ce496b391afe..db53384e9b442 100644 --- a/ggml/src/ggml-webgpu/CMakeLists.txt +++ b/ggml/src/ggml-webgpu/CMakeLists.txt @@ -1,6 +1,5 @@ cmake_minimum_required(VERSION 3.13) -find_package(Dawn REQUIRED) find_package(Python3 REQUIRED) # Shader locations @@ -37,9 +36,19 @@ ggml_add_backend_library(ggml-webgpu add_dependencies(ggml-webgpu generate_shaders) +if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + set(EMDAWNWEBGPU_DIR "" CACHE PATH "Path to emdawnwebgpu_pkg") + + target_compile_options(ggml-webgpu PRIVATE "--use-port=${EMDAWNWEBGPU_DIR}/emdawnwebgpu.port.py") + target_link_options(ggml-webgpu PRIVATE "--use-port=${EMDAWNWEBGPU_DIR}/emdawnwebgpu.port.py") +else() + find_package(Dawn REQUIRED) + set(DawnWebGPU_TARGET dawn::webgpu_dawn) +endif() + if (GGML_WEBGPU_DEBUG) target_compile_definitions(ggml-webgpu PRIVATE GGML_WEBGPU_DEBUG=1) endif() target_include_directories(ggml-webgpu PRIVATE ${SHADER_OUTPUT_DIR})
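+# Note: on Emscripten builds ${DawnWebGPU_TARGET} is intentionally left unset; the emdawnwebgpu port pulled in via --use-port supplies the WebGPU bindings instead.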
-target_link_libraries(ggml-webgpu PRIVATE dawn::webgpu_dawn) +target_link_libraries(ggml-webgpu PRIVATE ${DawnWebGPU_TARGET}) \ No newline at end of file diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index d0e146385b545..8ddbb3fdf8dec 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -1,7 +1,6 @@ #include "ggml-webgpu.h" #include -#include #include "ggml-impl.h" #include "ggml-backend-impl.h" From c09bfc50a5b3b288c13062e0a3922201fae98225 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 17 Jun 2025 14:19:38 -0700 Subject: [PATCH 15/25] Basic WebGPU backend instructions --- docs/build.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docs/build.md b/docs/build.md index 680b0d8398741..5f735de9ed966 100644 --- a/docs/build.md +++ b/docs/build.md @@ -557,6 +557,23 @@ ninja To read documentation for how to build on Android, [click here](./android.md) +## WebGPU [In Progress] + +The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. + +In the llama.cpp directory, build with CMake: + +``` +cmake -B build -DGGML_WEBGPU=ON +cmake --build build --config Release +``` + +### Browser Support + +WebGPU allows cross-platform access to the GPU from supported browsers. We utilize [Emscripten](https://emscripten.org/) to compile ggml's WebGPU backend to WebAssembly. Emscripten does not officially support WebGPU bindings yet, but Dawn currently maintains its own WebGPU bindings called emdawnwebgpu. + +Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/src/emdawnwebgpu/) to download or build the emdawnwebgpu package (Note that it might be safer to build them locally, so that they stay in sync with the version of Dawn you have installed above). When building using CMake, the path to the emdawnwebgpu port file needs to be set with the flag `EMDAWNWEBGPU_DIR`. + ## Notes about GPU-accelerated backends The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`. 
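As a quick sanity check of a native build, the backend can also be initialized directly through the public header. A minimal sketch (it assumes nothing beyond the `ggml_backend_webgpu_init()` entry point added in this series and the standard ggml-backend helpers `ggml_backend_name()`/`ggml_backend_free()`):

```
#include <cstdio>

#include "ggml-backend.h"
#include "ggml-webgpu.h"

int main() {
    // Initializes the backend through the registration path (instance, adapter, device).
    ggml_backend_t backend = ggml_backend_webgpu_init();
    if (backend == nullptr) {
        fprintf(stderr, "failed to initialize the WebGPU backend\n");
        return 1;
    }
    printf("initialized backend: %s\n", ggml_backend_name(backend));
    ggml_backend_free(backend);
    return 0;
}
```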
From daa58e21fb56683d83fdb7134d75fb84586c9691 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 17 Jun 2025 15:24:04 -0700 Subject: [PATCH 16/25] Use EMSCRIPTEN flag --- ggml/src/ggml-webgpu/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-webgpu/CMakeLists.txt b/ggml/src/ggml-webgpu/CMakeLists.txt index db53384e9b442..0d67f11ba5520 100644 --- a/ggml/src/ggml-webgpu/CMakeLists.txt +++ b/ggml/src/ggml-webgpu/CMakeLists.txt @@ -36,7 +36,7 @@ ggml_add_backend_library(ggml-webgpu add_dependencies(ggml-webgpu generate_shaders) -if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") +if(EMSCRIPTEN) set(EMDAWNWEBGPU_DIR "" CACHE PATH "Path to emdawnwebgpu_pkg") target_compile_options(ggml-webgpu PRIVATE "--use-port=${EMDAWNWEBGPU_DIR}/emdawnwebgpu.port.py") From 1c396a263fd41a934d1d541c736591be46759a23 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Mon, 23 Jun 2025 16:42:02 -0700 Subject: [PATCH 17/25] Work on passing ci, implement 4d tensor multiplication --- README.md | 1 + ci/run.sh | 7 ++ ggml/src/ggml-webgpu/ggml-webgpu.cpp | 82 +++++++++++-------- ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl | 2 +- .../src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl | 47 +++++++++-- 5 files changed, 96 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 90c7364dfcba0..8aa254ef6b5fc 100644 --- a/README.md +++ b/README.md @@ -269,6 +269,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo | [Vulkan](docs/build.md#vulkan) | GPU | | [CANN](docs/build.md#cann) | Ascend NPU | | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU | +| [WebGPU [In Progress]](docs/build.md#webgpu) | All | | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All | ## Obtaining and quantizing models diff --git a/ci/run.sh b/ci/run.sh index 94005570511b6..8117dc152c2a0 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -16,6 +16,9 @@ # # with VULKAN support # GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # +# # with WebGPU support +# GG_BUILD_WEBGPU=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt +# # # with MUSA support # GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # @@ -81,6 +84,10 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1" fi +if [ ! -z ${GG_BUILD_WEBGPU} ]; then + CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1" +fi + if [ ! -z ${GG_BUILD_MUSA} ]; then # Use qy1 by default (MTT S80) MUSA_ARCH=${MUSA_ARCH:-21} diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 8ddbb3fdf8dec..c1e08fd0e137a 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -16,9 +16,16 @@ #define WEBGPU_LOG_DEBUG(msg) ((void) 0) #endif // GGML_WEBGPU_DEBUG +/* Constants */ + // TODO: find a better way to get the memory available #define WEBGPU_MAX_BUFFERS 32 +#define WEBGPU_MUL_MAT_WG_SIZE 64 +#define WEBGPU_MUL_MAT_PARAMS_SIZE (7 * sizeof(uint32_t)) // M, N, K, batch sizes, broadcasts + +/* End Constants */ + // This is a "fake" base pointer, since WebGPU buffers do not have pointers to their locations.
static void * const webgpu_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT @@ -138,18 +145,16 @@ static void ggml_backend_webgpu_map_buffer(webgpu_context ctx, wgpu::Buffer buff ); } -static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer buf, uint8_t value, size_t offset, size_t size) { +static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer buf, uint32_t value, size_t offset, size_t size) { wgpu::Device device = ctx->device; // map the host parameters buffer ggml_backend_webgpu_map_buffer(ctx, ctx->memset_params_host_buf, wgpu::MapMode::Write, 0, ctx->memset_params_host_buf.GetSize()); uint32_t * params = (uint32_t *) ctx->memset_params_host_buf.GetMappedRange(); - // This is a trick to set all bytes of a u32 to the same 1 byte value. - uint32_t val32 = (uint32_t)value * 0x01010101; params[0] = (uint32_t)offset; params[1] = (uint32_t)size; - params[2] = val32; + params[2] = value; ctx->memset_params_host_buf.Unmap(); wgpu::BindGroupEntry entries[2]; @@ -191,7 +196,6 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer b /** GGML Backend Interface */ static const char * ggml_backend_webgpu_name(ggml_backend_t backend) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_name()"); ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *)backend->context; return ctx->name.c_str(); } @@ -201,6 +205,7 @@ static void ggml_backend_webgpu_free(ggml_backend_t backend) { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_free(" << ctx->name << ")"); // TODO: cleanup + GGML_UNUSED(ctx); } // Returns true if node has enqueued work into the queue, false otherwise @@ -244,6 +249,11 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ params[0] = (uint32_t)node->ne[1]; // number of rows in result (M) params[1] = (uint32_t)node->ne[0]; // number of columns in result (N) params[2] = (uint32_t)src0->ne[0]; // number of columns in src0/src1 (K) + params[3] = (uint32_t)src0->ne[2]; // batch size in dimension 2 + params[4] = (uint32_t)src0->ne[3]; // batch size in dimension 3 + params[5] = (uint32_t)(src1->ne[2]/src0->ne[2]); // broadcast in dimension 2 + params[6] = (uint32_t)(src1->ne[3]/src0->ne[3]); // broadcast in dimension 3 + ctx->mul_mat_params_host_buf.Unmap(); wgpu::BindGroupEntry entries[4]; @@ -282,7 +292,7 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ wgpu::ComputePassEncoder pass = encoder.BeginComputePass(); pass.SetPipeline(ctx->mul_mat_pipeline); pass.SetBindGroup(0, bind_group); - pass.DispatchWorkgroups(node->ne[0] * node->ne[1]); + pass.DispatchWorkgroups((node->ne[0] * node->ne[1] * node->ne[2] * node->ne[3] + WEBGPU_MUL_MAT_WG_SIZE - 1) / WEBGPU_MUL_MAT_WG_SIZE); pass.End(); wgpu::CommandBuffer commands = encoder.Finish(); @@ -352,7 +362,9 @@ static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffe ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; - ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, value, total_offset, size); + // This is a trick to set all bytes of a u32 to the same 1 byte value. 
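+ // e.g. value = 0xAB gives val32 = 0xABABABAB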
+ uint32_t val32 = (uint32_t)value * 0x01010101; + ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, val32, total_offset, size); } static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { @@ -363,10 +375,21 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; // TODO: wait on this? - webgpu_ctx->queue.WriteBuffer(buf_ctx->buffer, total_offset, data, size); + webgpu_ctx->queue.WriteBuffer(buf_ctx->buffer, total_offset, data, (size/4)*4); + + if (size % 4 != 0) { + // If size is not a multiple of 4, we need to memset the remaining bytes + size_t remaining_size = size % 4; + // pack the remaining bytes into a uint32_t + uint32_t val32 = 0; + for (size_t i = 0; i < remaining_size; i++) { + ((uint8_t *)&val32)[i] = ((const uint8_t *)data)[size - remaining_size + i]; + } + // memset the remaining bytes + ggml_backend_webgpu_buffer_memset(webgpu_ctx, buf_ctx->buffer, val32, total_offset + (size - remaining_size), remaining_size); + } } -// TODO: we need a staging buffer for this, since WebGPU does not allow reading from storage buffers directly. static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")"); @@ -376,33 +399,39 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; + size_t final_size = size; + if (size % 4 != 0) { + // If size is not a multiple of 4, we need to round it up to the next multiple of 4 + final_size = size + (4 - (size % 4)); + } + if (webgpu_ctx->get_tensor_staging_buf == nullptr || - webgpu_ctx->get_tensor_staging_buf.GetSize() < size) { + webgpu_ctx->get_tensor_staging_buf.GetSize() < final_size) { // Create a new staging buffer if it doesn't exist or is too small if (webgpu_ctx->get_tensor_staging_buf) { webgpu_ctx->get_tensor_staging_buf.Destroy(); } - ggml_webgpu_create_buffer(device, webgpu_ctx->get_tensor_staging_buf, size, + ggml_webgpu_create_buffer(device, webgpu_ctx->get_tensor_staging_buf, final_size, wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead); } // Copy the data from the buffer to the staging buffer wgpu::CommandEncoder encoder = device.CreateCommandEncoder(); - encoder.CopyBufferToBuffer(buf_ctx->buffer, total_offset, webgpu_ctx->get_tensor_staging_buf, 0, size); + encoder.CopyBufferToBuffer(buf_ctx->buffer, total_offset, webgpu_ctx->get_tensor_staging_buf, 0, final_size); wgpu::CommandBuffer commands = encoder.Finish(); // Submit the command buffer to the queue webgpu_ctx->queue.Submit(1, &commands); // Map the staging buffer to read the data - ggml_backend_webgpu_map_buffer(webgpu_ctx, webgpu_ctx->get_tensor_staging_buf, wgpu::MapMode::Read, 0, size); - const void * mapped_range = webgpu_ctx->get_tensor_staging_buf.GetConstMappedRange(); + ggml_backend_webgpu_map_buffer(webgpu_ctx, webgpu_ctx->get_tensor_staging_buf, wgpu::MapMode::Read, 0, final_size); + // Must specify size here since the staging buffer might be larger than the tensor size + const void * mapped_range = webgpu_ctx->get_tensor_staging_buf.GetConstMappedRange(0, final_size); // Copy the data from the mapped range 
to the output buffer std::memcpy(data, mapped_range, size); webgpu_ctx->get_tensor_staging_buf.Unmap(); } -// TODO static void ggml_backend_webgpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_clear(" << buffer << ", " << value << ")"); @@ -427,7 +456,6 @@ static ggml_backend_buffer_i ggml_backend_webgpu_buffer_interface = { /* GGML Backend Buffer Type Interface */ static const char * ggml_backend_webgpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_get_name()"); ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); return ctx->device_name.c_str(); } @@ -446,14 +474,12 @@ static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_b } static size_t ggml_backend_webgpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_get_alignment()"); ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); return ctx->webgpu_ctx->limits.minStorageBufferOffsetAlignment; } // maxBufferSize might be larger, but you can't bind more than maxStorageBufferBindingSize to a single binding. static size_t ggml_backend_webgpu_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_get_max_size()"); ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); return ctx->webgpu_ctx->limits.maxStorageBufferBindingSize; } @@ -473,8 +499,6 @@ static const char * ggml_backend_webgpu_device_get_description(ggml_backend_dev_ } static void ggml_backend_webgpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_device_get_memory()"); - ggml_backend_webgpu_device_context * ctx = static_cast(dev->context); // TODO: what do we actually want to return here? *free = ctx->webgpu_ctx->limits.maxBufferSize * WEBGPU_MAX_BUFFERS; @@ -482,7 +506,6 @@ static void ggml_backend_webgpu_device_get_memory(ggml_backend_dev_t dev, size_t } static enum ggml_backend_dev_type ggml_backend_webgpu_device_get_type(ggml_backend_dev_t dev) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_device_get_type()"); GGML_UNUSED(dev); return GGML_BACKEND_DEVICE_TYPE_GPU; } @@ -526,11 +549,10 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context webgpu_ctx) { static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context webgpu_ctx) { ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline, wgsl_mul_mat, "mul_mat"); - ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->mul_mat_params_dev_buf, - 3 * sizeof(uint32_t), // 3 parameters: M, N, K + ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->mul_mat_params_dev_buf, WEBGPU_MUL_MAT_PARAMS_SIZE, wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst); - ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->mul_mat_params_host_buf, - 3 * sizeof(uint32_t), wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc); + ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->mul_mat_params_host_buf,WEBGPU_MUL_MAT_PARAMS_SIZE, + wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc); } // TODO: Does this need to be thread safe? Is it only called once? @@ -617,13 +639,9 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const // what should we support first? 
switch (op->op) { case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - case GGML_OP_MUL_MAT: return true; - + case GGML_OP_MUL_MAT: + return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; default: return false; } @@ -652,13 +670,11 @@ /* GGML Backend Registration Interface */ static const char * ggml_backend_webgpu_reg_get_name(ggml_backend_reg_t reg) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_reg_get_name()"); ggml_backend_webgpu_reg_context * ctx = static_cast<ggml_backend_webgpu_reg_context *>(reg->context); return ctx->name; } static size_t ggml_backend_webgpu_reg_get_device_count(ggml_backend_reg_t reg) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_reg_get_device_count()"); ggml_backend_webgpu_reg_context * ctx = static_cast<ggml_backend_webgpu_reg_context *>(reg->context); return ctx->device_count; } diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl index 32c0b2b67d5a2..60142c4fa0839 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl @@ -5,7 +5,7 @@ var<storage, read_write> output_buffer: array<u32>; struct Params { offset: u32, // in bytes size: u32, // in bytes - value: u32, // four identical values + value: u32, // 4 8-bit values, which are either repeating (memset_tensor) or may be separate (cleaning up unaligned set_tensor operations) }; @group(0) @binding(1) diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl index fd041f67cc3d8..31973278ee18f 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl @@ -1,25 +1,54 @@ struct MulMatParams { m: u32, n: u32, - k: u32 + k: u32, + bs02: u32, + bs03: u32, + broadcast2: u32, + broadcast3: u32 }; -@group(0) @binding(0) var<storage, read_write> src0: array<f32>; -@group(0) @binding(1) var<storage, read_write> src1: array<f32>; -@group(0) @binding(2) var<storage, read_write> dst: array<f32>; +@group(0) @binding(0) var<storage, read_write> src0: array<f32>; // N rows, K columns +@group(0) @binding(1) var<storage, read_write> src1: array<f32>; // M rows, K columns +@group(0) @binding(2) var<storage, read_write> dst: array<f32>; // M rows, N columns @group(0) @binding(3) var<uniform> params: MulMatParams; @compute @workgroup_size(64) fn main(@builtin(global_invocation_id) global_id: vec3<u32>) { - if (global_id.x >= params.m * params.n) { + let total = params.m * params.n * params.bs02 * params.broadcast2 * params.bs03 * params.broadcast3; + if (global_id.x >= total) { return; } - let row = global_id.x / params.n; - let col = global_id.x % params.n; + + let src02_stride = params.n * params.k; + let src03_stride = src02_stride * params.bs02; + + let src12_stride = params.m * params.k; + let src13_stride = src12_stride * params.bs02 * params.broadcast2; + + let dst2_stride = params.m * params.n; + let dst3_stride = dst2_stride * params.bs02 * params.broadcast2; + + let dst3_idx = global_id.x / dst3_stride; + let src03_idx = dst3_idx / params.broadcast3; // src0 may be broadcast along the third dimension + let src13_idx = dst3_idx; // src1 is not broadcast + let dst3_rem = global_id.x % dst3_stride; + + let dst2_idx = dst3_rem / dst2_stride; + let src02_idx = dst2_idx / params.broadcast2; // src0 may also be broadcast along the second dimension + let src12_idx = dst2_idx; + + let dst2_rem = dst3_rem % dst2_stride; + + let row = dst2_rem / params.n; // output row + let col = dst2_rem % params.n; // output column + var sum = 0.0; for (var i: u32 = 0u; i < params.k; i = i + 1u) { - sum = sum + src0[col * params.k + i] * src1[row * params.k + i]; + let src0_idx = src03_idx * src03_stride + src02_idx * src02_stride + col * params.k + i; + let src1_idx = src13_idx * src13_stride + src12_idx * src12_stride + row * params.k + i; + sum = sum + src0[src0_idx] * src1[src1_idx]; } - dst[row * params.n + col] = sum; + dst[dst3_idx * dst3_stride + dst2_idx * dst2_stride + row * params.n + col] = sum; } \ No newline at end of file From ecb945ebbd36bf4a2e8fd5db59acfc5419a10f72 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 24 Jun 2025 17:16:37 -0700 Subject: [PATCH 18/25] Pass thread safety test --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 236 +++++++++++++----- ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl | 32 +++ ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl | 7 +- 3 files changed, 212 insertions(+), 63 deletions(-) create mode 100644 ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index c1e08fd0e137a..d8aa0206bbab8 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -23,6 +23,8 @@ #define WEBGPU_MUL_MAT_WG_SIZE 64 #define WEBGPU_MUL_MAT_PARAMS_SIZE (7 * sizeof(uint32_t)) // M, N, K, batch sizes, broadcasts +#define WEBGPU_CPY_PARAMS_SIZE (3 * sizeof(uint32_t)) // number of elements to copy, alignments +#define WEBGPU_STORAGE_BUF_BINDING_MULT 4 // a storage buffer binding size must be a multiple of 4 /* End Constants */ @@ -46,6 +48,10 @@ struct webgpu_context_struct { wgpu::Device device; wgpu::Queue queue; wgpu::Limits limits; + wgpu::SupportedFeatures features; + + std::mutex mutex; + bool device_initialized = false; // pipelines and parameter buffers // TODO: reuse params buffers for different pipelines when possible @@ -55,8 +61,11 @@ struct webgpu_context_struct { wgpu::ComputePipeline mul_mat_pipeline; wgpu::Buffer mul_mat_params_dev_buf; wgpu::Buffer mul_mat_params_host_buf; + wgpu::ComputePipeline cpy_pipeline; + wgpu::Buffer cpy_params_dev_buf; + wgpu::Buffer cpy_params_host_buf; - size_t memset_elems_per_thread; + size_t memset_bytes_per_thread; // Staging buffer for reading data from the GPU wgpu::Buffer get_tensor_staging_buf; @@ -118,12 +127,13 @@ static void ggml_webgpu_create_pipeline(wgpu::Device &device, wgpu::ComputePipel pipeline = device.CreateComputePipeline(&pipeline_desc); } -static void ggml_webgpu_create_buffer(wgpu::Device &device, wgpu::Buffer &buffer, size_t size, wgpu::BufferUsage usage) { +static void ggml_webgpu_create_buffer(wgpu::Device &device, wgpu::Buffer &buffer, size_t size, wgpu::BufferUsage usage, const char* label) { WEBGPU_LOG_DEBUG("ggml_webgpu_create_buffer()"); wgpu::BufferDescriptor buffer_desc; buffer_desc.size = size; buffer_desc.usage = usage; + buffer_desc.label = label; buffer_desc.mappedAtCreation = false; // TODO: error handling buffer = device.CreateBuffer(&buffer_desc); @@ -146,6 +156,7 @@ static void ggml_backend_webgpu_map_buffer(webgpu_context ctx, wgpu::Buffer buff } static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer buf, uint32_t value, size_t offset, size_t size) { + std::lock_guard<std::mutex> lock(ctx->mutex); wgpu::Device device = ctx->device; // map the host parameters buffer @@ -170,6 +181,7 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer b wgpu::BindGroupDescriptor bind_group_desc; bind_group_desc.layout = ctx->memset_pipeline.GetBindGroupLayout(0); bind_group_desc.entryCount = 2; + bind_group_desc.label = "ggml_memset"; bind_group_desc.entries = entries; wgpu::BindGroup bind_group
= device.CreateBindGroup(&bind_group_desc); @@ -182,8 +194,8 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer b wgpu::ComputePassEncoder pass = encoder.BeginComputePass(); pass.SetPipeline(ctx->memset_pipeline); pass.SetBindGroup(0, bind_group); - size_t elems_per_wg = ctx->limits.maxComputeWorkgroupSizeX * ctx->memset_elems_per_thread; - pass.DispatchWorkgroups((((size + 3)/4) + elems_per_wg - 1) / elems_per_wg, 1, 1); + size_t bytes_per_wg = ctx->limits.maxComputeWorkgroupSizeX * ctx->memset_bytes_per_thread; + pass.DispatchWorkgroups(((size + 3) + bytes_per_wg - 1) / bytes_per_wg, 1, 1); pass.End(); wgpu::CommandBuffer commands = encoder.Finish(); @@ -191,6 +203,18 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer b ctx->queue.Submit(1, &commands); } +static void ggml_backend_webgpu_wait_on_submission(webgpu_context ctx) { + // Wait for the queue to finish processing all commands + ctx->instance.WaitAny(ctx->queue.OnSubmittedWorkDone(wgpu::CallbackMode::WaitAnyOnly, + [](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) { + if (status != wgpu::QueueWorkDoneStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to wait on queue: %s\n", message.data); + } + }), + UINT64_MAX + ); +} + /** End WebGPU Actions */ /** GGML Backend Interface */ @@ -218,17 +242,82 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ switch (node->op) { - // no-op + // no-ops case GGML_OP_NONE: - // these next four ops modify the logical view of the tensor, but do not change its data - case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: return false; + + case GGML_OP_CPY: { + std::lock_guard lock(ctx->mutex); + const ggml_tensor * src = node->src[0]; + ggml_backend_webgpu_buffer_context * src_ctx = (ggml_backend_webgpu_buffer_context *) src->buffer->context; + size_t src_offset = webgpu_tensor_offset(src) + src->view_offs; + // assumes power of 2 offset alignment + size_t src_misalignment = src_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); + // align to minimum offset alignment + src_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1); + ggml_backend_webgpu_buffer_context * dst_ctx = (ggml_backend_webgpu_buffer_context *) node->buffer->context; + size_t dst_offset = webgpu_tensor_offset(node) + node->view_offs; + size_t dst_misalignment = dst_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); + dst_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1); + + wgpu::Device device = ctx->device; + ggml_backend_webgpu_map_buffer(ctx, ctx->cpy_params_host_buf, + wgpu::MapMode::Write, 0, ctx->cpy_params_host_buf.GetSize()); + uint32_t * params = (uint32_t *) ctx->cpy_params_host_buf.GetMappedRange(); + uint32_t ne = (uint32_t)ggml_nelements(node); // number of elements to copy + params[0] = ne; + params[1] = src_misalignment; + params[2] = dst_misalignment; + ctx->cpy_params_host_buf.Unmap(); + + wgpu::BindGroupEntry entries[3]; + entries[0].binding = 0; + entries[0].buffer = src_ctx->buffer; + entries[0].offset = src_offset; + entries[0].size = (ggml_nbytes(src) + src_misalignment + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1); + + entries[1].binding = 1; + entries[1].buffer = dst_ctx->buffer; + entries[1].offset = dst_offset; + entries[1].size = (ggml_nbytes(node) + dst_misalignment + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1); + + entries[2].binding = 2; + entries[2].buffer 
= ctx->cpy_params_dev_buf; + entries[2].offset = 0; + entries[2].size = ctx->cpy_params_dev_buf.GetSize(); + + wgpu::BindGroupDescriptor bind_group_desc; + bind_group_desc.layout = ctx->cpy_pipeline.GetBindGroupLayout(0); + bind_group_desc.label = "ggml_op_cpy"; + bind_group_desc.entryCount = 3; + bind_group_desc.entries = entries; + wgpu::BindGroup bind_group = device.CreateBindGroup(&bind_group_desc); + + wgpu::CommandEncoder encoder = device.CreateCommandEncoder(); + encoder.CopyBufferToBuffer( + ctx->cpy_params_host_buf, 0, + ctx->cpy_params_dev_buf, 0, + ctx->cpy_params_dev_buf.GetSize() + ); + wgpu::ComputePassEncoder pass = encoder.BeginComputePass(); + pass.SetPipeline(ctx->cpy_pipeline); + pass.SetBindGroup(0, bind_group); + size_t max_wg_size = ctx->limits.maxComputeWorkgroupSizeX; + pass.DispatchWorkgroups((ne + max_wg_size - 1) / max_wg_size); + pass.End(); + wgpu::CommandBuffer commands = encoder.Finish(); + + // TODO, don't submit here, batch submissions + ctx->queue.Submit(1, &commands); + // TODO, don't wait on submission here + ggml_backend_webgpu_wait_on_submission(ctx); + return true; + } - // basic matrix multiplication for now, 2d tensors only - case GGML_OP_MUL_MAT: { + case GGML_OP_MUL_MAT: + { const ggml_tensor * src0 = node->src[0]; ggml_backend_webgpu_buffer_context * src0_ctx = (ggml_backend_webgpu_buffer_context *) src0->buffer->context; size_t src0_offset = webgpu_tensor_offset(src0) + src0->view_offs; @@ -257,22 +346,22 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ ctx->mul_mat_params_host_buf.Unmap(); wgpu::BindGroupEntry entries[4]; - entries[0].binding = 0; // binding for the buffer to memset + entries[0].binding = 0; entries[0].buffer = src0_ctx->buffer; entries[0].offset = src0_offset; entries[0].size = ggml_nbytes(src0); - entries[1].binding = 1; // binding for the buffer to memset + entries[1].binding = 1; entries[1].buffer = src1_ctx->buffer; entries[1].offset = src1_offset; entries[1].size = ggml_nbytes(src1); - entries[2].binding = 2; // binding for the buffer to memset + entries[2].binding = 2; entries[2].buffer = dst_ctx->buffer; entries[2].offset = dst_offset; entries[2].size = ggml_nbytes(node); - entries[3].binding = 3; // binding for the parameters + entries[3].binding = 3; entries[3].buffer = ctx->mul_mat_params_dev_buf; entries[3].offset = 0; entries[3].size = ctx->mul_mat_params_dev_buf.GetSize(); @@ -280,6 +369,7 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ wgpu::BindGroupDescriptor bind_group_desc; bind_group_desc.layout = ctx->mul_mat_pipeline.GetBindGroupLayout(0); bind_group_desc.entryCount = 4; + bind_group_desc.label = "ggml_op_mul_mat"; bind_group_desc.entries = entries; wgpu::BindGroup bind_group = device.CreateBindGroup(&bind_group_desc); @@ -298,6 +388,8 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ // TODO, don't submit here, batch submissions ctx->queue.Submit(1, &commands); + // TODO, don't wait on submission here + ggml_backend_webgpu_wait_on_submission(ctx); return true; } @@ -343,7 +435,6 @@ static void ggml_backend_webgpu_buffer_free_buffer(ggml_backend_buffer_t buffer) WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_free_buffer()"); ggml_backend_webgpu_buffer_context * ctx = static_cast(buffer->context); ctx->buffer.Destroy(); - delete ctx; } // Returns the "fake" base pointer. 
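The GGML_OP_CPY encoding above splits each tensor offset into an aligned base plus a misalignment so that storage-buffer bindings start on a legal boundary; this works because the alignment limit is a power of two. A standalone sketch of the arithmetic (256 is WebGPU's default minStorageBufferOffsetAlignment; the names and values here are illustrative):

```
#include <cassert>
#include <cstddef>
#include <cstdio>

int main() {
    const size_t alignment = 256;  // minStorageBufferOffsetAlignment; always a power of two
    const size_t offset    = 1000; // arbitrary tensor offset in bytes

    const size_t misalignment = offset & (alignment - 1);  // bytes past the aligned boundary
    const size_t aligned      = offset & ~(alignment - 1); // offset rounded down to the boundary

    assert(aligned % alignment == 0);
    assert(aligned + misalignment == offset);
    printf("offset=%zu -> base=%zu + misalignment=%zu\n", offset, aligned, misalignment);
    return 0;
}
```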
@@ -405,6 +496,8 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, final_size = size + (4 - (size % 4)); } + std::lock_guard lock(webgpu_ctx->mutex); + if (webgpu_ctx->get_tensor_staging_buf == nullptr || webgpu_ctx->get_tensor_staging_buf.GetSize() < final_size) { // Create a new staging buffer if it doesn't exist or is too small @@ -412,7 +505,7 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, webgpu_ctx->get_tensor_staging_buf.Destroy(); } ggml_webgpu_create_buffer(device, webgpu_ctx->get_tensor_staging_buf, final_size, - wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead); + wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "get_tensor_staging_buf"); } // Copy the data from the buffer to the staging buffer @@ -433,10 +526,10 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, } static void ggml_backend_webgpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_clear(" << buffer << ", " << value << ")"); + WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_clear(" << buffer << ", " << (uint32_t) value << ")"); ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context; - ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, value, 0, buf_ctx->buffer.GetSize()); + ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, value, 0, buffer->size); } static ggml_backend_buffer_i ggml_backend_webgpu_buffer_interface = { @@ -466,7 +559,7 @@ static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_b wgpu::Buffer buf; ggml_webgpu_create_buffer(ctx->webgpu_ctx->device, buf, size, - wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst); + wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst, "allocated_buffer"); ggml_backend_webgpu_buffer_context * buf_ctx = new ggml_backend_webgpu_buffer_context(ctx->webgpu_ctx, buf); @@ -532,27 +625,39 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context webgpu_ctx) { // we use the maximum workgroup size for the memset pipeline size_t max_wg_size = webgpu_ctx->limits.maxComputeWorkgroupSizeX; size_t max_threads = max_wg_size * webgpu_ctx->limits.maxComputeWorkgroupsPerDimension; - // Size the elems_per_thread so that the largest buffer size can be handled - webgpu_ctx->memset_elems_per_thread = (webgpu_ctx->limits.maxStorageBufferBindingSize / sizeof(uint32_t) + max_threads - 1) / max_threads; + // Size the bytes_per_thread so that the largest buffer size can be handled + webgpu_ctx->memset_bytes_per_thread = (webgpu_ctx->limits.maxStorageBufferBindingSize + max_threads - 1) / max_threads; std::vector constants(2); constants[0].key = "wg_size"; constants[0].value = max_wg_size; - constants[1].key = "elems_per_thread"; - constants[1].value = webgpu_ctx->memset_elems_per_thread; + constants[1].key = "bytes_per_thread"; + constants[1].value = webgpu_ctx->memset_bytes_per_thread; ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->memset_pipeline, wgsl_memset, "memset", constants); ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->memset_params_dev_buf, 3 * sizeof(uint32_t), // 3 parameters: buffer size, offset, value - wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst); + wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst, "memset_params_dev_buf"); ggml_webgpu_create_buffer(webgpu_ctx->device, 
webgpu_ctx->memset_params_host_buf, - 3 * sizeof(uint32_t), wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc); + 3 * sizeof(uint32_t), wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc, "memset_params_host_buf"); } static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context webgpu_ctx) { ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline, wgsl_mul_mat, "mul_mat"); ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->mul_mat_params_dev_buf, WEBGPU_MUL_MAT_PARAMS_SIZE, - wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst); - ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->mul_mat_params_host_buf,WEBGPU_MUL_MAT_PARAMS_SIZE, - wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc); + wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst, "mul_mat_params_dev_buf"); + ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->mul_mat_params_host_buf, WEBGPU_MUL_MAT_PARAMS_SIZE, + wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc, "mul_mat_params_host_buf"); +} + +static void ggml_webgpu_init_cpy_pipeline(webgpu_context webgpu_ctx) { + std::vector constants(1); + constants[0].key = "wg_size"; + constants[0].value = webgpu_ctx->limits.maxComputeWorkgroupSizeX; + + ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->cpy_pipeline, wgsl_cpy, "cpy", constants); + ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->cpy_params_dev_buf, WEBGPU_CPY_PARAMS_SIZE, + wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst, "cpy_params_dev_buf"); + ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->cpy_params_host_buf, WEBGPU_CPY_PARAMS_SIZE, + wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc, "cpy_params_host_buf"); } // TODO: Does this need to be thread safe? Is it only called once? @@ -564,36 +669,44 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co ggml_backend_webgpu_device_context * dev_ctx = static_cast(dev->context); webgpu_context webgpu_ctx = dev_ctx->webgpu_ctx; - // Initialize device - wgpu::DeviceDescriptor dev_desc; - dev_desc.requiredLimits = &webgpu_ctx->limits; - dev_desc.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, - [](const wgpu::Device& device, wgpu::DeviceLostReason reason, wgpu::StringView message) { - GGML_UNUSED(device); - GGML_LOG_ERROR("ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast(reason), message.data); - }); - dev_desc.SetUncapturedErrorCallback( - [](const wgpu::Device& device, wgpu::ErrorType reason, wgpu::StringView message) { - GGML_UNUSED(device); - GGML_LOG_ERROR("ggml_webgpu: Device error! 
Reason: %d, Message: %s\n", static_cast(reason), message.data); - }); - webgpu_ctx->instance.WaitAny(webgpu_ctx->adapter.RequestDevice(&dev_desc, wgpu::CallbackMode::WaitAnyOnly, - [webgpu_ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) { - if (status != wgpu::RequestDeviceStatus::Success) { - GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", message.data); - return; - } - webgpu_ctx->device = device; - }), - UINT64_MAX - ); - GGML_ASSERT(webgpu_ctx->device != nullptr); - - // Initialize (compute) queue - webgpu_ctx->queue = webgpu_ctx->device.GetQueue(); - - ggml_webgpu_init_memset_pipeline(webgpu_ctx); - ggml_webgpu_init_mul_mat_pipeline(webgpu_ctx); + std::lock_guard lock(webgpu_ctx->mutex); + + if (!webgpu_ctx->device_initialized) { + // Initialize device + wgpu::DeviceDescriptor dev_desc; + dev_desc.requiredLimits = &webgpu_ctx->limits; + dev_desc.requiredFeatures = webgpu_ctx->features.features; + dev_desc.requiredFeatureCount = webgpu_ctx->features.featureCount; + dev_desc.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, + [](const wgpu::Device& device, wgpu::DeviceLostReason reason, wgpu::StringView message) { + GGML_UNUSED(device); + GGML_LOG_ERROR("ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast(reason), message.data); + }); + dev_desc.SetUncapturedErrorCallback( + [](const wgpu::Device& device, wgpu::ErrorType reason, wgpu::StringView message) { + GGML_UNUSED(device); + GGML_LOG_ERROR("ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast(reason), message.data); + }); + webgpu_ctx->instance.WaitAny(webgpu_ctx->adapter.RequestDevice(&dev_desc, wgpu::CallbackMode::WaitAnyOnly, + [webgpu_ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) { + if (status != wgpu::RequestDeviceStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", message.data); + return; + } + webgpu_ctx->device = device; + }), + UINT64_MAX + ); + GGML_ASSERT(webgpu_ctx->device != nullptr); + + // Initialize (compute) queue + webgpu_ctx->queue = webgpu_ctx->device.GetQueue(); + + ggml_webgpu_init_memset_pipeline(webgpu_ctx); + ggml_webgpu_init_mul_mat_pipeline(webgpu_ctx); + ggml_webgpu_init_cpy_pipeline(webgpu_ctx); + webgpu_ctx->device_initialized = true; + } static ggml_backend_webgpu_context backend_ctx; backend_ctx.name = GGML_WEBGPU_NAME + std::string(": ") + dev_ctx->device_name; @@ -636,10 +749,13 @@ static bool ggml_backend_webgpu_device_supports_buft(ggml_backend_dev_t dev, ggm static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { GGML_UNUSED(dev); - // what should we support first? 
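+ // Advertise only ops that are true no-ops or have a kernel; CPY is limited to F32->F16 and MUL_MAT to F32 inputs for now.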
switch (op->op) { case GGML_OP_NONE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: return true; + case GGML_OP_CPY: + return op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32; case GGML_OP_MUL_MAT: return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; default: @@ -702,6 +818,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t GGML_ASSERT(ctx->adapter != nullptr); ctx->adapter.GetLimits(&ctx->limits); + ctx->adapter.GetFeatures(&ctx->features); wgpu::AdapterInfo info{}; ctx->adapter.GetInfo(&info); @@ -738,6 +855,7 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_reg()"); webgpu_context webgpu_ctx = std::make_shared<webgpu_context_struct>(); + webgpu_ctx->device_initialized = false; static ggml_backend_webgpu_reg_context ctx; ctx.webgpu_ctx = webgpu_ctx; diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl new file mode 100644 index 0000000000000..fb14916578f57 --- /dev/null +++ b/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl @@ -0,0 +1,32 @@ +enable f16; + +@group(0) @binding(0) +var<storage, read_write> src: array<f32>; + +@group(0) @binding(1) +var<storage, read_write> dst: array<f16>; + +struct Params { + ne: u32, // number of elements + src_offset: u32, // src offset in bytes + dst_offset: u32 // dst offset in bytes +}; + +@group(0) @binding(2) +var<uniform> params: Params; + +override wg_size: u32; +const elems_per_thread: u32 = 4; + +@compute @workgroup_size(wg_size) +fn main(@builtin(global_invocation_id) gid: vec3<u32>) { + let idx = gid.x * elems_per_thread; + // chunked loop + for (var j: u32 = 0u; j < elems_per_thread; j = j + 1u) { + let i = idx + j; + if (i < params.ne) { + // Convert f32 to f16 + dst[params.dst_offset/2 + i] = f16(src[params.src_offset/4 + i]); + } + } +} diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl index 60142c4fa0839..cb7c8c3e09e91 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl @@ -1,4 +1,3 @@ -// memset.wgsl @group(0) @binding(0) var<storage, read_write> output_buffer: array<u32>; @@ -12,15 +11,15 @@ struct Params { var<uniform> params: Params; override wg_size: u32; -override elems_per_thread: u32; +override bytes_per_thread: u32; @compute @workgroup_size(wg_size) fn main(@builtin(global_invocation_id) gid: vec3<u32>) { - let i = gid.x * 4u; + let i = gid.x * bytes_per_thread; let start = params.offset; let end = params.offset + params.size; - for (var j: u32 = 0u; j < elems_per_thread; j = j + 1u) { + for (var j: u32 = 0u; j < bytes_per_thread; j = j + 1u) { let byte_index = start + i + j; if (byte_index + 4u <= end) { output_buffer[(byte_index >> 2u)] = params.value; From 0f0543b9f95afaea8c16a78b6a4eed0a8df421d5 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 2 Jul 2025 17:01:22 -0700 Subject: [PATCH 19/25] Implement permuting for mul_mat and cpy --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 38 ++++++++++--- ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl | 54 ++++++++++++++----- .../src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl | 19 +++---- 3 files changed, 82 insertions(+), 29 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index d8aa0206bbab8..8602baf9c15d6 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -22,8 +22,8 @@ #define WEBGPU_MAX_BUFFERS 32 #define WEBGPU_MUL_MAT_WG_SIZE 64 -#define WEBGPU_MUL_MAT_PARAMS_SIZE (7 * sizeof(uint32_t)) // M, N, K, batch sizes, broadcasts +#define WEBGPU_MUL_MAT_PARAMS_SIZE (13 * sizeof(uint32_t)) // M, N, K, batch sizes, broadcasts -#define WEBGPU_CPY_PARAMS_SIZE (3 * sizeof(uint32_t)) // number of elements to copy, alignments +#define WEBGPU_CPY_PARAMS_SIZE (15 * sizeof(uint32_t)) // strides and offsets #define WEBGPU_STORAGE_BUF_BINDING_MULT 4 // a storage buffer binding size must be a multiple of 4 /* End Constants */ @@ -266,10 +266,26 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ ggml_backend_webgpu_map_buffer(ctx, ctx->cpy_params_host_buf, wgpu::MapMode::Write, 0, ctx->cpy_params_host_buf.GetSize()); uint32_t * params = (uint32_t *) ctx->cpy_params_host_buf.GetMappedRange(); - uint32_t ne = (uint32_t)ggml_nelements(node); // number of elements to copy + uint32_t ne = (uint32_t)ggml_nelements(node); params[0] = ne; params[1] = src_misalignment; params[2] = dst_misalignment; + + // Convert byte-strides to element-strides + params[3] = (uint32_t)src->nb[0]/ggml_type_size(src->type); + params[4] = (uint32_t)src->nb[1]/ggml_type_size(src->type); + params[5] = (uint32_t)src->nb[2]/ggml_type_size(src->type); + params[6] = (uint32_t)src->nb[3]/ggml_type_size(src->type); + params[7] = (uint32_t)node->nb[0]/ggml_type_size(node->type); + params[8] = (uint32_t)node->nb[1]/ggml_type_size(node->type); + params[9] = (uint32_t)node->nb[2]/ggml_type_size(node->type); + params[10] = (uint32_t)node->nb[3]/ggml_type_size(node->type); + // Logical shape — same for both tensors even if permuted + params[11] = (uint32_t)(src->ne[0]); + params[12] = (uint32_t)(src->ne[1]); + params[13] = (uint32_t)(src->ne[2]); + params[14] = (uint32_t)(src->ne[3]); + ctx->cpy_params_host_buf.Unmap(); wgpu::BindGroupEntry entries[3]; @@ -338,10 +354,18 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ params[0] = (uint32_t)node->ne[1]; // number of rows in result (M) params[1] = (uint32_t)node->ne[0]; // number of columns in result (N) params[2] = (uint32_t)src0->ne[0]; // number of columns in src0/src1 (K) - params[3] = (uint32_t)src0->ne[2]; // batch size in dimension 2 - params[4] = (uint32_t)src0->ne[3]; // batch size in dimension 3 - params[5] = (uint32_t)(src1->ne[2]/src0->ne[2]); // broadcast in dimension 2 - params[6] = (uint32_t)(src1->ne[3]/src0->ne[3]); // broadcast in dimension 3 + + params[3] = (uint32_t)src0->nb[1]/ggml_type_size(src0->type); // stride (elements) of src0 in dimension 1 + params[4] = (uint32_t)src1->nb[1]/ggml_type_size(src1->type); // stride (elements) of src1 in dimension 1 + params[5] = (uint32_t)src0->nb[2]/ggml_type_size(src0->type); // stride (elements) of src0 in dimension 2 + params[6] = (uint32_t)src1->nb[2]/ggml_type_size(src1->type); // stride (elements) of src1 in dimension 2 + params[7] = (uint32_t)src0->nb[3]/ggml_type_size(src0->type); // stride (elements) of src0 in dimension 3 + params[8] = (uint32_t)src1->nb[3]/ggml_type_size(src1->type); // stride (elements) of src1 in dimension 3 + + params[9] = (uint32_t)src0->ne[2]; // batch size in dimension 2 + params[10] = (uint32_t)src0->ne[3]; // batch size in dimension 3 + params[11] = (uint32_t)(src1->ne[2]/src0->ne[2]); // broadcast in dimension 2 + params[12] = (uint32_t)(src1->ne[3]/src0->ne[3]); // broadcast in dimension 3 ctx->mul_mat_params_host_buf.Unmap(); diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl index fb14916578f57..6b18d68094831 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl @@ -7,26 +7,54 @@ var<storage, read_write> src: array<f32>; var<storage, read_write> dst: array<f16>; struct Params { - ne: u32, // number of elements - src_offset: u32, // src offset in bytes - dst_offset: u32 // dst offset in bytes + ne: u32, // total number of elements + src_offset: u32, // in bytes + dst_offset: u32, // in bytes + + // Strides (in elements) — may be permuted + stride_src0: u32, + stride_src1: u32, + stride_src2: u32, + stride_src3: u32, + + stride_dst0: u32, + stride_dst1: u32, + stride_dst2: u32, + stride_dst3: u32, + + // Logical shape (same for both tensors) + ne0: u32, + ne1: u32, + ne2: u32, + ne3: u32, }; @group(0) @binding(2) var<uniform> params: Params; override wg_size: u32; -const elems_per_thread: u32 = 4; - @compute @workgroup_size(wg_size) fn main(@builtin(global_invocation_id) gid: vec3<u32>) { - let idx = gid.x * elems_per_thread; - // chunked loop - for (var j: u32 = 0u; j < elems_per_thread; j = j + 1u) { - let i = idx + j; - if (i < params.ne) { - // Convert f32 to f16 - dst[params.dst_offset/2 + i] = f16(src[params.src_offset/4 + i]); - } + if (gid.x >= params.ne) { + return; } + + var i = gid.x; + + let i3 = i / (params.ne2 * params.ne1 * params.ne0); + i = i % (params.ne2 * params.ne1 * params.ne0); + + let i2 = i / (params.ne1 * params.ne0); + i = i % (params.ne1 * params.ne0); + + let i1 = i / params.ne0; + let i0 = i % params.ne0; + + let src_idx = i0 * params.stride_src0 + i1 * params.stride_src1 + + i2 * params.stride_src2 + i3 * params.stride_src3; + + let dst_idx = i0 * params.stride_dst0 + i1 * params.stride_dst1 + + i2 * params.stride_dst2 + i3 * params.stride_dst3; + + dst[params.dst_offset / 2 + dst_idx] = f16(src[params.src_offset / 4 + src_idx]); } diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl index 31973278ee18f..3b8b31474ca90 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl @@ -2,6 +2,13 @@ struct MulMatParams { m: u32, n: u32, k: u32, + stride_01: u32, + stride_11: u32, + stride_02: u32, + stride_12: u32, + stride_03: u32, + stride_13: u32, + bs02: u32, bs03: u32, broadcast2: u32, @@ -21,12 +28,6 @@ fn main(@builtin(global_invocation_id) global_id: vec3<u32>) { return; } - let src02_stride = params.n * params.k; - let src03_stride = src02_stride * params.bs02; - - let src12_stride = params.m * params.k; - let src13_stride = src12_stride * params.bs02 * params.broadcast2; - let dst2_stride = params.m * params.n; let dst3_stride = dst2_stride * params.bs02 * params.broadcast2; @@ -37,7 +38,7 @@ fn main(@builtin(global_invocation_id) global_id: vec3<u32>) { let dst2_idx = dst3_rem / dst2_stride; let src02_idx = dst2_idx / params.broadcast2; // src0 may also be broadcast along the second dimension - let src12_idx = dst2_idx; + let src12_idx = dst2_idx; // src1 is not broadcast let dst2_rem = dst3_rem % dst2_stride; @@ -46,8 +47,8 @@ fn main(@builtin(global_invocation_id) global_id: vec3<u32>) { var sum = 0.0; for (var i: u32 = 0u; i < params.k; i = i + 1u) { - let src0_idx = src03_idx * src03_stride + src02_idx * src02_stride + col * params.k + i; - let src1_idx = src13_idx * src13_stride + src12_idx * src12_stride + row * params.k + i; + let src0_idx = src03_idx * params.stride_03 + src02_idx * params.stride_02 + col * params.stride_01 + i; + let src1_idx = src13_idx * params.stride_13 + src12_idx * params.stride_12 + row * params.stride_11 + i; sum = sum + src0[src0_idx] * src1[src1_idx]; } dst[dst3_idx * dst3_stride + dst2_idx * dst2_stride + row * params.n + col] = sum; From 2eb7626dbbe4dff7c5c59f95ebbe7b9b918191bf Mon Sep 17 00:00:00
2001 From: Reese Levine Date: Thu, 3 Jul 2025 11:48:22 -0700 Subject: [PATCH 20/25] minor cleanups --- docs/build.md | 2 +- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 13 ++++--------- ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl | 6 +++--- 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/docs/build.md b/docs/build.md index 5f735de9ed966..1b6fa8733c93c 100644 --- a/docs/build.md +++ b/docs/build.md @@ -572,7 +572,7 @@ cmake --build build --config Release WebGPU allows cross-platform access to the GPU from supported browsers. We utilize [Emscripten](https://emscripten.org/) to compile ggml's WebGPU backend to WebAssembly. Emscripten does not officially support WebGPU bindings yet, but Dawn currently maintains its own WebGPU bindings called emdawnwebgpu. -Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/src/emdawnwebgpu/) to download or build the emdawnwebgpu package (Note that it might be safer to build them locally, so that they stay in sync with the version of Dawn you have installed above). When building using CMake, the path to the emdawnwebgpu port file needs to be set with the flag `EMDAWNWEBGPU_DIR`. +Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/src/emdawnwebgpu/) to download or build the emdawnwebgpu package (Note that it might be safer to build the emdawnwebgpu package locally, so that it stays in sync with the version of Dawn you have installed above). When building using CMake, the path to the emdawnwebgpu port file needs to be set with the flag `EMDAWNWEBGPU_DIR`. ## Notes about GPU-accelerated backends diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 8602baf9c15d6..c072f279dcb63 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -18,9 +18,6 @@ /* Constants */ -// TODO: find a better way to get the memory available -#define WEBGPU_MAX_BUFFERS 32 - #define WEBGPU_MUL_MAT_WG_SIZE 64 #define WEBGPU_MUL_MAT_PARAMS_SIZE (13 * sizeof(uint32_t)) // M, N, K, batch sizes, broadcasts #define WEBGPU_CPY_PARAMS_SIZE (15 * sizeof(uint32_t)) // strides and offsets @@ -119,7 +116,7 @@ static void ggml_webgpu_create_pipeline(wgpu::Device &device, wgpu::ComputePipel pipeline_desc.label = label; pipeline_desc.compute.module = shader_module; pipeline_desc.compute.entryPoint = "main"; // Entry point in the WGSL code - pipeline_desc.layout = nullptr; // Guessing that nullptr means auto layout + pipeline_desc.layout = nullptr; // nullptr means auto layout if (constants.size() > 0) { pipeline_desc.compute.constants = constants.data(); pipeline_desc.compute.constantCount = constants.size(); @@ -199,7 +196,6 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer b pass.End(); wgpu::CommandBuffer commands = encoder.Finish(); - // TODO, async, do we need to wait on this? ctx->queue.Submit(1, &commands); } @@ -489,7 +485,6 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer, size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset; - // TODO: wait on this?
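+ // Note: WriteBuffer is ordered with respect to later submissions on the same queue, so no explicit wait is needed here.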
webgpu_ctx->queue.WriteBuffer(buf_ctx->buffer, total_offset, data, (size/4)*4); if (size % 4 != 0) { @@ -617,9 +612,9 @@ static const char * ggml_backend_webgpu_device_get_description(ggml_backend_dev_ static void ggml_backend_webgpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { ggml_backend_webgpu_device_context * ctx = static_cast<ggml_backend_webgpu_device_context *>(dev->context); - // TODO: what do we actually want to return here? - *free = ctx->webgpu_ctx->limits.maxBufferSize * WEBGPU_MAX_BUFFERS; - *total = ctx->webgpu_ctx->limits.maxBufferSize * WEBGPU_MAX_BUFFERS; + // TODO: what do we actually want to return here? maxBufferSize might not be the full available memory. + *free = ctx->webgpu_ctx->limits.maxBufferSize; + *total = ctx->webgpu_ctx->limits.maxBufferSize; } static enum ggml_backend_dev_type ggml_backend_webgpu_device_get_type(ggml_backend_dev_t dev) { diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl index 6b18d68094831..8c05312425c0b 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl @@ -8,8 +8,8 @@ var<storage, read_write> dst: array<f16>; struct Params { ne: u32, // total number of elements - src_offset: u32, // in bytes - dst_offset: u32, // in bytes + offset_src: u32, // in bytes + offset_dst: u32, // in bytes // Strides (in elements) — may be permuted stride_src0: u32, @@ -56,5 +56,5 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) { let dst_idx = i0 * params.stride_dst0 + i1 * params.stride_dst1 + i2 * params.stride_dst2 + i3 * params.stride_dst3; - dst[params.dst_offset / 2 + dst_idx] = f16(src[params.src_offset / 4 + src_idx]); + dst[params.offset_dst / 2 + dst_idx] = f16(src[params.offset_src / 4 + src_idx]); } From cbf4b96924bc2f24ef37560b1561b69f91c855d7 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Sun, 6 Jul 2025 08:40:28 -0700 Subject: [PATCH 21/25] Address feedback --- docs/build.md | 2 +- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 13 +++++++------ ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl | 4 ++-- ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl | 3 ++- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/docs/build.md b/docs/build.md index d15cb19b61d37..eae142dba003a 100644 --- a/docs/build.md +++ b/docs/build.md @@ -559,7 +559,7 @@ To read documentation for how to build on Android, [click here](./android.md) ## WebGPU [In Progress] -The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. +The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The current implementation is up-to-date with Dawn commit `bed1a61`.
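+If CMake cannot locate the Dawn package automatically, passing its install prefix explicitly is one option, e.g. `-DCMAKE_PREFIX_PATH=/path/to/dawn/install` (illustrative path).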
In the llama.cpp directory, build with CMake: diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index c072f279dcb63..ac792521054b3 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -264,8 +264,8 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ uint32_t * params = (uint32_t *) ctx->cpy_params_host_buf.GetMappedRange(); uint32_t ne = (uint32_t)ggml_nelements(node); params[0] = ne; - params[1] = src_misalignment; - params[2] = dst_misalignment; + params[1] = src_misalignment/ggml_type_size(src->type); + params[2] = dst_misalignment/ggml_type_size(node->type); // Convert byte-strides to element-strides params[3] = (uint32_t)src->nb[0]/ggml_type_size(src->type); @@ -881,10 +881,11 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() { ctx.name = GGML_WEBGPU_NAME; ctx.device_count = 1; - - wgpu::InstanceDescriptor instanceDescriptor{}; - instanceDescriptor.capabilities.timedWaitAnyEnable = true; - webgpu_ctx->instance = wgpu::CreateInstance(&instanceDescriptor); + wgpu::InstanceDescriptor instance_descriptor{}; + std::vector instance_features = {wgpu::InstanceFeatureName::TimedWaitAny}; + instance_descriptor.requiredFeatures = instance_features.data(); + instance_descriptor.requiredFeatureCount = instance_features.size(); + webgpu_ctx->instance = wgpu::CreateInstance(&instance_descriptor); GGML_ASSERT(webgpu_ctx->instance != nullptr); static ggml_backend_reg reg = { diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl index 8c05312425c0b..5c2e2cfdd6d17 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl @@ -8,8 +8,8 @@ var dst: array; struct Params { ne: u32, // total number of elements - offset_src: u32, // in bytes - offset_dst: u32, // in bytes + offset_src: u32, // in elements + offset_dst: u32, // in elements // Strides (in elements) — may be permuted stride_src0: u32, diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl index 3b8b31474ca90..7a7a42f23d9ae 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl @@ -2,6 +2,7 @@ struct MulMatParams { m: u32, n: u32, k: u32, + // all strides are in elements stride_01: u32, stride_11: u32, stride_02: u32, @@ -16,7 +17,7 @@ struct MulMatParams { }; @group(0) @binding(0) var src0: array; // N rows, K columns -@group(0) @binding(1) var src1: array; // M rows, K columns +@group(0) @binding(1) var src1: array; // M rows, K columns (transposed) @group(0) @binding(2) var dst: array; // M rows, N columns @group(0) @binding(3) var params: MulMatParams; From e0d8a713bdb008cf02301e3e3b89737570380b4a Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Sun, 6 Jul 2025 11:04:10 -0700 Subject: [PATCH 22/25] Remove division by type size in cpy op --- ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl index 5c2e2cfdd6d17..6fe924c554cc3 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl @@ -56,5 +56,5 @@ fn main(@builtin(global_invocation_id) gid: vec3) { let dst_idx = i0 * params.stride_dst0 + i1 * params.stride_dst1 + i2 * params.stride_dst2 + i3 * params.stride_dst3; - dst[params.offset_dst / 2 + dst_idx] = f16(src[params.offset_src / 4 + 
src_idx]); + dst[params.offset_dst + dst_idx] = f16(src[params.offset_src + src_idx]); } From 40dd1f06b32bf6bb43ccc2f5faa007fc0391660e Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 15 Jul 2025 16:57:46 -0700 Subject: [PATCH 23/25] Fix formatting and add github action workflows for vulkan and metal (m-series) webgpu backends --- .github/workflows/build.yml | 130 ++++++++++++++++++ docs/build.md | 4 +- ggml/src/ggml-webgpu/CMakeLists.txt | 2 +- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 46 ++++--- .../ggml-webgpu/wgsl-shaders/embed_wgsl.py | 4 + .../src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl | 4 +- 6 files changed, 163 insertions(+), 27 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 42d63b7c5444c..506a1b12f4d28 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -135,6 +135,70 @@ jobs: cd build ctest -L main --verbose --timeout 900 + macOS-latest-cmake-arm64-webgpu: + runs-on: macos-14 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2.16 + with: + key: macOS-latest-cmake-arm64-webgpu + evict-old-files: 1d + + - name: Dependencies + id: depends + continue-on-error: true + run: | + brew update + brew install curl + + - name: Dawn Dependency + id: dawn-depends + run: | + ARTIFACTS_JSON=$(curl -s -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "https://api.github.com/repos/google/dawn/actions/artifacts") + echo "Finding latest macos-latest-Release artifact..." + DOWNLOAD_URL=$(echo "$ARTIFACTS_JSON" | jq -r '.artifacts + | sort_by(.created_at) + | reverse + | map(select(.name | test("macos-latest-Release$"))) + | .[0].archive_download_url') + if [ "$DOWNLOAD_URL" = "null" ] || [ -z "$DOWNLOAD_URL" ]; then + echo "No suitable Dawn artifact found!" + exit 1 + fi + echo "Downloading from: $DOWNLOAD_URL" + curl -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ + -o artifact.zip "$DOWNLOAD_URL" + unzip artifact.zip + mkdir dawn + tar_file=$(find . 
-name '*.tar.gz' | head -n 1) + echo "Extracting: $tar_file" + tar -xvf "$tar_file" -C dawn --strip-components=1 + + - name: Build + id: cmake_build + run: | + sysctl -a + export Dawn_DIR=dawn/lib64/cmake/Dawn + cmake -B build -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF + cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) + + - name: Test + id: cmake_test + run: | + cd build + ctest -L main --verbose --timeout 900 + ubuntu-cpu-cmake: strategy: matrix: @@ -344,6 +408,72 @@ jobs: # This is using llvmpipe and runs slower than other backends ctest -L main --verbose --timeout 3600 + ubuntu-22-cmake-webgpu: + runs-on: ubuntu-22.04 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2.16 + with: + key: ubuntu-22-cmake-webgpu + evict-old-files: 1d + + - name: Vulkan SDK Dependencies + id: vulkan-depends + run: | + wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add - + sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list + sudo apt-get update -y + sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev + + - name: Dawn Dependency + id: dawn-depends + run: | + sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev + ARTIFACTS_JSON=$(curl -s -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "https://api.github.com/repos/google/dawn/actions/artifacts") + echo "Finding latest ubuntu-latest-Release artifact..." + DOWNLOAD_URL=$(echo "$ARTIFACTS_JSON" | jq -r '.artifacts + | sort_by(.created_at) + | reverse + | map(select(.name | test("ubuntu-latest-Release$"))) + | .[0].archive_download_url') + if [ "$DOWNLOAD_URL" = "null" ] || [ -z "$DOWNLOAD_URL" ]; then + echo "No suitable Dawn artifact found!" + exit 1 + fi + echo "Downloading from: $DOWNLOAD_URL" + curl -L \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ + -o artifact.zip "$DOWNLOAD_URL" + unzip artifact.zip + mkdir dawn + tar_file=$(find . -name '*.tar.gz' | head -n 1) + echo "Extracting: $tar_file" + tar -xvf "$tar_file" -C dawn --strip-components=1 + + - name: Build + id: cmake_build + run: | + export Dawn_DIR=dawn/lib64/cmake/Dawn + cmake -B build -DGGML_WEBGPU=ON + cmake --build build --config Release -j $(nproc) + + - name: Test + id: cmake_test + run: | + cd build + # This is using llvmpipe and runs slower than other backends + ctest -L main --verbose --timeout 3600 + ubuntu-22-cmake-hip: runs-on: ubuntu-22.04 container: rocm/dev-ubuntu-22.04:6.0.2 diff --git a/docs/build.md b/docs/build.md index eae142dba003a..70767ad91c056 100644 --- a/docs/build.md +++ b/docs/build.md @@ -568,9 +568,9 @@ cmake -B build -DGGML_WEBGPU=ON cmake --build build --config Release ``` -### Browser Support +### Browser Support -WebGPU allows cross-platform access to the GPU from supported browsers. We utilize [Emscripten](https://emscripten.org/) to compile ggml's WebGPU backend to WebAssembly. Emscripten does not officially support WebGPU bindings yet, but Dawn currently maintains its own WebGPU bindings called emdawnwebgpu. +WebGPU allows cross-platform access to the GPU from supported browsers. We utilize [Emscripten](https://emscripten.org/) to compile ggml's WebGPU backend to WebAssembly. 
Emscripten does not officially support WebGPU bindings yet, but Dawn currently maintains its own WebGPU bindings called emdawnwebgpu. Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/src/emdawnwebgpu/) to download or build the emdawnwebgpu package (Note that it might be safer to build the emdawnwebgpu package locally, so that it stays in sync with the version of Dawn you have installed above). When building using CMake, the path to the emdawnwebgpu port file needs to be set with the flag `EMDAWNWEBGPU_DIR`. diff --git a/ggml/src/ggml-webgpu/CMakeLists.txt b/ggml/src/ggml-webgpu/CMakeLists.txt index 0d67f11ba5520..79ef68b85a477 100644 --- a/ggml/src/ggml-webgpu/CMakeLists.txt +++ b/ggml/src/ggml-webgpu/CMakeLists.txt @@ -51,4 +51,4 @@ if (GGML_WEBGPU_DEBUG) endif() target_include_directories(ggml-webgpu PRIVATE ${SHADER_OUTPUT_DIR}) -target_link_libraries(ggml-webgpu PRIVATE ${DawnWebGPU_TARGET}) \ No newline at end of file +target_link_libraries(ggml-webgpu PRIVATE ${DawnWebGPU_TARGET}) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index ac792521054b3..5366fd6a2ec34 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -7,7 +7,9 @@ #include "ggml-wgsl-shaders.hpp" +#include #include +#include #include #ifdef GGML_WEBGPU_DEBUG @@ -131,7 +133,7 @@ static void ggml_webgpu_create_buffer(wgpu::Device &device, wgpu::Buffer &buffer buffer_desc.size = size; buffer_desc.usage = usage; buffer_desc.label = label; - buffer_desc.mappedAtCreation = false; + buffer_desc.mappedAtCreation = false; // TODO: error handling buffer = device.CreateBuffer(&buffer_desc); } @@ -161,7 +163,7 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer b uint32_t * params = (uint32_t *) ctx->memset_params_host_buf.GetMappedRange(); params[0] = (uint32_t)offset; - params[1] = (uint32_t)size; + params[1] = (uint32_t)size; params[2] = value; ctx->memset_params_host_buf.Unmap(); @@ -184,8 +186,8 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer b wgpu::CommandEncoder encoder = device.CreateCommandEncoder(); encoder.CopyBufferToBuffer( - ctx->memset_params_host_buf, 0, - ctx->memset_params_dev_buf, 0, + ctx->memset_params_host_buf, 0, + ctx->memset_params_dev_buf, 0, ctx->memset_params_dev_buf.GetSize() ); wgpu::ComputePassEncoder pass = encoder.BeginComputePass(); @@ -206,7 +208,7 @@ static void ggml_backend_webgpu_wait_on_submission(webgpu_context ctx) { if (status != wgpu::QueueWorkDoneStatus::Success) { GGML_LOG_ERROR("ggml_webgpu: Failed to wait on queue: %s\n", message.data); } - }), + }), UINT64_MAX ); } @@ -243,7 +245,7 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ case GGML_OP_VIEW: case GGML_OP_PERMUTE: return false; - + case GGML_OP_CPY: { std::lock_guard lock(ctx->mutex); const ggml_tensor * src = node->src[0]; @@ -259,7 +261,7 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ dst_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1); wgpu::Device device = ctx->device; - ggml_backend_webgpu_map_buffer(ctx, ctx->cpy_params_host_buf, + ggml_backend_webgpu_map_buffer(ctx, ctx->cpy_params_host_buf, wgpu::MapMode::Write, 0, ctx->cpy_params_host_buf.GetSize()); uint32_t * params = (uint32_t *) ctx->cpy_params_host_buf.GetMappedRange(); uint32_t ne = (uint32_t)ggml_nelements(node); @@ -309,8 +311,8 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){
wgpu::CommandEncoder encoder = device.CreateCommandEncoder(); encoder.CopyBufferToBuffer( - ctx->cpy_params_host_buf, 0, - ctx->cpy_params_dev_buf, 0, + ctx->cpy_params_host_buf, 0, + ctx->cpy_params_dev_buf, 0, ctx->cpy_params_dev_buf.GetSize() ); wgpu::ComputePassEncoder pass = encoder.BeginComputePass(); @@ -343,7 +345,7 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ wgpu::Device device = ctx->device; // map the host parameters buffer - ggml_backend_webgpu_map_buffer(ctx, ctx->mul_mat_params_host_buf, + ggml_backend_webgpu_map_buffer(ctx, ctx->mul_mat_params_host_buf, wgpu::MapMode::Write, 0, ctx->mul_mat_params_host_buf.GetSize()); uint32_t * params = (uint32_t *) ctx->mul_mat_params_host_buf.GetMappedRange(); @@ -371,7 +373,7 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ entries[0].offset = src0_offset; entries[0].size = ggml_nbytes(src0); - entries[1].binding = 1; + entries[1].binding = 1; entries[1].buffer = src1_ctx->buffer; entries[1].offset = src1_offset; entries[1].size = ggml_nbytes(src1); @@ -395,8 +397,8 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ wgpu::CommandEncoder encoder = device.CreateCommandEncoder(); encoder.CopyBufferToBuffer( - ctx->mul_mat_params_host_buf, 0, - ctx->mul_mat_params_dev_buf, 0, + ctx->mul_mat_params_host_buf, 0, + ctx->mul_mat_params_dev_buf, 0, ctx->mul_mat_params_dev_buf.GetSize() ); wgpu::ComputePassEncoder pass = encoder.BeginComputePass(); @@ -417,7 +419,7 @@ static bool ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node){ return false; } } - + static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { WEBGPU_LOG_DEBUG("ggml_backend_webgpu_graph_compute(" << cgraph->n_nodes << " nodes)"); @@ -517,13 +519,13 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer, std::lock_guard lock(webgpu_ctx->mutex); - if (webgpu_ctx->get_tensor_staging_buf == nullptr || + if (webgpu_ctx->get_tensor_staging_buf == nullptr || webgpu_ctx->get_tensor_staging_buf.GetSize() < final_size) { // Create a new staging buffer if it doesn't exist or is too small if (webgpu_ctx->get_tensor_staging_buf) { webgpu_ctx->get_tensor_staging_buf.Destroy(); } - ggml_webgpu_create_buffer(device, webgpu_ctx->get_tensor_staging_buf, final_size, + ggml_webgpu_create_buffer(device, webgpu_ctx->get_tensor_staging_buf, final_size, wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "get_tensor_staging_buf"); } @@ -577,7 +579,7 @@ static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_b ggml_backend_webgpu_device_context * ctx = static_cast(buft->device->context); wgpu::Buffer buf; - ggml_webgpu_create_buffer(ctx->webgpu_ctx->device, buf, size, + ggml_webgpu_create_buffer(ctx->webgpu_ctx->device, buf, size, wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst, "allocated_buffer"); ggml_backend_webgpu_buffer_context * buf_ctx = new ggml_backend_webgpu_buffer_context(ctx->webgpu_ctx, buf); @@ -652,7 +654,7 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context webgpu_ctx) { constants[1].key = "bytes_per_thread"; constants[1].value = webgpu_ctx->memset_bytes_per_thread; ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->memset_pipeline, wgsl_memset, "memset", constants); - ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->memset_params_dev_buf, + ggml_webgpu_create_buffer(webgpu_ctx->device, 
webgpu_ctx->memset_params_dev_buf, 3 * sizeof(uint32_t), // 3 parameters: buffer size, offset, value wgpu::BufferUsage::Uniform | wgpu::BufferUsage::CopyDst, "memset_params_dev_buf"); ggml_webgpu_create_buffer(webgpu_ctx->device, webgpu_ctx->memset_params_host_buf, @@ -679,7 +681,7 @@ static void ggml_webgpu_init_cpy_pipeline(webgpu_context webgpu_ctx) { wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc, "cpy_params_host_buf"); } -// TODO: Does this need to be thread safe? Is it only called once? +// TODO: Make thread safe if multiple devices are used static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) { GGML_UNUSED(params); @@ -696,7 +698,7 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co dev_desc.requiredLimits = &webgpu_ctx->limits; dev_desc.requiredFeatures = webgpu_ctx->features.features; dev_desc.requiredFeatureCount = webgpu_ctx->features.featureCount; - dev_desc.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, + dev_desc.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, [](const wgpu::Device& device, wgpu::DeviceLostReason reason, wgpu::StringView message) { GGML_UNUSED(device); GGML_LOG_ERROR("ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast(reason), message.data); @@ -847,7 +849,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t device_ctx.device_name = std::string(info.device.data); device_ctx.device_desc = std::string(info.description.data); - GGML_LOG_INFO("ggml_webgpu: adapter_info: vendor_id: %u | vendor: %s | architecture: %s | device_id: %u | name: %s | device_desc: %s\n", + GGML_LOG_INFO("ggml_webgpu: adapter_info: vendor_id: %u | vendor: %s | architecture: %s | device_id: %u | name: %s | device_desc: %s\n", info.vendorID, info.vendor.data, info.architecture.data, info.deviceID, info.device.data, info.description.data); // See GGML Backend Device Interface section @@ -902,4 +904,4 @@ ggml_backend_t ggml_backend_webgpu_init(void) { return ggml_backend_webgpu_device_init(dev, nullptr); } -GGML_BACKEND_DL_IMPL(ggml_backend_webgpu_reg) \ No newline at end of file +GGML_BACKEND_DL_IMPL(ggml_backend_webgpu_reg) diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py index daec8fe87dfda..962dcd6b170ed 100755 --- a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +++ b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py @@ -1,13 +1,16 @@ import os import argparse + def escape_triple_quotes(wgsl): # Simple defense in case of embedded """ return wgsl.replace('"""', '\\"""') + def to_cpp_string_literal(varname, content): return f'const char* wgsl_{varname} = R"({content})";\n' + def main(): parser = argparse.ArgumentParser() parser.add_argument('--input', required=True) @@ -27,5 +30,6 @@ def main(): out.write(to_cpp_string_literal(varname, content)) out.write('\n') + if __name__ == '__main__': main() diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl index 7a7a42f23d9ae..054aab566f96b 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl @@ -41,7 +41,7 @@ fn main(@builtin(global_invocation_id) global_id: vec3) { let src02_idx = dst2_idx / params.broadcast2; // src0 may also be broadcast along the second dimension let src12_idx = dst2_idx; // src1 is not broadcast - let dst2_rem = dst3_rem % dst2_stride; + let dst2_rem = dst3_rem % dst2_stride; let row = 
dst2_rem / params.n; // output row let col = dst2_rem % params.n; // output column @@ -53,4 +53,4 @@ fn main(@builtin(global_invocation_id) global_id: vec3) { sum = sum + src0[src0_idx] * src1[src1_idx]; } dst[dst3_idx * dst3_stride + dst2_idx * dst2_stride + row * params.n + col] = sum; -} \ No newline at end of file +} From 8b31513f4b9ba3010c567e844da1d52657507309 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Tue, 15 Jul 2025 16:59:10 -0700 Subject: [PATCH 24/25] Fix name --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 5366fd6a2ec34..c5abc69343357 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -846,7 +846,7 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t static ggml_backend_webgpu_device_context device_ctx; device_ctx.webgpu_ctx = ctx; - device_ctx.device_name = std::string(info.device.data); + device_ctx.device_name = GGML_WEBGPU_NAME; device_ctx.device_desc = std::string(info.description.data); GGML_LOG_INFO("ggml_webgpu: adapter_info: vendor_id: %u | vendor: %s | architecture: %s | device_id: %u | name: %s | device_desc: %s\n", From 22f8dd4ea5b49373a9332173c53e1a8e65f3e9a4 Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Wed, 16 Jul 2025 07:44:05 -0700 Subject: [PATCH 25/25] Fix macos dawn prefix path --- .github/workflows/build.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 506a1b12f4d28..7de6eebe50d39 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -188,8 +188,7 @@ jobs: - name: Build id: cmake_build run: | - sysctl -a - export Dawn_DIR=dawn/lib64/cmake/Dawn + export CMAKE_PREFIX_PATH=dawn cmake -B build -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
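With patch 24's rename, the device registered by this backend reports `GGML_WEBGPU_NAME` ("WebGPU") as its name rather than the raw adapter string. A quick way to confirm the backend shows up after building with `-DGGML_WEBGPU=ON` is to walk ggml's backend registry; the sketch below assumes the standard public API in `ggml-backend.h` (`ggml_backend_dev_count`, `ggml_backend_dev_get`, `ggml_backend_dev_name`, `ggml_backend_dev_description`).

```cpp
#include <cstddef>
#include <cstdio>
#include "ggml-backend.h"

int main() {
    // Walk the backend registry; one device should now report the name
    // "WebGPU", with the adapter's description string next to it.
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        std::printf("device %zu: %s (%s)\n", i,
                    ggml_backend_dev_name(dev),
                    ggml_backend_dev_description(dev));
    }
    return 0;
}
```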