minor cleanups

reeselevine · reeselevine · commit 2eb7626dbbe4 · 2025-07-03T11:48:22.000-07:00
diff --git a/docs/build.md b/docs/build.md
@@ -572,7 +572,7 @@ cmake --build build --config Release
 
 WebGPU allows cross-platform access to the GPU from supported browsers. We utilize [Emscripten](https://emscripten.org/) to compile ggml's WebGPU backend to WebAssembly. Emscripten does not officially support WebGPU bindings yet, but Dawn currently maintains its own WebGPU bindings called emdawnwebgpu. 
 
-Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/src/emdawnwebgpu/) to download or build the emdawnwebgpu package (Note that it might be safer to build them locally, so that they stay in sync with the version of Dawn you have installed above). When building using CMake, the path to the emdawnwebgpu port file needs to be set with the flag `EMDAWNWEBGPU_DIR`.
+Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/src/emdawnwebgpu/) to download or build the emdawnwebgpu package (Note that it might be safer to build the emdawnwebgpu package locally, so that it stays in sync with the version of Dawn you have installed above). When building using CMake, the path to the emdawnwebgpu port file needs to be set with the flag `EMDAWNWEBGPU_DIR`.
 
 ## Notes about GPU-accelerated backends
 
diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -18,9 +18,6 @@
 
 /* Constants */
 
-// TODO: find a better way to get the memory available
-#define WEBGPU_MAX_BUFFERS 32
-
 #define WEBGPU_MUL_MAT_WG_SIZE 64
 #define WEBGPU_MUL_MAT_PARAMS_SIZE (13 * sizeof(uint32_t)) // M, N, K, batch sizes, broadcasts
 #define WEBGPU_CPY_PARAMS_SIZE (15 * sizeof(uint32_t)) // strides and offsets
@@ -119,7 +116,7 @@ static void ggml_webgpu_create_pipeline(wgpu::Device &device, wgpu::ComputePipel
     pipeline_desc.label = label;
     pipeline_desc.compute.module = shader_module;
     pipeline_desc.compute.entryPoint = "main"; // Entry point in the WGSL code
-    pipeline_desc.layout = nullptr; // Guessing that nullptr means auto layout
+    pipeline_desc.layout = nullptr; // nullptr means auto layout
     if (constants.size() > 0) {
         pipeline_desc.compute.constants = constants.data();
         pipeline_desc.compute.constantCount = constants.size();
@@ -199,7 +196,6 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context ctx, wgpu::Buffer b
     pass.End();
     wgpu::CommandBuffer commands = encoder.Finish();
 
-    // TODO, async, do we need to wait on this?
     ctx->queue.Submit(1, &commands);
 }
 
@@ -489,7 +485,6 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer,
 
     size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset;
 
-    // TODO: wait on this?
     webgpu_ctx->queue.WriteBuffer(buf_ctx->buffer, total_offset, data, (size/4)*4);
 
     if (size % 4 != 0) {
@@ -617,9 +612,9 @@ static const char * ggml_backend_webgpu_device_get_description(ggml_backend_dev_
 
 static void ggml_backend_webgpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
     ggml_backend_webgpu_device_context * ctx = static_cast<ggml_backend_webgpu_device_context *>(dev->context);
-    // TODO: what do we actually want to return here?
-    *free = ctx->webgpu_ctx->limits.maxBufferSize * WEBGPU_MAX_BUFFERS;
-    *total = ctx->webgpu_ctx->limits.maxBufferSize * WEBGPU_MAX_BUFFERS;
+    // TODO: what do we actually want to return here? maxBufferSize might not be the full available memory.
+    *free = ctx->webgpu_ctx->limits.maxBufferSize;
+    *total = ctx->webgpu_ctx->limits.maxBufferSize;
 }
 
 static enum ggml_backend_dev_type ggml_backend_webgpu_device_get_type(ggml_backend_dev_t dev) {
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl
@@ -8,8 +8,8 @@ var<storage, read_write> dst: array<f16>;
 
 struct Params {
     ne: u32,            // total number of elements
-    src_offset: u32,    // in bytes
-    dst_offset: u32,    // in bytes
+    offset_src: u32,    // in bytes
+    offset_dst: u32,    // in bytes
 
     // Strides (in elements) — may be permuted
     stride_src0: u32,
@@ -56,5 +56,5 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
     let dst_idx = i0 * params.stride_dst0 + i1 * params.stride_dst1 +
                   i2 * params.stride_dst2 + i3 * params.stride_dst3;
 
-    dst[params.dst_offset / 2 + dst_idx] = f16(src[params.src_offset / 4 + src_idx]);
+    dst[params.offset_dst / 2 + dst_idx] = f16(src[params.offset_src / 4 + src_idx]);
 }