From 274d15100ac74a58e6f4da4dd360446b5504d43f Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Tue, 26 Oct 2021 20:30:16 -0700 Subject: [PATCH 1/8] First try at prefix sum Sorta works but deadlocks on larger inputs. --- compute-shader-hello/src/main.rs | 44 +++++++++---- compute-shader-hello/src/shader.wgsl | 93 ++++++++++++++++++++++++++-- 2 files changed, 120 insertions(+), 17 deletions(-) diff --git a/compute-shader-hello/src/main.rs b/compute-shader-hello/src/main.rs index 7888e82..d8a91c7 100644 --- a/compute-shader-hello/src/main.rs +++ b/compute-shader-hello/src/main.rs @@ -22,6 +22,9 @@ use wgpu::util::DeviceExt; use bytemuck; +const N_DATA: usize = 16384; +const WG_SIZE: usize = 16; + async fn run() { let instance = wgpu::Instance::new(wgpu::Backends::PRIMARY); let adapter = instance.request_adapter(&Default::default()).await.unwrap(); @@ -30,7 +33,7 @@ async fn run() { .request_device( &wgpu::DeviceDescriptor { label: None, - features: features & wgpu::Features::TIMESTAMP_QUERY, + features: features & (wgpu::Features::TIMESTAMP_QUERY | wgpu::Features::CLEAR_COMMANDS), limits: Default::default(), }, None, @@ -54,8 +57,8 @@ async fn run() { source: wgpu::ShaderSource::Wgsl(include_str!("shader.wgsl").into()), }); println!("shader compilation {:?}", start_instant.elapsed()); - let input_f = &[1.0f32, 2.0f32]; - let input : &[u8] = bytemuck::bytes_of(input_f); + let input_f: Vec = (0..N_DATA as u32).collect(); + let input: &[u8] = bytemuck::cast_slice(&input_f); let input_buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor { label: None, contents: input, @@ -69,6 +72,15 @@ async fn run() { usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST, mapped_at_creation: false, }); + const N_WG: usize = N_DATA / WG_SIZE; + const STATE_SIZE: usize = N_WG * 3 + 1; + // TODO: round this up + let state_buf = device.create_buffer(&wgpu::BufferDescriptor { + label: None, + size: 4 * STATE_SIZE as u64, + usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, + }); // This works if the buffer is initialized, otherwise reads all 0, for some reason. 
let query_buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor { label: None, @@ -87,21 +99,28 @@ async fn run() { let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor { label: None, layout: &bind_group_layout, - entries: &[wgpu::BindGroupEntry { - binding: 0, - resource: input_buf.as_entire_binding(), - }], + entries: &[ + wgpu::BindGroupEntry { + binding: 0, + resource: input_buf.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 1, + resource: state_buf.as_entire_binding(), + }, + ], }); let mut encoder = device.create_command_encoder(&Default::default()); if let Some(query_set) = &query_set { encoder.write_timestamp(query_set, 0); } + encoder.clear_buffer(&state_buf, 0, None); { let mut cpass = encoder.begin_compute_pass(&Default::default()); cpass.set_pipeline(&pipeline); cpass.set_bind_group(0, &bind_group, &[]); - cpass.dispatch(input_f.len() as u32, 1, 1); + cpass.dispatch(N_WG as u32, 1, 1); } if let Some(query_set) = &query_set { encoder.write_timestamp(query_set, 1); @@ -121,14 +140,17 @@ async fn run() { println!("post-poll {:?}", std::time::Instant::now()); if buf_future.await.is_ok() { let data_raw = &*buf_slice.get_mapped_range(); - let data : &[f32] = bytemuck::cast_slice(data_raw); + let data: &[u32] = bytemuck::cast_slice(data_raw); println!("data: {:?}", &*data); } if features.contains(wgpu::Features::TIMESTAMP_QUERY) { let ts_period = queue.get_timestamp_period(); let ts_data_raw = &*query_slice.get_mapped_range(); - let ts_data : &[u64] = bytemuck::cast_slice(ts_data_raw); - println!("compute shader elapsed: {:?}ms", (ts_data[1] - ts_data[0]) as f64 * ts_period as f64 * 1e-6); + let ts_data: &[u64] = bytemuck::cast_slice(ts_data_raw); + println!( + "compute shader elapsed: {:?}ms", + (ts_data[1] - ts_data[0]) as f64 * ts_period as f64 * 1e-6 + ); } } diff --git a/compute-shader-hello/src/shader.wgsl b/compute-shader-hello/src/shader.wgsl index 468c96e..8e06b4a 100644 --- a/compute-shader-hello/src/shader.wgsl +++ b/compute-shader-hello/src/shader.wgsl @@ -16,14 +16,95 @@ [[block]] struct DataBuf { - data: [[stride(4)]] array; + data: [[stride(4)]] array; +}; + +[[block]] +struct StateBuf { + state: [[stride(4)]] array>; }; [[group(0), binding(0)]] -var v_indices: DataBuf; +var main_buf: DataBuf; + +[[group(0), binding(1)]] +var state_buf: StateBuf; + +let FLAG_NOT_READY = 0u; +let FLAG_AGGREGATE_READY = 1u; +let FLAG_PREFIX_READY = 2u; + +let workgroup_size: u32 = 16u; + +var part_id: u32; +var scratch: array; +var shared_prefix: u32; + +[[stage(compute), workgroup_size(16)]] +fn main([[builtin(local_invocation_id)]] local_id: vec3) { + if (local_id.x == 0u) { + part_id = atomicAdd(&state_buf.state[0], 1u); + } + workgroupBarrier(); + let my_part_id = part_id; + let mem_base = my_part_id * workgroup_size; + var el = main_buf.data[mem_base + local_id.x]; + scratch[local_id.x] = el; + // This must be lg2(workgroup_size) + for (var i: u32 = 0u; i < 4u; i = i + 1u) { + workgroupBarrier(); + if (local_id.x >= (1u << i)) { + el = el + scratch[local_id.x - (1u << i)]; + } + workgroupBarrier(); + scratch[local_id.x] = el; + } + var exclusive_prefix = 0u; + + if (local_id.x == workgroup_size - 1u) { + var flag = FLAG_AGGREGATE_READY; + state_buf.state[my_part_id * 3u + 2u] = el; + if (my_part_id == 0u) { + state_buf.state[my_part_id * 3u + 3u] = el; + flag = FLAG_PREFIX_READY; + } + // TODO: these storage barriers should probably be in + // uniform control flow, but enforcing that is a pain. 
+ storageBarrier(); + state_buf.state[my_part_id * 3u + 1u] = flag; + + if (my_part_id != 0u) { + // decoupled look-back + var look_back_ix = my_part_id - 1u; + loop { + flag = state_buf.state[look_back_ix * 3u + 1u]; + storageBarrier(); + if (flag == FLAG_PREFIX_READY) { + let their_prefix = state_buf.state[look_back_ix * 3u + 3u]; + exclusive_prefix = their_prefix + exclusive_prefix; + break; + } elseif (flag == FLAG_AGGREGATE_READY) { + let their_agg = state_buf.state[look_back_ix * 3u + 2u]; + exclusive_prefix = their_agg + exclusive_prefix; + look_back_ix = look_back_ix - 1u; + } + // else spin + } + + // compute inclusive prefix + let inclusive_prefix = exclusive_prefix + el; + shared_prefix = exclusive_prefix; + state_buf.state[my_part_id * 3u + 3u] = inclusive_prefix; + storageBarrier(); + state_buf.state[my_part_id * 3u + 1u] = FLAG_PREFIX_READY; + } + } + var prefix = 0u; + workgroupBarrier(); + if (my_part_id != 0u) { + prefix = shared_prefix; + } -[[stage(compute), workgroup_size(1)]] -fn main([[builtin(global_invocation_id)]] global_id: vec3) { - // TODO: a more interesting computation than this. - v_indices.data[global_id.x] = v_indices.data[global_id.x] + 42.0; + // do final output + main_buf.data[mem_base + local_id.x] = prefix + el; } From 2800b7391e6b46131576d8116c7c6ba091bc5767 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Tue, 26 Oct 2021 20:38:59 -0700 Subject: [PATCH 2/8] Make storage barriers uniform control flow Still doesn't fix deadlocks tho :/ --- compute-shader-hello/src/shader.wgsl | 48 ++++++++++++++++++---------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/compute-shader-hello/src/shader.wgsl b/compute-shader-hello/src/shader.wgsl index 8e06b4a..b90ab30 100644 --- a/compute-shader-hello/src/shader.wgsl +++ b/compute-shader-hello/src/shader.wgsl @@ -39,6 +39,7 @@ let workgroup_size: u32 = 16u; var part_id: u32; var scratch: array; var shared_prefix: u32; +var shared_flag: u32; [[stage(compute), workgroup_size(16)]] fn main([[builtin(local_invocation_id)]] local_id: vec3) { @@ -61,41 +62,54 @@ fn main([[builtin(local_invocation_id)]] local_id: vec3) { } var exclusive_prefix = 0u; + var flag = FLAG_AGGREGATE_READY; if (local_id.x == workgroup_size - 1u) { - var flag = FLAG_AGGREGATE_READY; state_buf.state[my_part_id * 3u + 2u] = el; if (my_part_id == 0u) { state_buf.state[my_part_id * 3u + 3u] = el; flag = FLAG_PREFIX_READY; } - // TODO: these storage barriers should probably be in - // uniform control flow, but enforcing that is a pain. 
- storageBarrier(); + } + // make sure these barriers are in uniform control flow + storageBarrier(); + if (local_id.x == workgroup_size - 1u) { state_buf.state[my_part_id * 3u + 1u] = flag; + } - if (my_part_id != 0u) { - // decoupled look-back - var look_back_ix = my_part_id - 1u; - loop { - flag = state_buf.state[look_back_ix * 3u + 1u]; - storageBarrier(); - if (flag == FLAG_PREFIX_READY) { + if (my_part_id != 0u) { + // decoupled look-back + var look_back_ix = my_part_id - 1u; + loop { + if (local_id.x == workgroup_size - 1u) { + shared_flag = state_buf.state[look_back_ix * 3u + 1u]; + } + workgroupBarrier(); + flag = shared_flag; + storageBarrier(); + if (flag == FLAG_PREFIX_READY) { + if (local_id.x == workgroup_size - 1u) { let their_prefix = state_buf.state[look_back_ix * 3u + 3u]; exclusive_prefix = their_prefix + exclusive_prefix; - break; - } elseif (flag == FLAG_AGGREGATE_READY) { + } + break; + } elseif (flag == FLAG_AGGREGATE_READY) { + if (local_id.x == workgroup_size - 1u) { let their_agg = state_buf.state[look_back_ix * 3u + 2u]; exclusive_prefix = their_agg + exclusive_prefix; - look_back_ix = look_back_ix - 1u; } - // else spin + look_back_ix = look_back_ix - 1u; } + // else spin + } - // compute inclusive prefix + // compute inclusive prefix + if (local_id.x == workgroup_size - 1u) { let inclusive_prefix = exclusive_prefix + el; shared_prefix = exclusive_prefix; state_buf.state[my_part_id * 3u + 3u] = inclusive_prefix; - storageBarrier(); + } + storageBarrier(); + if (local_id.x == workgroup_size - 1u) { state_buf.state[my_part_id * 3u + 1u] = FLAG_PREFIX_READY; } } From 5fc557ce7d6b514db0e09c85568e88b17a1bfc4d Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Mon, 1 Nov 2021 11:41:39 -0700 Subject: [PATCH 3/8] Verify results Still WIP --- compute-shader-hello/src/main.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/compute-shader-hello/src/main.rs b/compute-shader-hello/src/main.rs index d8a91c7..ffc756d 100644 --- a/compute-shader-hello/src/main.rs +++ b/compute-shader-hello/src/main.rs @@ -22,9 +22,14 @@ use wgpu::util::DeviceExt; use bytemuck; -const N_DATA: usize = 16384; +const N_DATA: usize = 1024; const WG_SIZE: usize = 16; +// Verify that the data is OEIS A000217 +fn verify(data: &[u32]) -> bool { + data.iter().enumerate().all(|(i, val)| (i * (i + 1)) / 2 == *val as usize) +} + async fn run() { let instance = wgpu::Instance::new(wgpu::Backends::PRIMARY); let adapter = instance.request_adapter(&Default::default()).await.unwrap(); @@ -141,7 +146,7 @@ async fn run() { if buf_future.await.is_ok() { let data_raw = &*buf_slice.get_mapped_range(); let data: &[u32] = bytemuck::cast_slice(data_raw); - println!("data: {:?}", &*data); + println!("results correct: {}", verify(data)); } if features.contains(wgpu::Features::TIMESTAMP_QUERY) { let ts_period = queue.get_timestamp_period(); From 17094ab6855275e18fc5043c6fa676d1377d0d0a Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Tue, 2 Nov 2021 14:52:34 -0700 Subject: [PATCH 4/8] Larger workgroup Fastest results on AMD at workgroup = 1024. Note, this has atomicOr workaround for correctness. Also note, not all targets will support a workgroup of this size; on shipping, we'd need to query and select at runtime. 
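As an aside (not something this patch does), runtime selection could look roughly like the sketch below. It assumes a wgpu version whose Limits struct exposes the compute workgroup limits named here, and the helper name is invented for illustration; the WGSL source would additionally need to be specialized to the chosen size (for example by substituting it into the shader string) before creating the pipeline.

    // Hypothetical sketch: pick the largest power-of-two workgroup size the
    // adapter supports, capped at the 1024 we would like to use.
    fn choose_wg_size(adapter: &wgpu::Adapter) -> u32 {
        let limits = adapter.limits();
        let max = limits
            .max_compute_workgroup_size_x
            .min(limits.max_compute_invocations_per_workgroup)
            .min(1024);
        // Round down to a power of two so the lg2(workgroup_size) loop bound
        // in the shader stays an integer.
        1u32 << (31 - max.leading_zeros())
    }
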
--- compute-shader-hello/src/main.rs | 4 ++-- compute-shader-hello/src/shader.wgsl | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/compute-shader-hello/src/main.rs b/compute-shader-hello/src/main.rs index ffc756d..bb1fb70 100644 --- a/compute-shader-hello/src/main.rs +++ b/compute-shader-hello/src/main.rs @@ -22,8 +22,8 @@ use wgpu::util::DeviceExt; use bytemuck; -const N_DATA: usize = 1024; -const WG_SIZE: usize = 16; +const N_DATA: usize = 65536; +const WG_SIZE: usize = 1024; // Verify that the data is OEIS A000217 fn verify(data: &[u32]) -> bool { diff --git a/compute-shader-hello/src/shader.wgsl b/compute-shader-hello/src/shader.wgsl index b90ab30..7d2280d 100644 --- a/compute-shader-hello/src/shader.wgsl +++ b/compute-shader-hello/src/shader.wgsl @@ -34,14 +34,14 @@ let FLAG_NOT_READY = 0u; let FLAG_AGGREGATE_READY = 1u; let FLAG_PREFIX_READY = 2u; -let workgroup_size: u32 = 16u; +let workgroup_size: u32 = 1024u; var part_id: u32; var scratch: array; var shared_prefix: u32; var shared_flag: u32; -[[stage(compute), workgroup_size(16)]] +[[stage(compute), workgroup_size(1024)]] fn main([[builtin(local_invocation_id)]] local_id: vec3) { if (local_id.x == 0u) { part_id = atomicAdd(&state_buf.state[0], 1u); @@ -52,7 +52,7 @@ fn main([[builtin(local_invocation_id)]] local_id: vec3) { var el = main_buf.data[mem_base + local_id.x]; scratch[local_id.x] = el; // This must be lg2(workgroup_size) - for (var i: u32 = 0u; i < 4u; i = i + 1u) { + for (var i: u32 = 0u; i < 10u; i = i + 1u) { workgroupBarrier(); if (local_id.x >= (1u << i)) { el = el + scratch[local_id.x - (1u << i)]; @@ -81,20 +81,20 @@ fn main([[builtin(local_invocation_id)]] local_id: vec3) { var look_back_ix = my_part_id - 1u; loop { if (local_id.x == workgroup_size - 1u) { - shared_flag = state_buf.state[look_back_ix * 3u + 1u]; + shared_flag = atomicOr(&state_buf.state[look_back_ix * 3u + 1u], 0u); } workgroupBarrier(); flag = shared_flag; storageBarrier(); if (flag == FLAG_PREFIX_READY) { if (local_id.x == workgroup_size - 1u) { - let their_prefix = state_buf.state[look_back_ix * 3u + 3u]; + let their_prefix = atomicOr(&state_buf.state[look_back_ix * 3u + 3u], 0u); exclusive_prefix = their_prefix + exclusive_prefix; } break; } elseif (flag == FLAG_AGGREGATE_READY) { if (local_id.x == workgroup_size - 1u) { - let their_agg = state_buf.state[look_back_ix * 3u + 2u]; + let their_agg = atomicOr(&state_buf.state[look_back_ix * 3u + 2u], 0u); exclusive_prefix = their_agg + exclusive_prefix; } look_back_ix = look_back_ix - 1u; From 8174880f03e9e974a4211c05deb0cd7e347d819a Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Tue, 2 Nov 2021 16:02:17 -0700 Subject: [PATCH 5/8] Sequential section Do a small sequential scan at the leaf of the hierarchy. That amortizes both the workgroup-scope tree reduction and the (still sequential) decoupled look-back to a larger number of inputs. Note: this falls short of a real performance evaluation because there's no attempt to warm up the GPU clock. But it's valid as a very rough swag. 
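To make the indexing concrete, here is a rough CPU model of how one partition is scanned after this change (illustrative only, not code from the patch; the helper name is invented). Each of the workgroup's threads owns N_SEQ consecutive elements and reduces them serially, the workgroup-scope scan then runs over the per-thread sums, and each thread finally replays its run offset by its exclusive per-thread sum plus the partition prefix obtained from look-back.

    // CPU sketch of one partition; data.len() == wg_size * n_seq.
    fn scan_partition(data: &mut [u32], wg_size: usize, n_seq: usize, prefix: u32) {
        assert_eq!(data.len(), wg_size * n_seq);
        // 1. Each "thread" reduces its n_seq elements sequentially.
        let sums: Vec<u32> = data.chunks(n_seq).map(|c| c.iter().sum()).collect();
        // 2. Exclusive scan over the per-thread sums (the workgroup reduction).
        let mut excl = vec![0u32; wg_size];
        for i in 1..wg_size {
            excl[i] = excl[i - 1] + sums[i - 1];
        }
        // 3. Each thread replays its run, adding the partition prefix and its
        //    exclusive per-thread offset, producing the inclusive scan.
        for (chunk, base) in data.chunks_mut(n_seq).zip(excl) {
            let mut acc = prefix + base;
            for v in chunk.iter_mut() {
                acc = acc + *v;
                *v = acc;
            }
        }
    }
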
--- compute-shader-hello/src/main.rs | 17 +++++++++++------ compute-shader-hello/src/shader.wgsl | 23 ++++++++++++++++++----- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/compute-shader-hello/src/main.rs b/compute-shader-hello/src/main.rs index bb1fb70..cf66f5a 100644 --- a/compute-shader-hello/src/main.rs +++ b/compute-shader-hello/src/main.rs @@ -22,12 +22,18 @@ use wgpu::util::DeviceExt; use bytemuck; -const N_DATA: usize = 65536; -const WG_SIZE: usize = 1024; +const N_DATA: usize = 1 << 20; +const WG_SIZE: usize = 1 << 10; // Verify that the data is OEIS A000217 -fn verify(data: &[u32]) -> bool { - data.iter().enumerate().all(|(i, val)| (i * (i + 1)) / 2 == *val as usize) +fn verify(data: &[u32]) -> Option { + data.iter().enumerate().position(|(i, val)| { + let wrong = ((i * (i + 1)) / 2) as u32 != *val; + if wrong { + println!("diff @ {}: {} != {}", i, ((i * (i + 1)) / 2) as u32, *val); + } + wrong + }) } async fn run() { @@ -68,7 +74,6 @@ async fn run() { label: None, contents: input, usage: wgpu::BufferUsages::STORAGE - | wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::COPY_SRC, }); let output_buf = device.create_buffer(&wgpu::BufferDescriptor { @@ -146,7 +151,7 @@ async fn run() { if buf_future.await.is_ok() { let data_raw = &*buf_slice.get_mapped_range(); let data: &[u32] = bytemuck::cast_slice(data_raw); - println!("results correct: {}", verify(data)); + println!("results correct: {:?}", verify(data)); } if features.contains(wgpu::Features::TIMESTAMP_QUERY) { let ts_period = queue.get_timestamp_period(); diff --git a/compute-shader-hello/src/shader.wgsl b/compute-shader-hello/src/shader.wgsl index 7d2280d..fad4212 100644 --- a/compute-shader-hello/src/shader.wgsl +++ b/compute-shader-hello/src/shader.wgsl @@ -34,14 +34,15 @@ let FLAG_NOT_READY = 0u; let FLAG_AGGREGATE_READY = 1u; let FLAG_PREFIX_READY = 2u; -let workgroup_size: u32 = 1024u; +let workgroup_size: u32 = 256u; +let N_SEQ = 4u; var part_id: u32; var scratch: array; var shared_prefix: u32; var shared_flag: u32; -[[stage(compute), workgroup_size(1024)]] +[[stage(compute), workgroup_size(256)]] fn main([[builtin(local_invocation_id)]] local_id: vec3) { if (local_id.x == 0u) { part_id = atomicAdd(&state_buf.state[0], 1u); @@ -49,10 +50,16 @@ fn main([[builtin(local_invocation_id)]] local_id: vec3) { workgroupBarrier(); let my_part_id = part_id; let mem_base = my_part_id * workgroup_size; - var el = main_buf.data[mem_base + local_id.x]; + var local: array; + var el = main_buf.data[(mem_base + local_id.x) * N_SEQ]; + local[0] = el; + for (var i: u32 = 1u; i < N_SEQ; i = i + 1u) { + el = el + main_buf.data[(mem_base + local_id.x) * N_SEQ + i]; + local[i] = el; + } scratch[local_id.x] = el; // This must be lg2(workgroup_size) - for (var i: u32 = 0u; i < 10u; i = i + 1u) { + for (var i: u32 = 0u; i < 8u; i = i + 1u) { workgroupBarrier(); if (local_id.x >= (1u << i)) { el = el + scratch[local_id.x - (1u << i)]; @@ -120,5 +127,11 @@ fn main([[builtin(local_invocation_id)]] local_id: vec3) { } // do final output - main_buf.data[mem_base + local_id.x] = prefix + el; + for (var i: u32 = 0u; i < N_SEQ; i = i + 1u) { + var old = 0u; + if (local_id.x > 0u) { + old = scratch[local_id.x - 1u]; + } + main_buf.data[(mem_base + local_id.x) * N_SEQ + i] = prefix + old + local[i]; + } } From 93ad8ee0b1d83bb377f66541f77942f2b3ec7da2 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Tue, 2 Nov 2021 16:47:29 -0700 Subject: [PATCH 6/8] Iterate runs Better for performance analaysis --- compute-shader-hello/src/main.rs | 84 
+++++++++++++++++--------------- 1 file changed, 44 insertions(+), 40 deletions(-) diff --git a/compute-shader-hello/src/main.rs b/compute-shader-hello/src/main.rs index cf66f5a..da4221e 100644 --- a/compute-shader-hello/src/main.rs +++ b/compute-shader-hello/src/main.rs @@ -22,7 +22,7 @@ use wgpu::util::DeviceExt; use bytemuck; -const N_DATA: usize = 1 << 20; +const N_DATA: usize = 1 << 25; const WG_SIZE: usize = 1 << 10; // Verify that the data is OEIS A000217 @@ -121,46 +121,50 @@ async fn run() { ], }); - let mut encoder = device.create_command_encoder(&Default::default()); - if let Some(query_set) = &query_set { - encoder.write_timestamp(query_set, 0); - } - encoder.clear_buffer(&state_buf, 0, None); - { - let mut cpass = encoder.begin_compute_pass(&Default::default()); - cpass.set_pipeline(&pipeline); - cpass.set_bind_group(0, &bind_group, &[]); - cpass.dispatch(N_WG as u32, 1, 1); - } - if let Some(query_set) = &query_set { - encoder.write_timestamp(query_set, 1); - } - encoder.copy_buffer_to_buffer(&input_buf, 0, &output_buf, 0, input.len() as u64); - if let Some(query_set) = &query_set { - encoder.resolve_query_set(query_set, 0..2, &query_buf, 0); - } - queue.submit(Some(encoder.finish())); + for i in 0..10 { + let mut encoder = device.create_command_encoder(&Default::default()); + if let Some(query_set) = &query_set { + encoder.write_timestamp(query_set, 0); + } + encoder.clear_buffer(&state_buf, 0, None); + { + let mut cpass = encoder.begin_compute_pass(&Default::default()); + cpass.set_pipeline(&pipeline); + cpass.set_bind_group(0, &bind_group, &[]); + cpass.dispatch(N_WG as u32, 1, 1); + } + if let Some(query_set) = &query_set { + encoder.write_timestamp(query_set, 1); + } + encoder.copy_buffer_to_buffer(&input_buf, 0, &output_buf, 0, input.len() as u64); + if let Some(query_set) = &query_set { + encoder.resolve_query_set(query_set, 0..2, &query_buf, 0); + } + queue.submit(Some(encoder.finish())); - let buf_slice = output_buf.slice(..); - let buf_future = buf_slice.map_async(wgpu::MapMode::Read); - let query_slice = query_buf.slice(..); - let _query_future = query_slice.map_async(wgpu::MapMode::Read); - println!("pre-poll {:?}", std::time::Instant::now()); - device.poll(wgpu::Maintain::Wait); - println!("post-poll {:?}", std::time::Instant::now()); - if buf_future.await.is_ok() { - let data_raw = &*buf_slice.get_mapped_range(); - let data: &[u32] = bytemuck::cast_slice(data_raw); - println!("results correct: {:?}", verify(data)); - } - if features.contains(wgpu::Features::TIMESTAMP_QUERY) { - let ts_period = queue.get_timestamp_period(); - let ts_data_raw = &*query_slice.get_mapped_range(); - let ts_data: &[u64] = bytemuck::cast_slice(ts_data_raw); - println!( - "compute shader elapsed: {:?}ms", - (ts_data[1] - ts_data[0]) as f64 * ts_period as f64 * 1e-6 - ); + let buf_slice = output_buf.slice(..); + let buf_future = buf_slice.map_async(wgpu::MapMode::Read); + let query_slice = query_buf.slice(..); + let _query_future = query_slice.map_async(wgpu::MapMode::Read); + device.poll(wgpu::Maintain::Wait); + if buf_future.await.is_ok() { + let data_raw = &*buf_slice.get_mapped_range(); + let data: &[u32] = bytemuck::cast_slice(data_raw); + if i == 0 { + println!("results correct: {:?}", verify(data)); + } + } + if features.contains(wgpu::Features::TIMESTAMP_QUERY) { + let ts_period = queue.get_timestamp_period(); + let ts_data_raw = &*query_slice.get_mapped_range(); + let ts_data: &[u64] = bytemuck::cast_slice(ts_data_raw); + println!( + "compute shader elapsed: {:?}ms", + 
(ts_data[1] - ts_data[0]) as f64 * ts_period as f64 * 1e-6 + ); + } + output_buf.unmap(); + query_buf.unmap(); } } From 697ea4ecf10c2a36b98fa6d101473f2a7bfbf495 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Tue, 2 Nov 2021 17:25:44 -0700 Subject: [PATCH 7/8] Go fast Performance measurement requires keeping the GPU busy. That means not copying results back to CPU and doing verification there. --- compute-shader-hello/src/main.rs | 34 ++++++++++++++++------------ compute-shader-hello/src/shader.wgsl | 8 +++---- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/compute-shader-hello/src/main.rs b/compute-shader-hello/src/main.rs index da4221e..2334db4 100644 --- a/compute-shader-hello/src/main.rs +++ b/compute-shader-hello/src/main.rs @@ -23,7 +23,7 @@ use wgpu::util::DeviceExt; use bytemuck; const N_DATA: usize = 1 << 25; -const WG_SIZE: usize = 1 << 10; +const WG_SIZE: usize = 1 << 12; // Verify that the data is OEIS A000217 fn verify(data: &[u32]) -> Option { @@ -121,7 +121,7 @@ async fn run() { ], }); - for i in 0..10 { + for i in 0..100 { let mut encoder = device.create_command_encoder(&Default::default()); if let Some(query_set) = &query_set { encoder.write_timestamp(query_set, 0); @@ -136,7 +136,9 @@ async fn run() { if let Some(query_set) = &query_set { encoder.write_timestamp(query_set, 1); } - encoder.copy_buffer_to_buffer(&input_buf, 0, &output_buf, 0, input.len() as u64); + if i == 0 { + encoder.copy_buffer_to_buffer(&input_buf, 0, &output_buf, 0, input.len() as u64); + } if let Some(query_set) = &query_set { encoder.resolve_query_set(query_set, 0..2, &query_buf, 0); } @@ -145,25 +147,27 @@ async fn run() { let buf_slice = output_buf.slice(..); let buf_future = buf_slice.map_async(wgpu::MapMode::Read); let query_slice = query_buf.slice(..); - let _query_future = query_slice.map_async(wgpu::MapMode::Read); + let query_future = query_slice.map_async(wgpu::MapMode::Read); device.poll(wgpu::Maintain::Wait); if buf_future.await.is_ok() { - let data_raw = &*buf_slice.get_mapped_range(); - let data: &[u32] = bytemuck::cast_slice(data_raw); if i == 0 { + let data_raw = &*buf_slice.get_mapped_range(); + let data: &[u32] = bytemuck::cast_slice(data_raw); println!("results correct: {:?}", verify(data)); } + output_buf.unmap(); } - if features.contains(wgpu::Features::TIMESTAMP_QUERY) { - let ts_period = queue.get_timestamp_period(); - let ts_data_raw = &*query_slice.get_mapped_range(); - let ts_data: &[u64] = bytemuck::cast_slice(ts_data_raw); - println!( - "compute shader elapsed: {:?}ms", - (ts_data[1] - ts_data[0]) as f64 * ts_period as f64 * 1e-6 - ); + if query_future.await.is_ok() { + if features.contains(wgpu::Features::TIMESTAMP_QUERY) { + let ts_period = queue.get_timestamp_period(); + let ts_data_raw = &*query_slice.get_mapped_range(); + let ts_data: &[u64] = bytemuck::cast_slice(ts_data_raw); + println!( + "compute shader elapsed: {:?}ms", + (ts_data[1] - ts_data[0]) as f64 * ts_period as f64 * 1e-6 + ); + } } - output_buf.unmap(); query_buf.unmap(); } } diff --git a/compute-shader-hello/src/shader.wgsl b/compute-shader-hello/src/shader.wgsl index fad4212..25fbc98 100644 --- a/compute-shader-hello/src/shader.wgsl +++ b/compute-shader-hello/src/shader.wgsl @@ -34,15 +34,15 @@ let FLAG_NOT_READY = 0u; let FLAG_AGGREGATE_READY = 1u; let FLAG_PREFIX_READY = 2u; -let workgroup_size: u32 = 256u; -let N_SEQ = 4u; +let workgroup_size: u32 = 512u; +let N_SEQ = 8u; var part_id: u32; var scratch: array; var shared_prefix: u32; var shared_flag: u32; -[[stage(compute), 
workgroup_size(256)]] +[[stage(compute), workgroup_size(512)]] fn main([[builtin(local_invocation_id)]] local_id: vec3) { if (local_id.x == 0u) { part_id = atomicAdd(&state_buf.state[0], 1u); @@ -59,7 +59,7 @@ fn main([[builtin(local_invocation_id)]] local_id: vec3) { } scratch[local_id.x] = el; // This must be lg2(workgroup_size) - for (var i: u32 = 0u; i < 8u; i = i + 1u) { + for (var i: u32 = 0u; i < 9u; i = i + 1u) { workgroupBarrier(); if (local_id.x >= (1u << i)) { el = el + scratch[local_id.x - (1u << i)]; From 87e5b201a119210649077357214fc88fed552b72 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Wed, 3 Nov 2021 17:19:47 -0700 Subject: [PATCH 8/8] Use explicit atomic stores Naga will accept ordinary loads and stores to atomic types, but tint will not. --- compute-shader-hello/src/shader.wgsl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/compute-shader-hello/src/shader.wgsl b/compute-shader-hello/src/shader.wgsl index 25fbc98..233586d 100644 --- a/compute-shader-hello/src/shader.wgsl +++ b/compute-shader-hello/src/shader.wgsl @@ -71,16 +71,16 @@ fn main([[builtin(local_invocation_id)]] local_id: vec3) { var flag = FLAG_AGGREGATE_READY; if (local_id.x == workgroup_size - 1u) { - state_buf.state[my_part_id * 3u + 2u] = el; + atomicStore(&state_buf.state[my_part_id * 3u + 2u], el); if (my_part_id == 0u) { - state_buf.state[my_part_id * 3u + 3u] = el; + atomicStore(&state_buf.state[my_part_id * 3u + 3u], el); flag = FLAG_PREFIX_READY; } } // make sure these barriers are in uniform control flow storageBarrier(); if (local_id.x == workgroup_size - 1u) { - state_buf.state[my_part_id * 3u + 1u] = flag; + atomicStore(&state_buf.state[my_part_id * 3u + 1u], flag); } if (my_part_id != 0u) { @@ -113,11 +113,11 @@ fn main([[builtin(local_invocation_id)]] local_id: vec3) { if (local_id.x == workgroup_size - 1u) { let inclusive_prefix = exclusive_prefix + el; shared_prefix = exclusive_prefix; - state_buf.state[my_part_id * 3u + 3u] = inclusive_prefix; + atomicStore(&state_buf.state[my_part_id * 3u + 3u], inclusive_prefix); } storageBarrier(); if (local_id.x == workgroup_size - 1u) { - state_buf.state[my_part_id * 3u + 1u] = FLAG_PREFIX_READY; + atomicStore(&state_buf.state[my_part_id * 3u + 1u], FLAG_PREFIX_READY); } } var prefix = 0u;
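
To summarize the protocol the series has built up, here is a single-threaded Rust model of the decoupled look-back (illustrative only; the struct and function names are invented, and the GPU's spin-wait is collapsed into an unreachable case). Each partition publishes its own aggregate under FLAG_AGGREGATE_READY, then walks backwards until some earlier partition has published an inclusive prefix under FLAG_PREFIX_READY; partition 0 always publishes FLAG_PREFIX_READY, so the walk terminates.

    const FLAG_NOT_READY: u32 = 0;
    const FLAG_AGGREGATE_READY: u32 = 1;
    const FLAG_PREFIX_READY: u32 = 2;

    struct PartitionState {
        flag: u32,
        aggregate: u32,         // sum of this partition only
        inclusive_prefix: u32,  // sum of everything up to and including it
    }

    // Returns the exclusive prefix for partition `my_part_id` (> 0).
    fn look_back(states: &[PartitionState], my_part_id: usize) -> u32 {
        assert!(my_part_id > 0, "partition 0 has nothing to look back at");
        let mut exclusive_prefix = 0u32;
        let mut ix = my_part_id - 1;
        loop {
            match states[ix].flag {
                FLAG_PREFIX_READY => {
                    // An earlier partition already accounts for everything
                    // before it, so we are done.
                    return states[ix].inclusive_prefix + exclusive_prefix;
                }
                FLAG_AGGREGATE_READY => {
                    // Fold in that partition's sum and keep walking back.
                    exclusive_prefix = exclusive_prefix + states[ix].aggregate;
                    ix -= 1;
                }
                FLAG_NOT_READY => {
                    // The shader spins here until the flag is published; a
                    // sequential model never hits this case.
                    unreachable!("predecessor not ready")
                }
                _ => unreachable!("unknown flag"),
            }
        }
    }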