From 0594d07cfa847c5cf202dda1cac8d81c6fdbc00b Mon Sep 17 00:00:00 2001
From: Raph Levien
Date: Tue, 26 Oct 2021 20:30:16 -0700
Subject: [PATCH 1/5] Atomic coherency test

This overwrites the compute-shader-hello example to be a test of atomic
coherency. My understanding is that with the barriers it should run with
0 failures, even in strategy 0. With strategy 1 (atomicOr) as a
workaround, it seems to be working.
---
 compute-shader-hello/Cargo.lock      |  18 +++++
 compute-shader-hello/Cargo.toml      |   2 +-
 compute-shader-hello/src/main.rs     | 117 +++++++++++++++++++--------
 compute-shader-hello/src/shader.wgsl |  54 +++++++++++--
 4 files changed, 153 insertions(+), 38 deletions(-)

diff --git a/compute-shader-hello/Cargo.lock b/compute-shader-hello/Cargo.lock
index b9e26f0..db28b50 100644
--- a/compute-shader-hello/Cargo.lock
+++ b/compute-shader-hello/Cargo.lock
@@ -204,6 +204,12 @@ dependencies = [
  "termcolor",
 ]
 
+[[package]]
+name = "fixedbitset"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "398ea4fabe40b9b0d885340a2a991a44c8a645624075ad966d21f88688e2b69e"
+
 [[package]]
 name = "foreign-types"
 version = "0.3.2"
@@ -445,6 +451,7 @@ dependencies = [
  "indexmap",
  "log",
  "num-traits",
+ "petgraph",
  "spirv",
  "thiserror",
 ]
@@ -502,6 +509,16 @@ dependencies = [
  "winapi",
 ]
 
+[[package]]
+name = "petgraph"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4a13a2fa9d0b63e5f22328828741e523766fff0ee9e779316902290dff3f824f"
+dependencies = [
+ "fixedbitset",
+ "indexmap",
+]
+
 [[package]]
 name = "pollster"
 version = "0.2.4"
@@ -753,6 +770,7 @@ dependencies = [
  "arrayvec",
  "js-sys",
  "log",
+ "naga",
  "parking_lot",
  "raw-window-handle",
  "smallvec",
diff --git a/compute-shader-hello/Cargo.toml b/compute-shader-hello/Cargo.toml
index 2916859..95869b3 100644
--- a/compute-shader-hello/Cargo.toml
+++ b/compute-shader-hello/Cargo.toml
@@ -7,7 +7,7 @@ edition = "2018"
 resolver = "2"
 
 [dependencies]
-wgpu = "0.11.0"
+wgpu = { version = "0.11.0", features = ["spirv"] }
 env_logger = "0.8"
 pollster = "0.2"
 bytemuck = { version = "1.7", features = ["derive"] }
diff --git a/compute-shader-hello/src/main.rs b/compute-shader-hello/src/main.rs
index 7888e82..d5ed56d 100644
--- a/compute-shader-hello/src/main.rs
+++ b/compute-shader-hello/src/main.rs
@@ -22,15 +22,25 @@ use wgpu::util::DeviceExt;
 
 use bytemuck;
 
+// A strategy of 0 is just atomic loads.
+// A strategy of 1 replaces the flag load with an atomicOr.
+const STRATEGY: u32 = 0;
+
+const USE_SPIRV: bool = false;
+
 async fn run() {
     let instance = wgpu::Instance::new(wgpu::Backends::PRIMARY);
     let adapter = instance.request_adapter(&Default::default()).await.unwrap();
     let features = adapter.features();
+    let mut feature_mask = wgpu::Features::TIMESTAMP_QUERY | wgpu::Features::CLEAR_COMMANDS;
+    if USE_SPIRV {
+        feature_mask |= wgpu::Features::SPIRV_SHADER_PASSTHROUGH;
+    }
     let (device, queue) = adapter
         .request_device(
             &wgpu::DeviceDescriptor {
                 label: None,
-                features: features & wgpu::Features::TIMESTAMP_QUERY,
+                features: features & feature_mask,
                 limits: Default::default(),
             },
             None,
@@ -48,26 +58,33 @@ async fn run() {
         None
     };
 
     let start_instant = Instant::now();
-    let cs_module = device.create_shader_module(&wgpu::ShaderModuleDescriptor {
-        label: None,
-        //source: wgpu::ShaderSource::SpirV(bytes_to_u32(include_bytes!("alu.spv")).into()),
-        source: wgpu::ShaderSource::Wgsl(include_str!("shader.wgsl").into()),
-    });
+    let cs_module = if USE_SPIRV {
+        let shader_src: &[u32] = bytemuck::cast_slice(include_bytes!("shader.spv"));
+        unsafe {
+            device.create_shader_module_spirv(&wgpu::ShaderModuleDescriptorSpirV {
+                label: None,
+                source: std::borrow::Cow::Owned(shader_src.into()),
+            })
+        }
+    } else {
+        device.create_shader_module(&wgpu::ShaderModuleDescriptor {
+            label: None,
+            source: wgpu::ShaderSource::Wgsl(include_str!("shader.wgsl").into()),
+        })
+    };
+
+    println!("shader compilation {:?}", start_instant.elapsed());
-    let input_f = &[1.0f32, 2.0f32];
-    let input : &[u8] = bytemuck::bytes_of(input_f);
-    let input_buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+    let data_buf = device.create_buffer(&wgpu::BufferDescriptor {
         label: None,
-        contents: input,
-        usage: wgpu::BufferUsages::STORAGE
-            | wgpu::BufferUsages::COPY_DST
-            | wgpu::BufferUsages::COPY_SRC,
+        size: 0x80000,
+        usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
+        mapped_at_creation: false,
     });
-    let output_buf = device.create_buffer(&wgpu::BufferDescriptor {
+    let config_buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
         label: None,
-        size: input.len() as u64,
-        usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
-        mapped_at_creation: false,
+        contents: bytemuck::bytes_of(&[STRATEGY, 0]),
+        usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::MAP_READ,
     });
     // This works if the buffer is initialized, otherwise reads all 0, for some reason.
     let query_buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
@@ -76,62 +93,98 @@ async fn run() {
+    let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+        label: None,
+        entries: &[wgpu::BindGroupLayoutEntry {
+            binding: 0,
+            visibility: wgpu::ShaderStages::COMPUTE,
+            ty: wgpu::BindingType::Buffer {
+                ty: wgpu::BufferBindingType::Storage { read_only: false },
+                has_dynamic_offset: false,
+                min_binding_size: None,
+            },
+            count: None,
+        },
+        wgpu::BindGroupLayoutEntry {
+            binding: 1,
+            visibility: wgpu::ShaderStages::COMPUTE,
+            ty: wgpu::BindingType::Buffer {
+                ty: wgpu::BufferBindingType::Storage { read_only: false },
+                has_dynamic_offset: false,
+                min_binding_size: None,
+            },
+            count: None,
+        }],
+    });
+    let compute_pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
+        label: None,
+        bind_group_layouts: &[&bind_group_layout],
+        push_constant_ranges: &[],
+    });
     let pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
         label: None,
-        layout: None,
+        layout: Some(&compute_pipeline_layout),
         module: &cs_module,
         entry_point: "main",
     });
-    let bind_group_layout = pipeline.get_bind_group_layout(0);
     let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
         label: None,
         layout: &bind_group_layout,
-        entries: &[wgpu::BindGroupEntry {
-            binding: 0,
-            resource: input_buf.as_entire_binding(),
-        }],
+        entries: &[
+            wgpu::BindGroupEntry {
+                binding: 0,
+                resource: data_buf.as_entire_binding(),
+            },
+            wgpu::BindGroupEntry {
+                binding: 1,
+                resource: config_buf.as_entire_binding(),
+            },
+        ],
     });
 
     let mut encoder = device.create_command_encoder(&Default::default());
     if let Some(query_set) = &query_set {
         encoder.write_timestamp(query_set, 0);
     }
+    encoder.clear_buffer(&data_buf, 0, None);
     {
         let mut cpass = encoder.begin_compute_pass(&Default::default());
         cpass.set_pipeline(&pipeline);
         cpass.set_bind_group(0, &bind_group, &[]);
-        cpass.dispatch(input_f.len() as u32, 1, 1);
+        cpass.dispatch(256, 1, 1);
     }
     if let Some(query_set) = &query_set {
         encoder.write_timestamp(query_set, 1);
     }
-    encoder.copy_buffer_to_buffer(&input_buf, 0, &output_buf, 0, input.len() as u64);
+    //encoder.copy_buffer_to_buffer(&input_buf, 0, &output_buf, 0, input.len() as u64);
     if let Some(query_set) = &query_set {
         encoder.resolve_query_set(query_set, 0..2, &query_buf, 0);
     }
     queue.submit(Some(encoder.finish()));
 
-    let buf_slice = output_buf.slice(..);
+    let buf_slice = config_buf.slice(..);
     let buf_future = buf_slice.map_async(wgpu::MapMode::Read);
     let query_slice = query_buf.slice(..);
     let _query_future = query_slice.map_async(wgpu::MapMode::Read);
-    println!("pre-poll {:?}", std::time::Instant::now());
     device.poll(wgpu::Maintain::Wait);
-    println!("post-poll {:?}", std::time::Instant::now());
     if buf_future.await.is_ok() {
         let data_raw = &*buf_slice.get_mapped_range();
-        let data : &[f32] = bytemuck::cast_slice(data_raw);
-        println!("data: {:?}", &*data);
+        let data: &[u32] = bytemuck::cast_slice(data_raw);
+        println!("failures with strategy {}: {}", data[0], data[1]);
     }
     if features.contains(wgpu::Features::TIMESTAMP_QUERY) {
         let ts_period = queue.get_timestamp_period();
         let ts_data_raw = &*query_slice.get_mapped_range();
-        let ts_data : &[u64] = bytemuck::cast_slice(ts_data_raw);
-        println!("compute shader elapsed: {:?}ms", (ts_data[1] - ts_data[0]) as f64 * ts_period as f64 * 1e-6);
+        let ts_data: &[u64] = bytemuck::cast_slice(ts_data_raw);
+        println!(
+            "compute shader elapsed: {:?}ms",
+            (ts_data[1] - ts_data[0]) as f64 * ts_period as f64 * 1e-6
+        );
     }
 }
 
 fn main() {
+    env_logger::init();
     pollster::block_on(run());
 }
diff --git a/compute-shader-hello/src/shader.wgsl b/compute-shader-hello/src/shader.wgsl
index 468c96e..4abcfb6 100644
--- a/compute-shader-hello/src/shader.wgsl
+++ b/compute-shader-hello/src/shader.wgsl
@@ -16,14 +16,58 @@
 
 [[block]]
 struct DataBuf {
-    data: [[stride(4)]] array<f32>;
+    data: [[stride(4)]] array<atomic<u32>>;
+};
+
+[[block]]
+struct ControlBuf {
+    strategy: u32;
+    failures: atomic<u32>;
 };
 
 [[group(0), binding(0)]]
-var<storage, read_write> v_indices: DataBuf;
+var<storage, read_write> data_buf: DataBuf;
+
+[[group(0), binding(1)]]
+var<storage, read_write> control_buf: ControlBuf;
 
-[[stage(compute), workgroup_size(1)]]
+// Put the flag in quite a different place than the data, which
+// should increase the number of failures, as they likely won't
+// be on the same cache line.
+fn permute_flag_ix(data_ix: u32) -> u32 {
+    return (data_ix * 31u) & 0xffffu;
+}
+
+[[stage(compute), workgroup_size(256)]]
 fn main([[builtin(global_invocation_id)]] global_id: vec3<u32>) {
-    // TODO: a more interesting computation than this.
-    v_indices.data[global_id.x] = v_indices.data[global_id.x] + 42.0;
+    let ix = global_id.x;
+    // Originally this was passed in, but is now hardcoded, as D3DCompiler
+    // thinks control flow becomes nonuniform if it's read from input.
+    let n_iter = 1024u;
+    let strategy = control_buf.strategy;
+    var failures = 0u;
+    for (var i: u32 = 0u; i < n_iter; i = i + 1u) {
+        let wr_flag_ix = permute_flag_ix(ix);
+        atomicStore(&data_buf.data[ix * 2u], i + 1u);
+        storageBarrier(); // release semantics for writing flag
+        atomicStore(&data_buf.data[wr_flag_ix * 2u + 1u], i + 1u);
+
+        // Read from a different workgroup
+        let read_ix = ((ix & 0xffu) << 8u) | (ix >> 8u);
+        let read_flag_ix = permute_flag_ix(read_ix);
+
+        let flag = atomicLoad(&data_buf.data[read_flag_ix * 2u + 1u]);
+        //let flag = atomicOr(&data_buf.data[read_flag_ix * 2u + 1u], 0u);
+        storageBarrier(); // acquire semantics for reading flag
+        var data = 0u;
+        if (strategy == 0u) {
+            data = atomicLoad(&data_buf.data[read_ix * 2u]);
+        } else {
+            data = atomicOr(&data_buf.data[read_ix * 2u], 0u);
+        }
+        if (flag > data) {
+            failures = failures + 1u;
+        }
+    }
+    let unused = atomicAdd(&control_buf.failures, failures);
 }

From fc7dc5dbbc459cbc6b55d06f31b661f2e6214c77 Mon Sep 17 00:00:00 2001
From: Raph Levien
Date: Fri, 29 Oct 2021 09:31:49 -0700
Subject: [PATCH 2/5] Use mixed atomics / non-atomics

This is probably the version of the test I want to promote.
---
 compute-shader-hello/src/shader.wgsl | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/compute-shader-hello/src/shader.wgsl b/compute-shader-hello/src/shader.wgsl
index 4abcfb6..85f1511 100644
--- a/compute-shader-hello/src/shader.wgsl
+++ b/compute-shader-hello/src/shader.wgsl
@@ -14,9 +14,14 @@
 //
 // Also licensed under MIT license, at your choice.
 
+struct Element {
+    data: u32;
+    flag: atomic<u32>;
+};
+
 [[block]]
 struct DataBuf {
-    data: [[stride(4)]] array<atomic<u32>>;
+    data: [[stride(8)]] array<Element>;
 };
 
 [[block]]
@@ -48,23 +53,17 @@ fn main([[builtin(global_invocation_id)]] global_id: vec3<u32>) {
     var failures = 0u;
     for (var i: u32 = 0u; i < n_iter; i = i + 1u) {
         let wr_flag_ix = permute_flag_ix(ix);
-        atomicStore(&data_buf.data[ix * 2u], i + 1u);
+        data_buf.data[ix].data = i + 1u;
         storageBarrier(); // release semantics for writing flag
-        atomicStore(&data_buf.data[wr_flag_ix * 2u + 1u], i + 1u);
+        atomicStore(&data_buf.data[wr_flag_ix].flag, i + 1u);
 
         // Read from a different workgroup
         let read_ix = ((ix & 0xffu) << 8u) | (ix >> 8u);
         let read_flag_ix = permute_flag_ix(read_ix);
 
-        let flag = atomicLoad(&data_buf.data[read_flag_ix * 2u + 1u]);
-        //let flag = atomicOr(&data_buf.data[read_flag_ix * 2u + 1u], 0u);
+        let flag = atomicLoad(&data_buf.data[read_flag_ix].flag);
         storageBarrier(); // acquire semantics for reading flag
-        var data = 0u;
-        if (strategy == 0u) {
-            data = atomicLoad(&data_buf.data[read_ix * 2u]);
-        } else {
-            data = atomicOr(&data_buf.data[read_ix * 2u], 0u);
-        }
+        let data = data_buf.data[read_ix].data;
         if (flag > data) {
             failures = failures + 1u;
         }

From cd4c4bbd51e5e080e2b04dfd8f0b9d090e085d16 Mon Sep 17 00:00:00 2001
From: Raph Levien
Date: Fri, 29 Oct 2021 22:12:13 -0700
Subject: [PATCH 3/5] Simplified test

The test is both simpler and more reliable in catching problems.
There's no looping in the shader, so it's basically just the classic
"message passing" litmus test, just run in parallel and with a simple
permutation (multiplication by primes mod 64k) to mix up memory
accesses. It also runs 1000 dispatches.

This reliably catches thousands of failures on my 5700XT.
---
 compute-shader-hello/src/main.rs     | 134 ++++++++++++---------------
 compute-shader-hello/src/shader.wgsl |  35 +++----
 2 files changed, 73 insertions(+), 96 deletions(-)

diff --git a/compute-shader-hello/src/main.rs b/compute-shader-hello/src/main.rs
index d5ed56d..59831fa 100644
--- a/compute-shader-hello/src/main.rs
+++ b/compute-shader-hello/src/main.rs
@@ -16,16 +16,8 @@
 
 //! A simple application to run a compute shader.
 
-use std::time::Instant;
-
-use wgpu::util::DeviceExt;
-
 use bytemuck;
 
-// A strategy of 0 is just atomic loads.
-// A strategy of 1 replaces the flag load with an atomicOr.
-const STRATEGY: u32 = 0;
-
 const USE_SPIRV: bool = false;
 
 async fn run() {
@@ -57,14 +49,13 @@ async fn run() {
         None
     };
 
-    let start_instant = Instant::now();
     let cs_module = if USE_SPIRV {
         let shader_src: &[u32] = bytemuck::cast_slice(include_bytes!("shader.spv"));
         unsafe {
             device.create_shader_module_spirv(&wgpu::ShaderModuleDescriptorSpirV {
                 label: None,
                 source: std::borrow::Cow::Owned(shader_src.into()),
-            })
+            })
         }
     } else {
         device.create_shader_module(&wgpu::ShaderModuleDescriptor {
@@ -72,49 +63,46 @@ async fn run() {
             source: wgpu::ShaderSource::Wgsl(include_str!("shader.wgsl").into()),
         })
     };
-
-    println!("shader compilation {:?}", start_instant.elapsed());
     let data_buf = device.create_buffer(&wgpu::BufferDescriptor {
         label: None,
         size: 0x80000,
         usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
         mapped_at_creation: false,
     });
-    let config_buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
-        label: None,
-        contents: bytemuck::bytes_of(&[STRATEGY, 0]),
-        usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::MAP_READ,
-    });
-    // This works if the buffer is initialized, otherwise reads all 0, for some reason.
-    let query_buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+    let config_buf = device.create_buffer(&wgpu::BufferDescriptor {
         label: None,
-        contents: &[0; 16],
-        usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
+        size: 8,
+        usage: wgpu::BufferUsages::STORAGE
+            | wgpu::BufferUsages::MAP_READ
+            | wgpu::BufferUsages::COPY_DST,
+        mapped_at_creation: false,
     });
 
     let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
         label: None,
-        entries: &[wgpu::BindGroupLayoutEntry {
-            binding: 0,
-            visibility: wgpu::ShaderStages::COMPUTE,
-            ty: wgpu::BindingType::Buffer {
-                ty: wgpu::BufferBindingType::Storage { read_only: false },
-                has_dynamic_offset: false,
-                min_binding_size: None,
+        entries: &[
+            wgpu::BindGroupLayoutEntry {
+                binding: 0,
+                visibility: wgpu::ShaderStages::COMPUTE,
+                ty: wgpu::BindingType::Buffer {
+                    ty: wgpu::BufferBindingType::Storage { read_only: false },
+                    has_dynamic_offset: false,
+                    min_binding_size: None,
+                },
+                count: None,
             },
-            count: None,
-        },
-        wgpu::BindGroupLayoutEntry {
-            binding: 1,
-            visibility: wgpu::ShaderStages::COMPUTE,
-            ty: wgpu::BindingType::Buffer {
-                ty: wgpu::BufferBindingType::Storage { read_only: false },
-                has_dynamic_offset: false,
-                min_binding_size: None,
+            wgpu::BindGroupLayoutEntry {
+                binding: 1,
+                visibility: wgpu::ShaderStages::COMPUTE,
+                ty: wgpu::BindingType::Buffer {
+                    ty: wgpu::BufferBindingType::Storage { read_only: false },
+                    has_dynamic_offset: false,
+                    min_binding_size: None,
+                },
+                count: None,
             },
-            count: None,
-        }],
+        ],
     });
     let compute_pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
         label: None,
@@ -143,44 +131,40 @@ async fn run() {
         ],
     });
 
-    let mut encoder = device.create_command_encoder(&Default::default());
-    if let Some(query_set) = &query_set {
-        encoder.write_timestamp(query_set, 0);
-    }
-    encoder.clear_buffer(&data_buf, 0, None);
-    {
-        let mut cpass = encoder.begin_compute_pass(&Default::default());
-        cpass.set_pipeline(&pipeline);
-        cpass.set_bind_group(0, &bind_group, &[]);
-        cpass.dispatch(256, 1, 1);
-    }
-    if let Some(query_set) = &query_set {
-        encoder.write_timestamp(query_set, 1);
-    }
-    //encoder.copy_buffer_to_buffer(&input_buf, 0, &output_buf, 0, input.len() as u64);
-    if let Some(query_set) = &query_set {
-        encoder.resolve_query_set(query_set, 0..2, &query_buf, 0);
-    }
-    queue.submit(Some(encoder.finish()));
+    let mut failures = 0;
+    for i in 0..1000 {
+        let mut encoder = device.create_command_encoder(&Default::default());
+        if let Some(query_set) = &query_set {
+            encoder.write_timestamp(query_set, 0);
+        }
+        encoder.clear_buffer(&config_buf, 0, None);
+        encoder.clear_buffer(&data_buf, 0, None);
+        {
+            let mut cpass = encoder.begin_compute_pass(&Default::default());
+            cpass.set_pipeline(&pipeline);
+            cpass.set_bind_group(0, &bind_group, &[]);
+            cpass.dispatch(256, 1, 1);
+        }
+        queue.submit(Some(encoder.finish()));
 
-    let buf_slice = config_buf.slice(..);
-    let buf_future = buf_slice.map_async(wgpu::MapMode::Read);
-    let query_slice = query_buf.slice(..);
-    let _query_future = query_slice.map_async(wgpu::MapMode::Read);
-    device.poll(wgpu::Maintain::Wait);
-    if buf_future.await.is_ok() {
-        let data_raw = &*buf_slice.get_mapped_range();
-        let data: &[u32] = bytemuck::cast_slice(data_raw);
-        println!("failures with strategy {}: {}", data[0], data[1]);
+        let buf_slice = config_buf.slice(..);
+        let buf_future = buf_slice.map_async(wgpu::MapMode::Read);
+        device.poll(wgpu::Maintain::Wait);
+        if buf_future.await.is_ok() {
+            let data_raw = buf_slice.get_mapped_range();
+            let data: &[u32] = bytemuck::cast_slice(&*data_raw);
+            if data[1] != 0 {
+                if failures == 0 {
+                    println!("first failing iteration {}, failures: {}", i, data[1]);
+                }
+                failures += data[1];
+            }
+            std::mem::drop(data_raw);
+            config_buf.unmap();
+        }
     }
-    if features.contains(wgpu::Features::TIMESTAMP_QUERY) {
-        let ts_period = queue.get_timestamp_period();
-        let ts_data_raw = &*query_slice.get_mapped_range();
-        let ts_data: &[u64] = bytemuck::cast_slice(ts_data_raw);
-        println!(
-            "compute shader elapsed: {:?}ms",
-            (ts_data[1] - ts_data[0]) as f64 * ts_period as f64 * 1e-6
-        );
+    if failures != 0 {
+        println!("{} total failures", failures);
     }
 }
diff --git a/compute-shader-hello/src/shader.wgsl b/compute-shader-hello/src/shader.wgsl
index 85f1511..f928ba6 100644
--- a/compute-shader-hello/src/shader.wgsl
+++ b/compute-shader-hello/src/shader.wgsl
@@ -40,33 +40,26 @@ var<storage, read_write> control_buf: ControlBuf;
 // should increase the number of failures, as they likely won't
 // be on the same cache line.
 fn permute_flag_ix(data_ix: u32) -> u32 {
-    return (data_ix * 31u) & 0xffffu;
+    return (data_ix * 419u) & 0xffffu;
 }
 
 [[stage(compute), workgroup_size(256)]]
 fn main([[builtin(global_invocation_id)]] global_id: vec3<u32>) {
     let ix = global_id.x;
-    // Originally this was passed in, but is now hardcoded, as D3DCompiler
-    // thinks control flow becomes nonuniform if it's read from input.
-    let n_iter = 1024u;
-    let strategy = control_buf.strategy;
-    var failures = 0u;
-    for (var i: u32 = 0u; i < n_iter; i = i + 1u) {
-        let wr_flag_ix = permute_flag_ix(ix);
-        data_buf.data[ix].data = i + 1u;
-        storageBarrier(); // release semantics for writing flag
-        atomicStore(&data_buf.data[wr_flag_ix].flag, i + 1u);
 
-        // Read from a different workgroup
-        let read_ix = ((ix & 0xffu) << 8u) | (ix >> 8u);
-        let read_flag_ix = permute_flag_ix(read_ix);
+    let wr_flag_ix = permute_flag_ix(ix);
+    data_buf.data[ix].data = 1u;
+    storageBarrier(); // release semantics for writing flag
+    atomicStore(&data_buf.data[wr_flag_ix].flag, 1u);
 
-        let flag = atomicLoad(&data_buf.data[read_flag_ix].flag);
-        storageBarrier(); // acquire semantics for reading flag
-        let data = data_buf.data[read_ix].data;
-        if (flag > data) {
-            failures = failures + 1u;
-        }
+    // Read from a different workgroup
+    let read_ix = (ix * 4099u) & 0xffffu;
+    let read_flag_ix = permute_flag_ix(read_ix);
+
+    let flag = atomicLoad(&data_buf.data[read_flag_ix].flag);
+    storageBarrier(); // acquire semantics for reading flag
+    let data = data_buf.data[read_ix].data;
+    if (flag > data) {
+        let unused = atomicAdd(&control_buf.failures, 1u);
     }
-    let unused = atomicAdd(&control_buf.failures, failures);
 }

From 629effe56ec854f677a673b9299103be7efbed56 Mon Sep 17 00:00:00 2001
From: Raph Levien
Date: Sun, 31 Oct 2021 08:51:47 -0700
Subject: [PATCH 4/5] Use make_spirv

Slightly cleaner setup of shader source.
---
 compute-shader-hello/src/main.rs | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/compute-shader-hello/src/main.rs b/compute-shader-hello/src/main.rs
index 59831fa..9921620 100644
--- a/compute-shader-hello/src/main.rs
+++ b/compute-shader-hello/src/main.rs
@@ -49,20 +49,15 @@ async fn run() {
         None
     };
 
-    let cs_module = if USE_SPIRV {
-        let shader_src: &[u32] = bytemuck::cast_slice(include_bytes!("shader.spv"));
-        unsafe {
-            device.create_shader_module_spirv(&wgpu::ShaderModuleDescriptorSpirV {
-                label: None,
-                source: std::borrow::Cow::Owned(shader_src.into()),
-            })
-        }
+    let source = if USE_SPIRV {
+        wgpu::util::make_spirv(include_bytes!("shader.spv"))
     } else {
-        device.create_shader_module(&wgpu::ShaderModuleDescriptor {
-            label: None,
-            source: wgpu::ShaderSource::Wgsl(include_str!("shader.wgsl").into()),
-        })
+        wgpu::ShaderSource::Wgsl(include_str!("shader.wgsl").into())
     };
+    let cs_module = device.create_shader_module(&wgpu::ShaderModuleDescriptor {
+        label: None,
+        source,
+    });
 
     let data_buf = device.create_buffer(&wgpu::BufferDescriptor {
         label: None,

From 55843246f06841bca73ab10e42c5f3d65ba2a48f Mon Sep 17 00:00:00 2001
From: Raph Levien
Date: Wed, 3 Nov 2021 15:41:13 -0700
Subject: [PATCH 5/5] Switch test to use atomics only

The previous version of the test had a race between a nonatomic store
and a nonatomic load, because the load was not conditional on the
atomic flag. Adding an "if" guard makes the weak behavior observed on
AMD 5700 XT go away.

If those memory accesses are changed to atomics, then it's no longer a
race, so the observed reorderings seem to be actual failures of the GPU
relative to the Vulkan memory model.

A typical run has 64M measurements (64k per dispatch, 1000 iterations),
of which about 5000 have a flag = 1 but data = 0.
---
 compute-shader-hello/src/shader.wgsl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/compute-shader-hello/src/shader.wgsl b/compute-shader-hello/src/shader.wgsl
index f928ba6..eb10001 100644
--- a/compute-shader-hello/src/shader.wgsl
+++ b/compute-shader-hello/src/shader.wgsl
@@ -15,7 +15,7 @@
 // Also licensed under MIT license, at your choice.
 
 struct Element {
-    data: u32;
+    data: atomic<u32>;
     flag: atomic<u32>;
 };
 
@@ -48,7 +48,7 @@ fn main([[builtin(global_invocation_id)]] global_id: vec3<u32>) {
     let ix = global_id.x;
 
     let wr_flag_ix = permute_flag_ix(ix);
-    data_buf.data[ix].data = 1u;
+    atomicStore(&data_buf.data[ix].data, 1u);
     storageBarrier(); // release semantics for writing flag
     atomicStore(&data_buf.data[wr_flag_ix].flag, 1u);
 
@@ -58,7 +58,7 @@ fn main([[builtin(global_invocation_id)]] global_id: vec3<u32>) {
 
     let flag = atomicLoad(&data_buf.data[read_flag_ix].flag);
     storageBarrier(); // acquire semantics for reading flag
-    let data = data_buf.data[read_ix].data;
+    let data = atomicLoad(&data_buf.data[read_ix].data);
     if (flag > data) {
         let unused = atomicAdd(&control_buf.failures, 1u);
     }
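
For reference, the shader at the end of the series is essentially the classic message-passing
litmus test run in parallel: each invocation publishes its data, releases a flag, then plays
reader against another invocation. The sketch below is assembled by stacking the hunks above;
the comments and exact formatting are editorial, but the declarations and logic follow the
final patch state.

struct Element {
    data: atomic<u32>;
    flag: atomic<u32>;
};

[[block]]
struct DataBuf {
    data: [[stride(8)]] array<Element>;
};

[[block]]
struct ControlBuf {
    strategy: u32;
    failures: atomic<u32>;
};

[[group(0), binding(0)]]
var<storage, read_write> data_buf: DataBuf;

[[group(0), binding(1)]]
var<storage, read_write> control_buf: ControlBuf;

// Multiply by a prime and mask to 64k so a flag lands far from its data,
// most likely on a different cache line.
fn permute_flag_ix(data_ix: u32) -> u32 {
    return (data_ix * 419u) & 0xffffu;
}

[[stage(compute), workgroup_size(256)]]
fn main([[builtin(global_invocation_id)]] global_id: vec3<u32>) {
    let ix = global_id.x;

    // Writer side: publish the data, then release the flag.
    let wr_flag_ix = permute_flag_ix(ix);
    atomicStore(&data_buf.data[ix].data, 1u);
    storageBarrier(); // release semantics for writing flag
    atomicStore(&data_buf.data[wr_flag_ix].flag, 1u);

    // Reader side: acquire another invocation's flag, then read its data.
    let read_ix = (ix * 4099u) & 0xffffu;
    let read_flag_ix = permute_flag_ix(read_ix);

    let flag = atomicLoad(&data_buf.data[read_flag_ix].flag);
    storageBarrier(); // acquire semantics for reading flag
    let data = atomicLoad(&data_buf.data[read_ix].data);
    if (flag > data) {
        // The flag was observed as set while the matching data still read 0.
        let unused = atomicAdd(&control_buf.failures, 1u);
    }
}

As the final commit message argues, once both accesses are atomic there is no data race, so
observing flag = 1u together with data = 0u would be a genuine reordering relative to the
Vulkan memory model; the roughly 5000 failures per 64M measurements quoted above are what make
the 5700 XT result notable.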