Skip to content

Commit a70dd7f

Browse files
committed
Atomic coherency test
This overwrites the compute-shader-hello example to be a test of atomic coherency. My understanding is that with the barriers it should run with 0 failures, even in strategy 0. Using strategy 1 (atomicOr) as a workaround, it appears to work correctly.
1 parent 89e76e7 commit a70dd7f

File tree

2 files changed

+83
-31
lines changed

2 files changed

+83
-31
lines changed

compute-shader-hello/src/main.rs

Lines changed: 34 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@ use wgpu::util::DeviceExt;
2222

2323
use bytemuck;
2424

25+
// A strategy of 0 is just atomic loads.
26+
// A strategy of 1 replaces the flag load with an atomicOr.
27+
const STRATEGY: u32 = 0;
28+
2529
async fn run() {
2630
let instance = wgpu::Instance::new(wgpu::Backends::PRIMARY);
2731
let adapter = instance.request_adapter(&Default::default()).await.unwrap();
@@ -30,7 +34,7 @@ async fn run() {
3034
.request_device(
3135
&wgpu::DeviceDescriptor {
3236
label: None,
33-
features: features & wgpu::Features::TIMESTAMP_QUERY,
37+
features: features & (wgpu::Features::TIMESTAMP_QUERY | wgpu::Features::CLEAR_COMMANDS),
3438
limits: Default::default(),
3539
},
3640
None,
@@ -50,24 +54,19 @@ async fn run() {
5054
let start_instant = Instant::now();
5155
let cs_module = device.create_shader_module(&wgpu::ShaderModuleDescriptor {
5256
label: None,
53-
//source: wgpu::ShaderSource::SpirV(bytes_to_u32(include_bytes!("alu.spv")).into()),
5457
source: wgpu::ShaderSource::Wgsl(include_str!("shader.wgsl").into()),
5558
});
5659
println!("shader compilation {:?}", start_instant.elapsed());
57-
let input_f = &[1.0f32, 2.0f32];
58-
let input : &[u8] = bytemuck::bytes_of(input_f);
59-
let input_buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
60+
let data_buf = device.create_buffer(&wgpu::BufferDescriptor {
6061
label: None,
61-
contents: input,
62-
usage: wgpu::BufferUsages::STORAGE
63-
| wgpu::BufferUsages::COPY_DST
64-
| wgpu::BufferUsages::COPY_SRC,
62+
size: 0x80000,
63+
usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
64+
mapped_at_creation: false,
6565
});
66-
let output_buf = device.create_buffer(&wgpu::BufferDescriptor {
66+
let config_buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
6767
label: None,
68-
size: input.len() as u64,
69-
usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
70-
mapped_at_creation: false,
68+
contents: bytemuck::bytes_of(&[STRATEGY, 0]),
69+
usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::MAP_READ,
7170
});
7271
// This works if the buffer is initialized, otherwise reads all 0, for some reason.
7372
let query_buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
@@ -87,51 +86,60 @@ async fn run() {
8786
let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
8887
label: None,
8988
layout: &bind_group_layout,
90-
entries: &[wgpu::BindGroupEntry {
91-
binding: 0,
92-
resource: input_buf.as_entire_binding(),
93-
}],
89+
entries: &[
90+
wgpu::BindGroupEntry {
91+
binding: 0,
92+
resource: data_buf.as_entire_binding(),
93+
},
94+
wgpu::BindGroupEntry {
95+
binding: 1,
96+
resource: config_buf.as_entire_binding(),
97+
},
98+
],
9499
});
95100

96101
let mut encoder = device.create_command_encoder(&Default::default());
97102
if let Some(query_set) = &query_set {
98103
encoder.write_timestamp(query_set, 0);
99104
}
105+
encoder.clear_buffer(&data_buf, 0, None);
100106
{
101107
let mut cpass = encoder.begin_compute_pass(&Default::default());
102108
cpass.set_pipeline(&pipeline);
103109
cpass.set_bind_group(0, &bind_group, &[]);
104-
cpass.dispatch(input_f.len() as u32, 1, 1);
110+
cpass.dispatch(256, 1, 1);
105111
}
106112
if let Some(query_set) = &query_set {
107113
encoder.write_timestamp(query_set, 1);
108114
}
109-
encoder.copy_buffer_to_buffer(&input_buf, 0, &output_buf, 0, input.len() as u64);
115+
//encoder.copy_buffer_to_buffer(&input_buf, 0, &output_buf, 0, input.len() as u64);
110116
if let Some(query_set) = &query_set {
111117
encoder.resolve_query_set(query_set, 0..2, &query_buf, 0);
112118
}
113119
queue.submit(Some(encoder.finish()));
114120

115-
let buf_slice = output_buf.slice(..);
121+
let buf_slice = config_buf.slice(..);
116122
let buf_future = buf_slice.map_async(wgpu::MapMode::Read);
117123
let query_slice = query_buf.slice(..);
118124
let _query_future = query_slice.map_async(wgpu::MapMode::Read);
119-
println!("pre-poll {:?}", std::time::Instant::now());
120125
device.poll(wgpu::Maintain::Wait);
121-
println!("post-poll {:?}", std::time::Instant::now());
122126
if buf_future.await.is_ok() {
123127
let data_raw = &*buf_slice.get_mapped_range();
124-
let data : &[f32] = bytemuck::cast_slice(data_raw);
125-
println!("data: {:?}", &*data);
128+
let data: &[u32] = bytemuck::cast_slice(data_raw);
129+
println!("failures with strategy {}: {}", data[0], data[1]);
126130
}
127131
if features.contains(wgpu::Features::TIMESTAMP_QUERY) {
128132
let ts_period = queue.get_timestamp_period();
129133
let ts_data_raw = &*query_slice.get_mapped_range();
130-
let ts_data : &[u64] = bytemuck::cast_slice(ts_data_raw);
131-
println!("compute shader elapsed: {:?}ms", (ts_data[1] - ts_data[0]) as f64 * ts_period as f64 * 1e-6);
134+
let ts_data: &[u64] = bytemuck::cast_slice(ts_data_raw);
135+
println!(
136+
"compute shader elapsed: {:?}ms",
137+
(ts_data[1] - ts_data[0]) as f64 * ts_period as f64 * 1e-6
138+
);
132139
}
133140
}
134141

135142
fn main() {
143+
env_logger::init();
136144
pollster::block_on(run());
137145
}

compute-shader-hello/src/shader.wgsl

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,58 @@
1616

1717
[[block]]
1818
struct DataBuf {
19-
data: [[stride(4)]] array<f32>;
19+
data: [[stride(4)]] array<atomic<u32>>;
20+
};
21+
22+
[[block]]
23+
struct ControlBuf {
24+
strategy: u32;
25+
failures: atomic<u32>;
2026
};
2127

2228
[[group(0), binding(0)]]
23-
var<storage, read_write> v_indices: DataBuf;
29+
var<storage, read_write> data_buf: DataBuf;
30+
31+
[[group(0), binding(1)]]
32+
var<storage, read_write> control_buf: ControlBuf;
2433

25-
[[stage(compute), workgroup_size(1)]]
34+
// Put the flag in quite a different place than the data, which
35+
// should increase the number of failures, as they likely won't
36+
// be on the same cache line.
37+
fn permute_flag_ix(data_ix: u32) -> u32 {
38+
return (data_ix * 31u) & 0xffffu;
39+
}
40+
41+
[[stage(compute), workgroup_size(256)]]
2642
fn main([[builtin(global_invocation_id)]] global_id: vec3<u32>) {
27-
// TODO: a more interesting computation than this.
28-
v_indices.data[global_id.x] = v_indices.data[global_id.x] + 42.0;
43+
let ix = global_id.x;
44+
// Originally this was passed in, but is now hardcoded, as D3DCompiler
45+
// thinks control flow becomes nonuniform if it's read from input.
46+
let n_iter = 1024u;
47+
let strategy = control_buf.strategy;
48+
var failures = 0u;
49+
for (var i: u32 = 0u; i < n_iter; i = i + 1u) {
50+
let wr_flag_ix = permute_flag_ix(ix);
51+
data_buf.data[ix * 2u] = i + 1u;
52+
storageBarrier(); // release semantics for writing flag
53+
data_buf.data[wr_flag_ix * 2u + 1u] = i + 1u;
54+
55+
// Read from a different workgroup
56+
let read_ix = ((ix & 0xffu) << 8u) | (ix >> 8u);
57+
let read_flag_ix = permute_flag_ix(read_ix);
58+
59+
let flag = data_buf.data[read_flag_ix * 2u + 1u];
60+
//let flag = atomicOr(&data_buf.data[read_flag_ix * 2u + 1u], 0u);
61+
storageBarrier(); // acquire semantics for reading flag
62+
var data = 0u;
63+
if (strategy == 0u) {
64+
data = data_buf.data[read_ix * 2u];
65+
} else {
66+
data = atomicOr(&data_buf.data[read_ix * 2u], 0u);
67+
}
68+
if (flag > data) {
69+
failures = failures + 1u;
70+
}
71+
}
72+
let unused = atomicAdd(&control_buf.failures, failures);
2973
}

0 commit comments

Comments (0)