Skip to content

Commit a70dd7f

Browse files
committed
Atomic coherency test
This overwrites the compute-shader-hello example to be a test of atomic coherency. My understanding is that with the barriers it should run with 0 failures, even in strategy 0. Using strategy 1 (atomicOr) as a workaround, it appears to work correctly.
1 parent 89e76e7 commit a70dd7f

File tree

2 files changed

+83
-31
lines changed

2 files changed

+83
-31
lines changed

compute-shader-hello/src/main.rs

Lines changed: 34 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@ use wgpu::util::DeviceExt;
2222

2323
use bytemuck;
2424

25+
// A strategy of 0 is just atomic loads.
26+
// A strategy of 1 replaces the flag load with an atomicOr.
27+
const STRATEGY: u32 = 0;
28+
2529
async fn run() {
2630
let instance = wgpu::Instance::new(wgpu::Backends::PRIMARY);
2731
let adapter = instance.request_adapter(&Default::default()).await.unwrap();
@@ -30,7 +34,7 @@ async fn run() {
3034
.request_device(
3135
&wgpu::DeviceDescriptor {
3236
label: None,
33-
features: features & wgpu::Features::TIMESTAMP_QUERY,
37+
features: features & (wgpu::Features::TIMESTAMP_QUERY | wgpu::Features::CLEAR_COMMANDS),
3438
limits: Default::default(),
3539
},
3640
None,
@@ -50,24 +54,19 @@ async fn run() {
5054
let start_instant = Instant::now();
5155
let cs_module = device.create_shader_module(&wgpu::ShaderModuleDescriptor {
5256
label: None,
53-
//source: wgpu::ShaderSource::SpirV(bytes_to_u32(include_bytes!("alu.spv")).into()),
5457
source: wgpu::ShaderSource::Wgsl(include_str!("shader.wgsl").into()),
5558
});
5659
println!("shader compilation {:?}", start_instant.elapsed());
57-
let input_f = &[1.0f32, 2.0f32];
58-
let input : &[u8] = bytemuck::bytes_of(input_f);
59-
let input_buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
60+
let data_buf = device.create_buffer(&wgpu::BufferDescriptor {
6061
label: None,
61-
contents: input,
62-
usage: wgpu::BufferUsages::STORAGE
63-
| wgpu::BufferUsages::COPY_DST
64-
| wgpu::BufferUsages::COPY_SRC,
62+
size: 0x80000,
63+
usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
64+
mapped_at_creation: false,
6565
});
66-
let output_buf = device.create_buffer(&wgpu::BufferDescriptor {
66+
let config_buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
6767
label: None,
68-
size: input.len() as u64,
69-
usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
70-
mapped_at_creation: false,
68+
contents: bytemuck::bytes_of(&[STRATEGY, 0]),
69+
usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::MAP_READ,
7170
});
7271
// This works if the buffer is initialized, otherwise reads all 0, for some reason.
7372
let query_buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
@@ -87,51 +86,60 @@ async fn run() {
8786
let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
8887
label: None,
8988
layout: &bind_group_layout,
90-
entries: &[wgpu::BindGroupEntry {
91-
binding: 0,
92-
resource: input_buf.as_entire_binding(),
93-
}],
89+
entries: &[
90+
wgpu::BindGroupEntry {
91+
binding: 0,
92+
resource: data_buf.as_entire_binding(),
93+
},
94+
wgpu::BindGroupEntry {
95+
binding: 1,
96+
resource: config_buf.as_entire_binding(),
97+
},
98+
],
9499
});
95100

96101
let mut encoder = device.create_command_encoder(&Default::default());
97102
if let Some(query_set) = &query_set {
98103
encoder.write_timestamp(query_set, 0);
99104
}
105+
encoder.clear_buffer(&data_buf, 0, None);
100106
{
101107
let mut cpass = encoder.begin_compute_pass(&Default::default());
102108
cpass.set_pipeline(&pipeline);
103109
cpass.set_bind_group(0, &bind_group, &[]);
104-
cpass.dispatch(input_f.len() as u32, 1, 1);
110+
cpass.dispatch(256, 1, 1);
105111
}
106112
if let Some(query_set) = &query_set {
107113
encoder.write_timestamp(query_set, 1);
108114
}
109-
encoder.copy_buffer_to_buffer(&input_buf, 0, &output_buf, 0, input.len() as u64);
115+
//encoder.copy_buffer_to_buffer(&input_buf, 0, &output_buf, 0, input.len() as u64);
110116
if let Some(query_set) = &query_set {
111117
encoder.resolve_query_set(query_set, 0..2, &query_buf, 0);
112118
}
113119
queue.submit(Some(encoder.finish()));
114120

115-
let buf_slice = output_buf.slice(..);
121+
let buf_slice = config_buf.slice(..);
116122
let buf_future = buf_slice.map_async(wgpu::MapMode::Read);
117123
let query_slice = query_buf.slice(..);
118124
let _query_future = query_slice.map_async(wgpu::MapMode::Read);
119-
println!("pre-poll {:?}", std::time::Instant::now());
120125
device.poll(wgpu::Maintain::Wait);
121-
println!("post-poll {:?}", std::time::Instant::now());
122126
if buf_future.await.is_ok() {
123127
let data_raw = &*buf_slice.get_mapped_range();
124-
let data : &[f32] = bytemuck::cast_slice(data_raw);
125-
println!("data: {:?}", &*data);
128+
let data: &[u32] = bytemuck::cast_slice(data_raw);
129+
println!("failures with strategy {}: {}", data[0], data[1]);
126130
}
127131
if features.contains(wgpu::Features::TIMESTAMP_QUERY) {
128132
let ts_period = queue.get_timestamp_period();
129133
let ts_data_raw = &*query_slice.get_mapped_range();
130-
let ts_data : &[u64] = bytemuck::cast_slice(ts_data_raw);
131-
println!("compute shader elapsed: {:?}ms", (ts_data[1] - ts_data[0]) as f64 * ts_period as f64 * 1e-6);
134+
let ts_data: &[u64] = bytemuck::cast_slice(ts_data_raw);
135+
println!(
136+
"compute shader elapsed: {:?}ms",
137+
(ts_data[1] - ts_data[0]) as f64 * ts_period as f64 * 1e-6
138+
);
132139
}
133140
}
134141

135142
fn main() {
143+
env_logger::init();
136144
pollster::block_on(run());
137145
}

compute-shader-hello/src/shader.wgsl

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,58 @@
1616

1717
[[block]]
1818
struct DataBuf {
19-
data: [[stride(4)]] array<f32>;
19+
data: [[stride(4)]] array<atomic<u32>>;
20+
};
21+
22+
[[block]]
23+
struct ControlBuf {
24+
strategy: u32;
25+
failures: atomic<u32>;
2026
};
2127

2228
[[group(0), binding(0)]]
23-
var<storage, read_write> v_indices: DataBuf;
29+
var<storage, read_write> data_buf: DataBuf;
30+
31+
[[group(0), binding(1)]]
32+
var<storage, read_write> control_buf: ControlBuf;
2433

25-
[[stage(compute), workgroup_size(1)]]
34+
// Put the flag in quite a different place than the data, which
35+
// should increase the number of failures, as they likely won't
36+
// be on the same cache line.
37+
fn permute_flag_ix(data_ix: u32) -> u32 {
38+
return (data_ix * 31u) & 0xffffu;
39+
}
40+
41+
[[stage(compute), workgroup_size(256)]]
2642
fn main([[builtin(global_invocation_id)]] global_id: vec3<u32>) {
27-
// TODO: a more interesting computation than this.
28-
v_indices.data[global_id.x] = v_indices.data[global_id.x] + 42.0;
43+
let ix = global_id.x;
44+
// Originally this was passed in, but is now hardcoded, as D3DCompiler
45+
// thinks control flow becomes nonuniform if it's read from input.
46+
let n_iter = 1024u;
47+
let strategy = control_buf.strategy;
48+
var failures = 0u;
49+
for (var i: u32 = 0u; i < n_iter; i = i + 1u) {
50+
let wr_flag_ix = permute_flag_ix(ix);
51+
data_buf.data[ix * 2u] = i + 1u;
52+
storageBarrier(); // release semantics for writing flag
53+
data_buf.data[wr_flag_ix * 2u + 1u] = i + 1u;
54+
55+
// Read from a different workgroup
56+
let read_ix = ((ix & 0xffu) << 8u) | (ix >> 8u);
57+
let read_flag_ix = permute_flag_ix(read_ix);
58+
59+
let flag = data_buf.data[read_flag_ix * 2u + 1u];
60+
//let flag = atomicOr(&data_buf.data[read_flag_ix * 2u + 1u], 0u);
61+
storageBarrier(); // acquire semantics for reading flag
62+
var data = 0u;
63+
if (strategy == 0u) {
64+
data = data_buf.data[read_ix * 2u];
65+
} else {
66+
data = atomicOr(&data_buf.data[read_ix * 2u], 0u);
67+
}
68+
if (flag > data) {
69+
failures = failures + 1u;
70+
}
71+
}
72+
let unused = atomicAdd(&control_buf.failures, failures);
2973
}

0 commit comments

Comments (0)