From 274d15100ac74a58e6f4da4dd360446b5504d43f Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Tue, 26 Oct 2021 20:30:16 -0700 Subject: [PATCH 1/8] First try at prefix sum Sorta works but deadlocks on larger inputs. --- compute-shader-hello/src/main.rs | 44 +++++++++---- compute-shader-hello/src/shader.wgsl | 93 ++++++++++++++++++++++++++-- 2 files changed, 120 insertions(+), 17 deletions(-) diff --git a/compute-shader-hello/src/main.rs b/compute-shader-hello/src/main.rs index 7888e82..d8a91c7 100644 --- a/compute-shader-hello/src/main.rs +++ b/compute-shader-hello/src/main.rs @@ -22,6 +22,9 @@ use wgpu::util::DeviceExt; use bytemuck; +const N_DATA: usize = 16384; +const WG_SIZE: usize = 16; + async fn run() { let instance = wgpu::Instance::new(wgpu::Backends::PRIMARY); let adapter = instance.request_adapter(&Default::default()).await.unwrap(); @@ -30,7 +33,7 @@ async fn run() { .request_device( &wgpu::DeviceDescriptor { label: None, - features: features & wgpu::Features::TIMESTAMP_QUERY, + features: features & (wgpu::Features::TIMESTAMP_QUERY | wgpu::Features::CLEAR_COMMANDS), limits: Default::default(), }, None, @@ -54,8 +57,8 @@ async fn run() { source: wgpu::ShaderSource::Wgsl(include_str!("shader.wgsl").into()), }); println!("shader compilation {:?}", start_instant.elapsed()); - let input_f = &[1.0f32, 2.0f32]; - let input : &[u8] = bytemuck::bytes_of(input_f); + let input_f: Vec = (0..N_DATA as u32).collect(); + let input: &[u8] = bytemuck::cast_slice(&input_f); let input_buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor { label: None, contents: input, @@ -69,6 +72,15 @@ async fn run() { usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST, mapped_at_creation: false, }); + const N_WG: usize = N_DATA / WG_SIZE; + const STATE_SIZE: usize = N_WG * 3 + 1; + // TODO: round this up + let state_buf = device.create_buffer(&wgpu::BufferDescriptor { + label: None, + size: 4 * STATE_SIZE as u64, + usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST, + mapped_at_creation: false, + }); // This works if the buffer is initialized, otherwise reads all 0, for some reason. 
let query_buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor { label: None, @@ -87,21 +99,28 @@ async fn run() { let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor { label: None, layout: &bind_group_layout, - entries: &[wgpu::BindGroupEntry { - binding: 0, - resource: input_buf.as_entire_binding(), - }], + entries: &[ + wgpu::BindGroupEntry { + binding: 0, + resource: input_buf.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 1, + resource: state_buf.as_entire_binding(), + }, + ], }); let mut encoder = device.create_command_encoder(&Default::default()); if let Some(query_set) = &query_set { encoder.write_timestamp(query_set, 0); } + encoder.clear_buffer(&state_buf, 0, None); { let mut cpass = encoder.begin_compute_pass(&Default::default()); cpass.set_pipeline(&pipeline); cpass.set_bind_group(0, &bind_group, &[]); - cpass.dispatch(input_f.len() as u32, 1, 1); + cpass.dispatch(N_WG as u32, 1, 1); } if let Some(query_set) = &query_set { encoder.write_timestamp(query_set, 1); @@ -121,14 +140,17 @@ async fn run() { println!("post-poll {:?}", std::time::Instant::now()); if buf_future.await.is_ok() { let data_raw = &*buf_slice.get_mapped_range(); - let data : &[f32] = bytemuck::cast_slice(data_raw); + let data: &[u32] = bytemuck::cast_slice(data_raw); println!("data: {:?}", &*data); } if features.contains(wgpu::Features::TIMESTAMP_QUERY) { let ts_period = queue.get_timestamp_period(); let ts_data_raw = &*query_slice.get_mapped_range(); - let ts_data : &[u64] = bytemuck::cast_slice(ts_data_raw); - println!("compute shader elapsed: {:?}ms", (ts_data[1] - ts_data[0]) as f64 * ts_period as f64 * 1e-6); + let ts_data: &[u64] = bytemuck::cast_slice(ts_data_raw); + println!( + "compute shader elapsed: {:?}ms", + (ts_data[1] - ts_data[0]) as f64 * ts_period as f64 * 1e-6 + ); } } diff --git a/compute-shader-hello/src/shader.wgsl b/compute-shader-hello/src/shader.wgsl index 468c96e..8e06b4a 100644 --- a/compute-shader-hello/src/shader.wgsl +++ b/compute-shader-hello/src/shader.wgsl @@ -16,14 +16,95 @@ [[block]] struct DataBuf { - data: [[stride(4)]] array; + data: [[stride(4)]] array; +}; + +[[block]] +struct StateBuf { + state: [[stride(4)]] array>; }; [[group(0), binding(0)]] -var v_indices: DataBuf; +var main_buf: DataBuf; + +[[group(0), binding(1)]] +var state_buf: StateBuf; + +let FLAG_NOT_READY = 0u; +let FLAG_AGGREGATE_READY = 1u; +let FLAG_PREFIX_READY = 2u; + +let workgroup_size: u32 = 16u; + +var part_id: u32; +var scratch: array; +var shared_prefix: u32; + +[[stage(compute), workgroup_size(16)]] +fn main([[builtin(local_invocation_id)]] local_id: vec3) { + if (local_id.x == 0u) { + part_id = atomicAdd(&state_buf.state[0], 1u); + } + workgroupBarrier(); + let my_part_id = part_id; + let mem_base = my_part_id * workgroup_size; + var el = main_buf.data[mem_base + local_id.x]; + scratch[local_id.x] = el; + // This must be lg2(workgroup_size) + for (var i: u32 = 0u; i < 4u; i = i + 1u) { + workgroupBarrier(); + if (local_id.x >= (1u << i)) { + el = el + scratch[local_id.x - (1u << i)]; + } + workgroupBarrier(); + scratch[local_id.x] = el; + } + var exclusive_prefix = 0u; + + if (local_id.x == workgroup_size - 1u) { + var flag = FLAG_AGGREGATE_READY; + state_buf.state[my_part_id * 3u + 2u] = el; + if (my_part_id == 0u) { + state_buf.state[my_part_id * 3u + 3u] = el; + flag = FLAG_PREFIX_READY; + } + // TODO: these storage barriers should probably be in + // uniform control flow, but enforcing that is a pain. 
+ storageBarrier(); + state_buf.state[my_part_id * 3u + 1u] = flag; + + if (my_part_id != 0u) { + // decoupled look-back + var look_back_ix = my_part_id - 1u; + loop { + flag = state_buf.state[look_back_ix * 3u + 1u]; + storageBarrier(); + if (flag == FLAG_PREFIX_READY) { + let their_prefix = state_buf.state[look_back_ix * 3u + 3u]; + exclusive_prefix = their_prefix + exclusive_prefix; + break; + } elseif (flag == FLAG_AGGREGATE_READY) { + let their_agg = state_buf.state[look_back_ix * 3u + 2u]; + exclusive_prefix = their_agg + exclusive_prefix; + look_back_ix = look_back_ix - 1u; + } + // else spin + } + + // compute inclusive prefix + let inclusive_prefix = exclusive_prefix + el; + shared_prefix = exclusive_prefix; + state_buf.state[my_part_id * 3u + 3u] = inclusive_prefix; + storageBarrier(); + state_buf.state[my_part_id * 3u + 1u] = FLAG_PREFIX_READY; + } + } + var prefix = 0u; + workgroupBarrier(); + if (my_part_id != 0u) { + prefix = shared_prefix; + } -[[stage(compute), workgroup_size(1)]] -fn main([[builtin(global_invocation_id)]] global_id: vec3) { - // TODO: a more interesting computation than this. - v_indices.data[global_id.x] = v_indices.data[global_id.x] + 42.0; + // do final output + main_buf.data[mem_base + local_id.x] = prefix + el; } From 2800b7391e6b46131576d8116c7c6ba091bc5767 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Tue, 26 Oct 2021 20:38:59 -0700 Subject: [PATCH 2/8] Make storage barriers uniform control flow Still doesn't fix deadlocks tho :/ --- compute-shader-hello/src/shader.wgsl | 48 ++++++++++++++++++---------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/compute-shader-hello/src/shader.wgsl b/compute-shader-hello/src/shader.wgsl index 8e06b4a..b90ab30 100644 --- a/compute-shader-hello/src/shader.wgsl +++ b/compute-shader-hello/src/shader.wgsl @@ -39,6 +39,7 @@ let workgroup_size: u32 = 16u; var part_id: u32; var scratch: array; var shared_prefix: u32; +var shared_flag: u32; [[stage(compute), workgroup_size(16)]] fn main([[builtin(local_invocation_id)]] local_id: vec3) { @@ -61,41 +62,54 @@ fn main([[builtin(local_invocation_id)]] local_id: vec3) { } var exclusive_prefix = 0u; + var flag = FLAG_AGGREGATE_READY; if (local_id.x == workgroup_size - 1u) { - var flag = FLAG_AGGREGATE_READY; state_buf.state[my_part_id * 3u + 2u] = el; if (my_part_id == 0u) { state_buf.state[my_part_id * 3u + 3u] = el; flag = FLAG_PREFIX_READY; } - // TODO: these storage barriers should probably be in - // uniform control flow, but enforcing that is a pain. 
- storageBarrier(); + } + // make sure these barriers are in uniform control flow + storageBarrier(); + if (local_id.x == workgroup_size - 1u) { state_buf.state[my_part_id * 3u + 1u] = flag; + } - if (my_part_id != 0u) { - // decoupled look-back - var look_back_ix = my_part_id - 1u; - loop { - flag = state_buf.state[look_back_ix * 3u + 1u]; - storageBarrier(); - if (flag == FLAG_PREFIX_READY) { + if (my_part_id != 0u) { + // decoupled look-back + var look_back_ix = my_part_id - 1u; + loop { + if (local_id.x == workgroup_size - 1u) { + shared_flag = state_buf.state[look_back_ix * 3u + 1u]; + } + workgroupBarrier(); + flag = shared_flag; + storageBarrier(); + if (flag == FLAG_PREFIX_READY) { + if (local_id.x == workgroup_size - 1u) { let their_prefix = state_buf.state[look_back_ix * 3u + 3u]; exclusive_prefix = their_prefix + exclusive_prefix; - break; - } elseif (flag == FLAG_AGGREGATE_READY) { + } + break; + } elseif (flag == FLAG_AGGREGATE_READY) { + if (local_id.x == workgroup_size - 1u) { let their_agg = state_buf.state[look_back_ix * 3u + 2u]; exclusive_prefix = their_agg + exclusive_prefix; - look_back_ix = look_back_ix - 1u; } - // else spin + look_back_ix = look_back_ix - 1u; } + // else spin + } - // compute inclusive prefix + // compute inclusive prefix + if (local_id.x == workgroup_size - 1u) { let inclusive_prefix = exclusive_prefix + el; shared_prefix = exclusive_prefix; state_buf.state[my_part_id * 3u + 3u] = inclusive_prefix; - storageBarrier(); + } + storageBarrier(); + if (local_id.x == workgroup_size - 1u) { state_buf.state[my_part_id * 3u + 1u] = FLAG_PREFIX_READY; } } From 5fc557ce7d6b514db0e09c85568e88b17a1bfc4d Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Mon, 1 Nov 2021 11:41:39 -0700 Subject: [PATCH 3/8] Verify results Still WIP --- compute-shader-hello/src/main.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/compute-shader-hello/src/main.rs b/compute-shader-hello/src/main.rs index d8a91c7..ffc756d 100644 --- a/compute-shader-hello/src/main.rs +++ b/compute-shader-hello/src/main.rs @@ -22,9 +22,14 @@ use wgpu::util::DeviceExt; use bytemuck; -const N_DATA: usize = 16384; +const N_DATA: usize = 1024; const WG_SIZE: usize = 16; +// Verify that the data is OEIS A000217 +fn verify(data: &[u32]) -> bool { + data.iter().enumerate().all(|(i, val)| (i * (i + 1)) / 2 == *val as usize) +} + async fn run() { let instance = wgpu::Instance::new(wgpu::Backends::PRIMARY); let adapter = instance.request_adapter(&Default::default()).await.unwrap(); @@ -141,7 +146,7 @@ async fn run() { if buf_future.await.is_ok() { let data_raw = &*buf_slice.get_mapped_range(); let data: &[u32] = bytemuck::cast_slice(data_raw); - println!("data: {:?}", &*data); + println!("results correct: {}", verify(data)); } if features.contains(wgpu::Features::TIMESTAMP_QUERY) { let ts_period = queue.get_timestamp_period(); From 17094ab6855275e18fc5043c6fa676d1377d0d0a Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Tue, 2 Nov 2021 14:52:34 -0700 Subject: [PATCH 4/8] Larger workgroup Fastest results on AMD at workgroup = 1024. Note, this has atomicOr workaround for correctness. Also note, not all targets will support a workgroup of this size; on shipping, we'd need to query and select at runtime. 
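As an aside (not something this patch does), runtime selection could look roughly like the sketch below. It assumes a wgpu version whose Limits struct exposes the compute workgroup limits named here, and the helper name is invented for illustration; the WGSL source would additionally need to be specialized to the chosen size (for example by substituting it into the shader string) before creating the pipeline.

    // Hypothetical sketch: pick the largest power-of-two workgroup size the
    // adapter supports, capped at the 1024 we would like to use.
    fn choose_wg_size(adapter: &wgpu::Adapter) -> u32 {
        let limits = adapter.limits();
        let max = limits
            .max_compute_workgroup_size_x
            .min(limits.max_compute_invocations_per_workgroup)
            .min(1024);
        // Round down to a power of two so the lg2(workgroup_size) loop bound
        // in the shader stays an integer.
        1u32 << (31 - max.leading_zeros())
    }
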
--- compute-shader-hello/src/main.rs | 4 ++-- compute-shader-hello/src/shader.wgsl | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/compute-shader-hello/src/main.rs b/compute-shader-hello/src/main.rs index ffc756d..bb1fb70 100644 --- a/compute-shader-hello/src/main.rs +++ b/compute-shader-hello/src/main.rs @@ -22,8 +22,8 @@ use wgpu::util::DeviceExt; use bytemuck; -const N_DATA: usize = 1024; -const WG_SIZE: usize = 16; +const N_DATA: usize = 65536; +const WG_SIZE: usize = 1024; // Verify that the data is OEIS A000217 fn verify(data: &[u32]) -> bool { diff --git a/compute-shader-hello/src/shader.wgsl b/compute-shader-hello/src/shader.wgsl index b90ab30..7d2280d 100644 --- a/compute-shader-hello/src/shader.wgsl +++ b/compute-shader-hello/src/shader.wgsl @@ -34,14 +34,14 @@ let FLAG_NOT_READY = 0u; let FLAG_AGGREGATE_READY = 1u; let FLAG_PREFIX_READY = 2u; -let workgroup_size: u32 = 16u; +let workgroup_size: u32 = 1024u; var part_id: u32; var scratch: array; var shared_prefix: u32; var shared_flag: u32; -[[stage(compute), workgroup_size(16)]] +[[stage(compute), workgroup_size(1024)]] fn main([[builtin(local_invocation_id)]] local_id: vec3) { if (local_id.x == 0u) { part_id = atomicAdd(&state_buf.state[0], 1u); @@ -52,7 +52,7 @@ fn main([[builtin(local_invocation_id)]] local_id: vec3) { var el = main_buf.data[mem_base + local_id.x]; scratch[local_id.x] = el; // This must be lg2(workgroup_size) - for (var i: u32 = 0u; i < 4u; i = i + 1u) { + for (var i: u32 = 0u; i < 10u; i = i + 1u) { workgroupBarrier(); if (local_id.x >= (1u << i)) { el = el + scratch[local_id.x - (1u << i)]; @@ -81,20 +81,20 @@ fn main([[builtin(local_invocation_id)]] local_id: vec3) { var look_back_ix = my_part_id - 1u; loop { if (local_id.x == workgroup_size - 1u) { - shared_flag = state_buf.state[look_back_ix * 3u + 1u]; + shared_flag = atomicOr(&state_buf.state[look_back_ix * 3u + 1u], 0u); } workgroupBarrier(); flag = shared_flag; storageBarrier(); if (flag == FLAG_PREFIX_READY) { if (local_id.x == workgroup_size - 1u) { - let their_prefix = state_buf.state[look_back_ix * 3u + 3u]; + let their_prefix = atomicOr(&state_buf.state[look_back_ix * 3u + 3u], 0u); exclusive_prefix = their_prefix + exclusive_prefix; } break; } elseif (flag == FLAG_AGGREGATE_READY) { if (local_id.x == workgroup_size - 1u) { - let their_agg = state_buf.state[look_back_ix * 3u + 2u]; + let their_agg = atomicOr(&state_buf.state[look_back_ix * 3u + 2u], 0u); exclusive_prefix = their_agg + exclusive_prefix; } look_back_ix = look_back_ix - 1u; From 8174880f03e9e974a4211c05deb0cd7e347d819a Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Tue, 2 Nov 2021 16:02:17 -0700 Subject: [PATCH 5/8] Sequential section Do a small sequential scan at the leaf of the hierarchy. That amortizes both the workgroup-scope tree reduction and the (still sequential) decoupled look-back to a larger number of inputs. Note: this falls short of a real performance evaluation because there's no attempt to warm up the GPU clock. But it's valid as a very rough swag. 
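To make the indexing concrete, here is a rough CPU model of how one partition is scanned after this change (illustrative only, not code from the patch; the helper name is invented). Each of the workgroup's threads owns N_SEQ consecutive elements and reduces them serially, the workgroup-scope scan then runs over the per-thread sums, and each thread finally replays its run offset by its exclusive per-thread sum plus the partition prefix obtained from look-back.

    // CPU sketch of one partition; data.len() == wg_size * n_seq.
    fn scan_partition(data: &mut [u32], wg_size: usize, n_seq: usize, prefix: u32) {
        assert_eq!(data.len(), wg_size * n_seq);
        // 1. Each "thread" reduces its n_seq elements sequentially.
        let sums: Vec<u32> = data.chunks(n_seq).map(|c| c.iter().sum()).collect();
        // 2. Exclusive scan over the per-thread sums (the workgroup reduction).
        let mut excl = vec![0u32; wg_size];
        for i in 1..wg_size {
            excl[i] = excl[i - 1] + sums[i - 1];
        }
        // 3. Each thread replays its run, adding the partition prefix and its
        //    exclusive per-thread offset, producing the inclusive scan.
        for (chunk, base) in data.chunks_mut(n_seq).zip(excl) {
            let mut acc = prefix + base;
            for v in chunk.iter_mut() {
                acc = acc + *v;
                *v = acc;
            }
        }
    }
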
--- compute-shader-hello/src/main.rs | 17 +++++++++++------ compute-shader-hello/src/shader.wgsl | 23 ++++++++++++++++++----- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/compute-shader-hello/src/main.rs b/compute-shader-hello/src/main.rs index bb1fb70..cf66f5a 100644 --- a/compute-shader-hello/src/main.rs +++ b/compute-shader-hello/src/main.rs @@ -22,12 +22,18 @@ use wgpu::util::DeviceExt; use bytemuck; -const N_DATA: usize = 65536; -const WG_SIZE: usize = 1024; +const N_DATA: usize = 1 << 20; +const WG_SIZE: usize = 1 << 10; // Verify that the data is OEIS A000217 -fn verify(data: &[u32]) -> bool { - data.iter().enumerate().all(|(i, val)| (i * (i + 1)) / 2 == *val as usize) +fn verify(data: &[u32]) -> Option { + data.iter().enumerate().position(|(i, val)| { + let wrong = ((i * (i + 1)) / 2) as u32 != *val; + if wrong { + println!("diff @ {}: {} != {}", i, ((i * (i + 1)) / 2) as u32, *val); + } + wrong + }) } async fn run() { @@ -68,7 +74,6 @@ async fn run() { label: None, contents: input, usage: wgpu::BufferUsages::STORAGE - | wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::COPY_SRC, }); let output_buf = device.create_buffer(&wgpu::BufferDescriptor { @@ -146,7 +151,7 @@ async fn run() { if buf_future.await.is_ok() { let data_raw = &*buf_slice.get_mapped_range(); let data: &[u32] = bytemuck::cast_slice(data_raw); - println!("results correct: {}", verify(data)); + println!("results correct: {:?}", verify(data)); } if features.contains(wgpu::Features::TIMESTAMP_QUERY) { let ts_period = queue.get_timestamp_period(); diff --git a/compute-shader-hello/src/shader.wgsl b/compute-shader-hello/src/shader.wgsl index 7d2280d..fad4212 100644 --- a/compute-shader-hello/src/shader.wgsl +++ b/compute-shader-hello/src/shader.wgsl @@ -34,14 +34,15 @@ let FLAG_NOT_READY = 0u; let FLAG_AGGREGATE_READY = 1u; let FLAG_PREFIX_READY = 2u; -let workgroup_size: u32 = 1024u; +let workgroup_size: u32 = 256u; +let N_SEQ = 4u; var part_id: u32; var scratch: array; var shared_prefix: u32; var shared_flag: u32; -[[stage(compute), workgroup_size(1024)]] +[[stage(compute), workgroup_size(256)]] fn main([[builtin(local_invocation_id)]] local_id: vec3) { if (local_id.x == 0u) { part_id = atomicAdd(&state_buf.state[0], 1u); @@ -49,10 +50,16 @@ fn main([[builtin(local_invocation_id)]] local_id: vec3) { workgroupBarrier(); let my_part_id = part_id; let mem_base = my_part_id * workgroup_size; - var el = main_buf.data[mem_base + local_id.x]; + var local: array; + var el = main_buf.data[(mem_base + local_id.x) * N_SEQ]; + local[0] = el; + for (var i: u32 = 1u; i < N_SEQ; i = i + 1u) { + el = el + main_buf.data[(mem_base + local_id.x) * N_SEQ + i]; + local[i] = el; + } scratch[local_id.x] = el; // This must be lg2(workgroup_size) - for (var i: u32 = 0u; i < 10u; i = i + 1u) { + for (var i: u32 = 0u; i < 8u; i = i + 1u) { workgroupBarrier(); if (local_id.x >= (1u << i)) { el = el + scratch[local_id.x - (1u << i)]; @@ -120,5 +127,11 @@ fn main([[builtin(local_invocation_id)]] local_id: vec3) { } // do final output - main_buf.data[mem_base + local_id.x] = prefix + el; + for (var i: u32 = 0u; i < N_SEQ; i = i + 1u) { + var old = 0u; + if (local_id.x > 0u) { + old = scratch[local_id.x - 1u]; + } + main_buf.data[(mem_base + local_id.x) * N_SEQ + i] = prefix + old + local[i]; + } } From 93ad8ee0b1d83bb377f66541f77942f2b3ec7da2 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Tue, 2 Nov 2021 16:47:29 -0700 Subject: [PATCH 6/8] Iterate runs Better for performance analaysis --- compute-shader-hello/src/main.rs | 84 
+++++++++++++++++--------------- 1 file changed, 44 insertions(+), 40 deletions(-) diff --git a/compute-shader-hello/src/main.rs b/compute-shader-hello/src/main.rs index cf66f5a..da4221e 100644 --- a/compute-shader-hello/src/main.rs +++ b/compute-shader-hello/src/main.rs @@ -22,7 +22,7 @@ use wgpu::util::DeviceExt; use bytemuck; -const N_DATA: usize = 1 << 20; +const N_DATA: usize = 1 << 25; const WG_SIZE: usize = 1 << 10; // Verify that the data is OEIS A000217 @@ -121,46 +121,50 @@ async fn run() { ], }); - let mut encoder = device.create_command_encoder(&Default::default()); - if let Some(query_set) = &query_set { - encoder.write_timestamp(query_set, 0); - } - encoder.clear_buffer(&state_buf, 0, None); - { - let mut cpass = encoder.begin_compute_pass(&Default::default()); - cpass.set_pipeline(&pipeline); - cpass.set_bind_group(0, &bind_group, &[]); - cpass.dispatch(N_WG as u32, 1, 1); - } - if let Some(query_set) = &query_set { - encoder.write_timestamp(query_set, 1); - } - encoder.copy_buffer_to_buffer(&input_buf, 0, &output_buf, 0, input.len() as u64); - if let Some(query_set) = &query_set { - encoder.resolve_query_set(query_set, 0..2, &query_buf, 0); - } - queue.submit(Some(encoder.finish())); + for i in 0..10 { + let mut encoder = device.create_command_encoder(&Default::default()); + if let Some(query_set) = &query_set { + encoder.write_timestamp(query_set, 0); + } + encoder.clear_buffer(&state_buf, 0, None); + { + let mut cpass = encoder.begin_compute_pass(&Default::default()); + cpass.set_pipeline(&pipeline); + cpass.set_bind_group(0, &bind_group, &[]); + cpass.dispatch(N_WG as u32, 1, 1); + } + if let Some(query_set) = &query_set { + encoder.write_timestamp(query_set, 1); + } + encoder.copy_buffer_to_buffer(&input_buf, 0, &output_buf, 0, input.len() as u64); + if let Some(query_set) = &query_set { + encoder.resolve_query_set(query_set, 0..2, &query_buf, 0); + } + queue.submit(Some(encoder.finish())); - let buf_slice = output_buf.slice(..); - let buf_future = buf_slice.map_async(wgpu::MapMode::Read); - let query_slice = query_buf.slice(..); - let _query_future = query_slice.map_async(wgpu::MapMode::Read); - println!("pre-poll {:?}", std::time::Instant::now()); - device.poll(wgpu::Maintain::Wait); - println!("post-poll {:?}", std::time::Instant::now()); - if buf_future.await.is_ok() { - let data_raw = &*buf_slice.get_mapped_range(); - let data: &[u32] = bytemuck::cast_slice(data_raw); - println!("results correct: {:?}", verify(data)); - } - if features.contains(wgpu::Features::TIMESTAMP_QUERY) { - let ts_period = queue.get_timestamp_period(); - let ts_data_raw = &*query_slice.get_mapped_range(); - let ts_data: &[u64] = bytemuck::cast_slice(ts_data_raw); - println!( - "compute shader elapsed: {:?}ms", - (ts_data[1] - ts_data[0]) as f64 * ts_period as f64 * 1e-6 - ); + let buf_slice = output_buf.slice(..); + let buf_future = buf_slice.map_async(wgpu::MapMode::Read); + let query_slice = query_buf.slice(..); + let _query_future = query_slice.map_async(wgpu::MapMode::Read); + device.poll(wgpu::Maintain::Wait); + if buf_future.await.is_ok() { + let data_raw = &*buf_slice.get_mapped_range(); + let data: &[u32] = bytemuck::cast_slice(data_raw); + if i == 0 { + println!("results correct: {:?}", verify(data)); + } + } + if features.contains(wgpu::Features::TIMESTAMP_QUERY) { + let ts_period = queue.get_timestamp_period(); + let ts_data_raw = &*query_slice.get_mapped_range(); + let ts_data: &[u64] = bytemuck::cast_slice(ts_data_raw); + println!( + "compute shader elapsed: {:?}ms", + 
(ts_data[1] - ts_data[0]) as f64 * ts_period as f64 * 1e-6 + ); + } + output_buf.unmap(); + query_buf.unmap(); } } From 697ea4ecf10c2a36b98fa6d101473f2a7bfbf495 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Tue, 2 Nov 2021 17:25:44 -0700 Subject: [PATCH 7/8] Go fast Performance measurement requires keeping the GPU busy. That means not copying results back to CPU and doing verification there. --- compute-shader-hello/src/main.rs | 34 ++++++++++++++++------------ compute-shader-hello/src/shader.wgsl | 8 +++---- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/compute-shader-hello/src/main.rs b/compute-shader-hello/src/main.rs index da4221e..2334db4 100644 --- a/compute-shader-hello/src/main.rs +++ b/compute-shader-hello/src/main.rs @@ -23,7 +23,7 @@ use wgpu::util::DeviceExt; use bytemuck; const N_DATA: usize = 1 << 25; -const WG_SIZE: usize = 1 << 10; +const WG_SIZE: usize = 1 << 12; // Verify that the data is OEIS A000217 fn verify(data: &[u32]) -> Option { @@ -121,7 +121,7 @@ async fn run() { ], }); - for i in 0..10 { + for i in 0..100 { let mut encoder = device.create_command_encoder(&Default::default()); if let Some(query_set) = &query_set { encoder.write_timestamp(query_set, 0); @@ -136,7 +136,9 @@ async fn run() { if let Some(query_set) = &query_set { encoder.write_timestamp(query_set, 1); } - encoder.copy_buffer_to_buffer(&input_buf, 0, &output_buf, 0, input.len() as u64); + if i == 0 { + encoder.copy_buffer_to_buffer(&input_buf, 0, &output_buf, 0, input.len() as u64); + } if let Some(query_set) = &query_set { encoder.resolve_query_set(query_set, 0..2, &query_buf, 0); } @@ -145,25 +147,27 @@ async fn run() { let buf_slice = output_buf.slice(..); let buf_future = buf_slice.map_async(wgpu::MapMode::Read); let query_slice = query_buf.slice(..); - let _query_future = query_slice.map_async(wgpu::MapMode::Read); + let query_future = query_slice.map_async(wgpu::MapMode::Read); device.poll(wgpu::Maintain::Wait); if buf_future.await.is_ok() { - let data_raw = &*buf_slice.get_mapped_range(); - let data: &[u32] = bytemuck::cast_slice(data_raw); if i == 0 { + let data_raw = &*buf_slice.get_mapped_range(); + let data: &[u32] = bytemuck::cast_slice(data_raw); println!("results correct: {:?}", verify(data)); } + output_buf.unmap(); } - if features.contains(wgpu::Features::TIMESTAMP_QUERY) { - let ts_period = queue.get_timestamp_period(); - let ts_data_raw = &*query_slice.get_mapped_range(); - let ts_data: &[u64] = bytemuck::cast_slice(ts_data_raw); - println!( - "compute shader elapsed: {:?}ms", - (ts_data[1] - ts_data[0]) as f64 * ts_period as f64 * 1e-6 - ); + if query_future.await.is_ok() { + if features.contains(wgpu::Features::TIMESTAMP_QUERY) { + let ts_period = queue.get_timestamp_period(); + let ts_data_raw = &*query_slice.get_mapped_range(); + let ts_data: &[u64] = bytemuck::cast_slice(ts_data_raw); + println!( + "compute shader elapsed: {:?}ms", + (ts_data[1] - ts_data[0]) as f64 * ts_period as f64 * 1e-6 + ); + } } - output_buf.unmap(); query_buf.unmap(); } } diff --git a/compute-shader-hello/src/shader.wgsl b/compute-shader-hello/src/shader.wgsl index fad4212..25fbc98 100644 --- a/compute-shader-hello/src/shader.wgsl +++ b/compute-shader-hello/src/shader.wgsl @@ -34,15 +34,15 @@ let FLAG_NOT_READY = 0u; let FLAG_AGGREGATE_READY = 1u; let FLAG_PREFIX_READY = 2u; -let workgroup_size: u32 = 256u; -let N_SEQ = 4u; +let workgroup_size: u32 = 512u; +let N_SEQ = 8u; var part_id: u32; var scratch: array; var shared_prefix: u32; var shared_flag: u32; -[[stage(compute), 
workgroup_size(256)]] +[[stage(compute), workgroup_size(512)]] fn main([[builtin(local_invocation_id)]] local_id: vec3) { if (local_id.x == 0u) { part_id = atomicAdd(&state_buf.state[0], 1u); @@ -59,7 +59,7 @@ fn main([[builtin(local_invocation_id)]] local_id: vec3) { } scratch[local_id.x] = el; // This must be lg2(workgroup_size) - for (var i: u32 = 0u; i < 8u; i = i + 1u) { + for (var i: u32 = 0u; i < 9u; i = i + 1u) { workgroupBarrier(); if (local_id.x >= (1u << i)) { el = el + scratch[local_id.x - (1u << i)]; From 87e5b201a119210649077357214fc88fed552b72 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Wed, 3 Nov 2021 17:19:47 -0700 Subject: [PATCH 8/8] Use explicit atomic stores Naga will accept ordinary loads and stores to atomic types, but tint will not. --- compute-shader-hello/src/shader.wgsl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/compute-shader-hello/src/shader.wgsl b/compute-shader-hello/src/shader.wgsl index 25fbc98..233586d 100644 --- a/compute-shader-hello/src/shader.wgsl +++ b/compute-shader-hello/src/shader.wgsl @@ -71,16 +71,16 @@ fn main([[builtin(local_invocation_id)]] local_id: vec3) { var flag = FLAG_AGGREGATE_READY; if (local_id.x == workgroup_size - 1u) { - state_buf.state[my_part_id * 3u + 2u] = el; + atomicStore(&state_buf.state[my_part_id * 3u + 2u], el); if (my_part_id == 0u) { - state_buf.state[my_part_id * 3u + 3u] = el; + atomicStore(&state_buf.state[my_part_id * 3u + 3u], el); flag = FLAG_PREFIX_READY; } } // make sure these barriers are in uniform control flow storageBarrier(); if (local_id.x == workgroup_size - 1u) { - state_buf.state[my_part_id * 3u + 1u] = flag; + atomicStore(&state_buf.state[my_part_id * 3u + 1u], flag); } if (my_part_id != 0u) { @@ -113,11 +113,11 @@ fn main([[builtin(local_invocation_id)]] local_id: vec3) { if (local_id.x == workgroup_size - 1u) { let inclusive_prefix = exclusive_prefix + el; shared_prefix = exclusive_prefix; - state_buf.state[my_part_id * 3u + 3u] = inclusive_prefix; + atomicStore(&state_buf.state[my_part_id * 3u + 3u], inclusive_prefix); } storageBarrier(); if (local_id.x == workgroup_size - 1u) { - state_buf.state[my_part_id * 3u + 1u] = FLAG_PREFIX_READY; + atomicStore(&state_buf.state[my_part_id * 3u + 1u], FLAG_PREFIX_READY); } } var prefix = 0u;
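
To summarize the protocol the series has built up, here is a single-threaded Rust model of the decoupled look-back (illustrative only; the struct and function names are invented, and the GPU's spin-wait is collapsed into an unreachable case). Each partition publishes its own aggregate under FLAG_AGGREGATE_READY, then walks backwards until some earlier partition has published an inclusive prefix under FLAG_PREFIX_READY; partition 0 always publishes FLAG_PREFIX_READY, so the walk terminates.

    const FLAG_NOT_READY: u32 = 0;
    const FLAG_AGGREGATE_READY: u32 = 1;
    const FLAG_PREFIX_READY: u32 = 2;

    struct PartitionState {
        flag: u32,
        aggregate: u32,         // sum of this partition only
        inclusive_prefix: u32,  // sum of everything up to and including it
    }

    // Returns the exclusive prefix for partition `my_part_id` (> 0).
    fn look_back(states: &[PartitionState], my_part_id: usize) -> u32 {
        assert!(my_part_id > 0, "partition 0 has nothing to look back at");
        let mut exclusive_prefix = 0u32;
        let mut ix = my_part_id - 1;
        loop {
            match states[ix].flag {
                FLAG_PREFIX_READY => {
                    // An earlier partition already accounts for everything
                    // before it, so we are done.
                    return states[ix].inclusive_prefix + exclusive_prefix;
                }
                FLAG_AGGREGATE_READY => {
                    // Fold in that partition's sum and keep walking back.
                    exclusive_prefix = exclusive_prefix + states[ix].aggregate;
                    ix -= 1;
                }
                FLAG_NOT_READY => {
                    // The shader spins here until the flag is published; a
                    // sequential model never hits this case.
                    unreachable!("predecessor not ready")
                }
                _ => unreachable!("unknown flag"),
            }
        }
    }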