Skip to content

Commit 57e58ef

Browse files
atlv24SparkyPotatoJMS55tychedelia
authored
Meshlet BVH Culling (#19318)
# Objective - Merge @SparkyPotato 's efforts to implement BVH-accelerated meshlet culling. ## Solution - Add hot reloading support - Fix near-plane overculling - Fix hzb sampling - Fix orthographic error metric ## Testing - Meshlet example, Nsight, hot-reloading and careful thinking --------- Co-authored-by: SparkyPotato <noob.sparkypotato@gmail.com> Co-authored-by: JMS55 <47158642+JMS55@users.noreply.github.com> Co-authored-by: charlotte <charlotte.c.mcelwain@gmail.com>
1 parent 65bddbd commit 57e58ef

26 files changed

+2625
-1260
lines changed

crates/bevy_pbr/src/lib.rs

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ pub mod graph {
124124

125125
use crate::{deferred::DeferredPbrLightingPlugin, graph::NodePbr};
126126
use bevy_app::prelude::*;
127-
use bevy_asset::{load_internal_asset, weak_handle, AssetApp, AssetPath, Assets, Handle};
127+
use bevy_asset::{AssetApp, AssetPath, Assets, Handle};
128128
use bevy_core_pipeline::core_3d::graph::{Core3d, Node3d};
129129
use bevy_ecs::prelude::*;
130130
use bevy_image::Image;
@@ -135,7 +135,7 @@ use bevy_render::{
135135
extract_resource::ExtractResourcePlugin,
136136
load_shader_library,
137137
render_graph::RenderGraph,
138-
render_resource::{Shader, ShaderRef},
138+
render_resource::ShaderRef,
139139
sync_component::SyncComponentPlugin,
140140
view::VisibilitySystems,
141141
ExtractSchedule, Render, RenderApp, RenderDebugFlags, RenderSystems,
@@ -149,9 +149,6 @@ fn shader_ref(path: PathBuf) -> ShaderRef {
149149
ShaderRef::Path(AssetPath::from_path_buf(path).with_source("embedded"))
150150
}
151151

152-
const MESHLET_VISIBILITY_BUFFER_RESOLVE_SHADER_HANDLE: Handle<Shader> =
153-
weak_handle!("69187376-3dea-4d0f-b3f5-185bde63d6a2");
154-
155152
pub const TONEMAPPING_LUT_TEXTURE_BINDING_INDEX: u32 = 18;
156153
pub const TONEMAPPING_LUT_SAMPLER_BINDING_INDEX: u32 = 19;
157154

@@ -205,12 +202,7 @@ impl Plugin for PbrPlugin {
205202
load_shader_library!(app, "render/view_transformations.wgsl");
206203

207204
// Setup dummy shaders for when MeshletPlugin is not used to prevent shader import errors.
208-
load_internal_asset!(
209-
app,
210-
MESHLET_VISIBILITY_BUFFER_RESOLVE_SHADER_HANDLE,
211-
"meshlet/dummy_visibility_buffer_resolve.wgsl",
212-
Shader::from_wgsl
213-
);
205+
load_shader_library!(app, "meshlet/dummy_visibility_buffer_resolve.wgsl");
214206

215207
app.register_asset_reflect::<StandardMaterial>()
216208
.register_type::<AmbientLight>()

crates/bevy_pbr/src/meshlet/asset.rs

Lines changed: 71 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@ use bevy_asset::{
66
};
77
use bevy_math::{Vec2, Vec3};
88
use bevy_reflect::TypePath;
9+
use bevy_render::render_resource::ShaderType;
910
use bevy_tasks::block_on;
1011
use bytemuck::{Pod, Zeroable};
11-
use half::f16;
1212
use lz4_flex::frame::{FrameDecoder, FrameEncoder};
1313
use std::io::{Read, Write};
1414
use thiserror::Error;
@@ -17,7 +17,7 @@ use thiserror::Error;
1717
const MESHLET_MESH_ASSET_MAGIC: u64 = 1717551717668;
1818

1919
/// The current version of the [`MeshletMesh`] asset format.
20-
pub const MESHLET_MESH_ASSET_VERSION: u64 = 1;
20+
pub const MESHLET_MESH_ASSET_VERSION: u64 = 2;
2121

2222
/// A mesh that has been pre-processed into multiple small clusters of triangles called meshlets.
2323
///
@@ -47,12 +47,32 @@ pub struct MeshletMesh {
4747
pub(crate) vertex_uvs: Arc<[Vec2]>,
4848
/// Triangle indices for meshlets.
4949
pub(crate) indices: Arc<[u8]>,
50+
/// The BVH8 used for culling and LOD selection of the meshlets. The root is at index 0.
51+
pub(crate) bvh: Arc<[BvhNode]>,
5052
/// The list of meshlets making up this mesh.
5153
pub(crate) meshlets: Arc<[Meshlet]>,
5254
/// Spherical bounding volumes.
53-
pub(crate) meshlet_bounding_spheres: Arc<[MeshletBoundingSpheres]>,
54-
/// Meshlet group and parent group simplification errors.
55-
pub(crate) meshlet_simplification_errors: Arc<[MeshletSimplificationError]>,
55+
pub(crate) meshlet_cull_data: Arc<[MeshletCullData]>,
56+
/// The tight AABB of the meshlet mesh, used for frustum and occlusion culling at the instance
57+
/// level.
58+
pub(crate) aabb: MeshletAabb,
59+
/// The depth of the culling BVH, used to determine the number of dispatches at runtime.
60+
pub(crate) bvh_depth: u32,
61+
}
62+
63+
/// A single BVH8 node in the BVH used for culling and LOD selection of a [`MeshletMesh`].
64+
#[derive(Copy, Clone, Default, Pod, Zeroable)]
65+
#[repr(C)]
66+
pub struct BvhNode {
67+
/// The tight AABBs of this node's children, used for frustum and occlusion during BVH
68+
/// traversal.
69+
pub aabbs: [MeshletAabbErrorOffset; 8],
70+
/// The LOD bounding spheres of this node's children, used for LOD selection during BVH
71+
/// traversal.
72+
pub lod_bounds: [MeshletBoundingSphere; 8],
73+
/// If `u8::MAX`, it indicates that the child of each children is a BVH node, otherwise it is the number of meshlets in the group.
74+
pub child_counts: [u8; 8],
75+
pub _padding: [u32; 2],
5676
}
5777

5878
/// A single meshlet within a [`MeshletMesh`].
@@ -91,31 +111,37 @@ pub struct Meshlet {
91111
/// Bounding spheres used for culling and choosing level of detail for a [`Meshlet`].
92112
#[derive(Copy, Clone, Pod, Zeroable)]
93113
#[repr(C)]
94-
pub struct MeshletBoundingSpheres {
95-
/// Bounding sphere used for frustum and occlusion culling for this meshlet.
96-
pub culling_sphere: MeshletBoundingSphere,
114+
pub struct MeshletCullData {
115+
/// Tight bounding box, used for frustum and occlusion culling for this meshlet.
116+
pub aabb: MeshletAabbErrorOffset,
97117
/// Bounding sphere used for determining if this meshlet's group is at the correct level of detail for a given view.
98118
pub lod_group_sphere: MeshletBoundingSphere,
99-
/// Bounding sphere used for determining if this meshlet's parent group is at the correct level of detail for a given view.
100-
pub lod_parent_group_sphere: MeshletBoundingSphere,
101119
}
102120

103-
/// A spherical bounding volume used for a [`Meshlet`].
104-
#[derive(Copy, Clone, Pod, Zeroable)]
121+
/// An axis-aligned bounding box used for a [`Meshlet`].
122+
#[derive(Copy, Clone, Default, Pod, Zeroable, ShaderType)]
105123
#[repr(C)]
106-
pub struct MeshletBoundingSphere {
124+
pub struct MeshletAabb {
107125
pub center: Vec3,
108-
pub radius: f32,
126+
pub half_extent: Vec3,
109127
}
110128

111-
/// Simplification error used for choosing level of detail for a [`Meshlet`].
112-
#[derive(Copy, Clone, Pod, Zeroable)]
129+
// An axis-aligned bounding box used for a [`Meshlet`].
130+
#[derive(Copy, Clone, Default, Pod, Zeroable, ShaderType)]
131+
#[repr(C)]
132+
pub struct MeshletAabbErrorOffset {
133+
pub center: Vec3,
134+
pub error: f32,
135+
pub half_extent: Vec3,
136+
pub child_offset: u32,
137+
}
138+
139+
/// A spherical bounding volume used for a [`Meshlet`].
140+
#[derive(Copy, Clone, Default, Pod, Zeroable)]
113141
#[repr(C)]
114-
pub struct MeshletSimplificationError {
115-
/// Simplification error used for determining if this meshlet's group is at the correct level of detail for a given view.
116-
pub group_error: f16,
117-
/// Simplification error used for determining if this meshlet's parent group is at the correct level of detail for a given view.
118-
pub parent_group_error: f16,
142+
pub struct MeshletBoundingSphere {
143+
pub center: Vec3,
144+
pub radius: f32,
119145
}
120146

121147
/// An [`AssetSaver`] for `.meshlet_mesh` [`MeshletMesh`] assets.
@@ -143,15 +169,23 @@ impl AssetSaver for MeshletMeshSaver {
143169
.write_all(&MESHLET_MESH_ASSET_VERSION.to_le_bytes())
144170
.await?;
145171

172+
writer.write_all(bytemuck::bytes_of(&asset.aabb)).await?;
173+
writer
174+
.write_all(bytemuck::bytes_of(&asset.bvh_depth))
175+
.await?;
176+
146177
// Compress and write asset data
147178
let mut writer = FrameEncoder::new(AsyncWriteSyncAdapter(writer));
148179
write_slice(&asset.vertex_positions, &mut writer)?;
149180
write_slice(&asset.vertex_normals, &mut writer)?;
150181
write_slice(&asset.vertex_uvs, &mut writer)?;
151182
write_slice(&asset.indices, &mut writer)?;
183+
write_slice(&asset.bvh, &mut writer)?;
152184
write_slice(&asset.meshlets, &mut writer)?;
153-
write_slice(&asset.meshlet_bounding_spheres, &mut writer)?;
154-
write_slice(&asset.meshlet_simplification_errors, &mut writer)?;
185+
write_slice(&asset.meshlet_cull_data, &mut writer)?;
186+
// BUG: Flushing helps with an async_fs bug, but it still fails sometimes. https://github.com/smol-rs/async-fs/issues/45
187+
// ERROR bevy_asset::server: Failed to load asset with asset loader MeshletMeshLoader: failed to fill whole buffer
188+
writer.flush()?;
155189
writer.finish()?;
156190

157191
Ok(())
@@ -184,24 +218,33 @@ impl AssetLoader for MeshletMeshLoader {
184218
return Err(MeshletMeshSaveOrLoadError::WrongVersion { found: version });
185219
}
186220

221+
let mut bytes = [0u8; size_of::<MeshletAabb>()];
222+
reader.read_exact(&mut bytes).await?;
223+
let aabb = bytemuck::cast(bytes);
224+
let mut bytes = [0u8; size_of::<u32>()];
225+
reader.read_exact(&mut bytes).await?;
226+
let bvh_depth = u32::from_le_bytes(bytes);
227+
187228
// Load and decompress asset data
188229
let reader = &mut FrameDecoder::new(AsyncReadSyncAdapter(reader));
189230
let vertex_positions = read_slice(reader)?;
190231
let vertex_normals = read_slice(reader)?;
191232
let vertex_uvs = read_slice(reader)?;
192233
let indices = read_slice(reader)?;
234+
let bvh = read_slice(reader)?;
193235
let meshlets = read_slice(reader)?;
194-
let meshlet_bounding_spheres = read_slice(reader)?;
195-
let meshlet_simplification_errors = read_slice(reader)?;
236+
let meshlet_cull_data = read_slice(reader)?;
196237

197238
Ok(MeshletMesh {
198239
vertex_positions,
199240
vertex_normals,
200241
vertex_uvs,
201242
indices,
243+
bvh,
202244
meshlets,
203-
meshlet_bounding_spheres,
204-
meshlet_simplification_errors,
245+
meshlet_cull_data,
246+
aabb,
247+
bvh_depth,
205248
})
206249
}
207250

@@ -218,7 +261,7 @@ pub enum MeshletMeshSaveOrLoadError {
218261
WrongVersion { found: u64 },
219262
#[error("failed to compress or decompress asset data")]
220263
CompressionOrDecompression(#[from] lz4_flex::frame::Error),
221-
#[error("failed to read or write asset data")]
264+
#[error(transparent)]
222265
Io(#[from] std::io::Error),
223266
}
224267

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
#import bevy_pbr::meshlet_bindings::{
2+
InstancedOffset,
3+
get_aabb,
4+
get_aabb_error,
5+
get_aabb_child_offset,
6+
constants,
7+
meshlet_bvh_nodes,
8+
meshlet_bvh_cull_count_read,
9+
meshlet_bvh_cull_count_write,
10+
meshlet_bvh_cull_dispatch,
11+
meshlet_bvh_cull_queue,
12+
meshlet_meshlet_cull_count_early,
13+
meshlet_meshlet_cull_count_late,
14+
meshlet_meshlet_cull_dispatch_early,
15+
meshlet_meshlet_cull_dispatch_late,
16+
meshlet_meshlet_cull_queue,
17+
meshlet_second_pass_bvh_count,
18+
meshlet_second_pass_bvh_dispatch,
19+
meshlet_second_pass_bvh_queue,
20+
}
21+
#import bevy_pbr::meshlet_cull_shared::{
22+
lod_error_is_imperceptible,
23+
aabb_in_frustum,
24+
should_occlusion_cull_aabb,
25+
}
26+
27+
@compute
28+
@workgroup_size(128, 1, 1) // 8 threads per node, 16 nodes per workgroup
29+
fn cull_bvh(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {
30+
// Calculate the queue ID for this thread
31+
let dispatch_id = global_invocation_id.x;
32+
var node = dispatch_id >> 3u;
33+
let subnode = dispatch_id & 7u;
34+
if node >= meshlet_bvh_cull_count_read { return; }
35+
36+
node = select(node, constants.rightmost_slot - node, constants.read_from_front == 0u);
37+
let instanced_offset = meshlet_bvh_cull_queue[node];
38+
let instance_id = instanced_offset.instance_id;
39+
let bvh_node = &meshlet_bvh_nodes[instanced_offset.offset];
40+
41+
var aabb_error_offset = (*bvh_node).aabbs[subnode];
42+
let aabb = get_aabb(&aabb_error_offset);
43+
let parent_error = get_aabb_error(&aabb_error_offset);
44+
let lod_sphere = (*bvh_node).lod_bounds[subnode];
45+
46+
let parent_is_imperceptible = lod_error_is_imperceptible(lod_sphere, parent_error, instance_id);
47+
// Error and frustum cull, in both passes
48+
if parent_is_imperceptible || !aabb_in_frustum(aabb, instance_id) { return; }
49+
50+
let child_offset = get_aabb_child_offset(&aabb_error_offset);
51+
let index = subnode >> 2u;
52+
let bit_offset = subnode & 3u;
53+
let packed_child_count = (*bvh_node).child_counts[index];
54+
let child_count = extractBits(packed_child_count, bit_offset * 8u, 8u);
55+
var value = InstancedOffset(instance_id, child_offset);
56+
57+
// If we pass, try occlusion culling
58+
// If this node was occluded, push it's children to the second pass to check against this frame's HZB
59+
if should_occlusion_cull_aabb(aabb, instance_id) {
60+
#ifdef MESHLET_FIRST_CULLING_PASS
61+
if child_count == 255u {
62+
let id = atomicAdd(&meshlet_second_pass_bvh_count, 1u);
63+
meshlet_second_pass_bvh_queue[id] = value;
64+
if ((id & 15u) == 0u) {
65+
atomicAdd(&meshlet_second_pass_bvh_dispatch.x, 1u);
66+
}
67+
} else {
68+
let base = atomicAdd(&meshlet_meshlet_cull_count_late, child_count);
69+
let start = constants.rightmost_slot - base;
70+
for (var i = start; i < start - child_count; i--) {
71+
meshlet_meshlet_cull_queue[i] = value;
72+
value.offset += 1u;
73+
}
74+
let req = (base + child_count + 127u) >> 7u;
75+
atomicMax(&meshlet_meshlet_cull_dispatch_late.x, req);
76+
}
77+
#endif
78+
return;
79+
}
80+
81+
// If we pass, push the children to the next BVH cull
82+
if child_count == 255u {
83+
let id = atomicAdd(&meshlet_bvh_cull_count_write, 1u);
84+
let index = select(constants.rightmost_slot - id, id, constants.read_from_front == 0u);
85+
meshlet_bvh_cull_queue[index] = value;
86+
if ((id & 15u) == 0u) {
87+
atomicAdd(&meshlet_bvh_cull_dispatch.x, 1u);
88+
}
89+
} else {
90+
#ifdef MESHLET_FIRST_CULLING_PASS
91+
let base = atomicAdd(&meshlet_meshlet_cull_count_early, child_count);
92+
let end = base + child_count;
93+
for (var i = base; i < end; i++) {
94+
meshlet_meshlet_cull_queue[i] = value;
95+
value.offset += 1u;
96+
}
97+
let req = (end + 127u) >> 7u;
98+
atomicMax(&meshlet_meshlet_cull_dispatch_early.x, req);
99+
#else
100+
let base = atomicAdd(&meshlet_meshlet_cull_count_late, child_count);
101+
let start = constants.rightmost_slot - base;
102+
for (var i = start; i < start - child_count; i--) {
103+
meshlet_meshlet_cull_queue[i] = value;
104+
value.offset += 1u;
105+
}
106+
let req = (base + child_count + 127u) >> 7u;
107+
atomicMax(&meshlet_meshlet_cull_dispatch_late.x, req);
108+
#endif
109+
}
110+
}

0 commit comments

Comments
 (0)