Skip to content

Commit 69b2ae8

Browse files
authored
Don't reallocate work item buffers every frame. (#17684)
We were calling `clear()` on the work item buffer table, which caused us to deallocate all the CPU-side buffers every frame. This patch changes the logic to instead clear each buffer individually while leaving its backing store allocated.

This has two consequences:

1. To effectively retain work item buffers from frame to frame, we need to key them off `RetainedViewEntity` values and not the render world `Entity`, which is transient. This PR changes those buffers accordingly.

2. We need to clean up work item buffers that belong to views that went away. Amusingly enough, we actually have a system, `delete_old_work_item_buffers`, that tries to do this already, but it wasn't doing anything because the `clear_batched_gpu_instance_buffers` system had already emptied the whole table. This patch actually makes the `delete_old_work_item_buffers` system useful, by removing the table-clearing behavior from `clear_batched_gpu_instance_buffers` and instead making `delete_old_work_item_buffers` delete buffers corresponding to nonexistent views.

On Bistro, this PR improves the performance of `batch_and_prepare_binned_render_phase` from 61.2 µs to 47.8 µs, a 28% speedup.

![Screenshot 2025-02-04 135542](https://github.com/user-attachments/assets/b0ecb551-f6c8-4677-8e4e-e39aa28115a3)
1 parent 48049f7 commit 69b2ae8

File tree

3 files changed

+69
-34
lines changed

3 files changed

+69
-34
lines changed

crates/bevy_pbr/src/render/gpu_preprocess.rs

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ use bevy_render::{
4848
},
4949
renderer::{RenderContext, RenderDevice, RenderQueue},
5050
settings::WgpuFeatures,
51-
view::{NoIndirectDrawing, ViewUniform, ViewUniformOffset, ViewUniforms},
51+
view::{ExtractedView, NoIndirectDrawing, ViewUniform, ViewUniformOffset, ViewUniforms},
5252
Render, RenderApp, RenderSet,
5353
};
5454
use bevy_utils::TypeIdMap;
@@ -100,7 +100,7 @@ pub struct GpuMeshPreprocessPlugin {
100100
pub struct EarlyGpuPreprocessNode {
101101
view_query: QueryState<
102102
(
103-
Entity,
103+
Read<ExtractedView>,
104104
Option<Read<PreprocessBindGroups>>,
105105
Option<Read<ViewUniformOffset>>,
106106
Has<NoIndirectDrawing>,
@@ -120,7 +120,11 @@ pub struct EarlyGpuPreprocessNode {
120120
/// metadata for the subsequent [`LatePrepassBuildIndirectParametersNode`].
121121
pub struct LateGpuPreprocessNode {
122122
view_query: QueryState<
123-
(Entity, Read<PreprocessBindGroups>, Read<ViewUniformOffset>),
123+
(
124+
Read<ExtractedView>,
125+
Read<PreprocessBindGroups>,
126+
Read<ViewUniformOffset>,
127+
),
124128
(
125129
Without<SkipGpuPreprocess>,
126130
Without<NoIndirectDrawing>,
@@ -580,7 +584,8 @@ impl Node for EarlyGpuPreprocessNode {
580584
};
581585

582586
// Grab the work item buffers for this view.
583-
let Some(phase_work_item_buffers) = index_buffers.get(&view) else {
587+
let Some(phase_work_item_buffers) = index_buffers.get(&view.retained_view_entity)
588+
else {
584589
warn!("The preprocessing index buffer wasn't present");
585590
continue;
586591
};
@@ -791,7 +796,8 @@ impl Node for LateGpuPreprocessNode {
791796
// Run the compute passes.
792797
for (view, bind_groups, view_uniform_offset) in self.view_query.iter_manual(world) {
793798
// Grab the work item buffers for this view.
794-
let Some(phase_work_item_buffers) = work_item_buffers.get(&view) else {
799+
let Some(phase_work_item_buffers) = work_item_buffers.get(&view.retained_view_entity)
800+
else {
795801
warn!("The preprocessing index buffer wasn't present");
796802
continue;
797803
};
@@ -1619,6 +1625,7 @@ impl BuildIndirectParametersPipeline {
16191625
)]
16201626
pub fn prepare_preprocess_bind_groups(
16211627
mut commands: Commands,
1628+
views: Query<(Entity, &ExtractedView)>,
16221629
view_depth_pyramids: Query<(&ViewDepthPyramid, &PreviousViewUniformOffset)>,
16231630
render_device: Res<RenderDevice>,
16241631
batched_instance_buffers: Res<BatchedInstanceBuffers<MeshUniform, MeshInputUniform>>,
@@ -1651,14 +1658,19 @@ pub fn prepare_preprocess_bind_groups(
16511658
let mut any_indirect = false;
16521659

16531660
// Loop over each view.
1654-
for (view, phase_work_item_buffers) in work_item_buffers {
1661+
for (view_entity, view) in &views {
1662+
let Some(phase_work_item_buffers) = work_item_buffers.get(&view.retained_view_entity)
1663+
else {
1664+
continue;
1665+
};
1666+
16551667
let mut bind_groups = TypeIdMap::default();
16561668

16571669
// Loop over each phase.
16581670
for (&phase_id, work_item_buffers) in phase_work_item_buffers {
16591671
// Create the `PreprocessBindGroupBuilder`.
16601672
let preprocess_bind_group_builder = PreprocessBindGroupBuilder {
1661-
view: *view,
1673+
view: view_entity,
16621674
late_indexed_indirect_parameters_buffer,
16631675
late_non_indexed_indirect_parameters_buffer,
16641676
render_device: &render_device,
@@ -1719,7 +1731,7 @@ pub fn prepare_preprocess_bind_groups(
17191731

17201732
// Save the bind groups.
17211733
commands
1722-
.entity(*view)
1734+
.entity(view_entity)
17231735
.insert(PreprocessBindGroups(bind_groups));
17241736
}
17251737

crates/bevy_render/src/batching/gpu_preprocessing.rs

Lines changed: 46 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ use core::any::TypeId;
44

55
use bevy_app::{App, Plugin};
66
use bevy_ecs::{
7-
entity::{hash_map::EntityHashMap, Entity},
87
query::{Has, With},
98
resource::Resource,
109
schedule::IntoSystemConfigs as _,
@@ -13,7 +12,7 @@ use bevy_ecs::{
1312
};
1413
use bevy_encase_derive::ShaderType;
1514
use bevy_math::UVec4;
16-
use bevy_platform_support::collections::hash_map::Entry;
15+
use bevy_platform_support::collections::{hash_map::Entry, HashMap, HashSet};
1716
use bevy_utils::{default, TypeIdMap};
1817
use bytemuck::{Pod, Zeroable};
1918
use nonmax::NonMaxU32;
@@ -30,7 +29,7 @@ use crate::{
3029
},
3130
render_resource::{Buffer, BufferVec, GpuArrayBufferable, RawBufferVec, UninitBufferVec},
3231
renderer::{RenderAdapter, RenderDevice, RenderQueue},
33-
view::{ExtractedView, NoIndirectDrawing},
32+
view::{ExtractedView, NoIndirectDrawing, RetainedViewEntity},
3433
Render, RenderApp, RenderSet,
3534
};
3635

@@ -157,7 +156,7 @@ where
157156
/// corresponds to each instance.
158157
///
159158
/// This is keyed off each view. Each view has a separate buffer.
160-
pub work_item_buffers: EntityHashMap<TypeIdMap<PreprocessWorkItemBuffers>>,
159+
pub work_item_buffers: HashMap<RetainedViewEntity, TypeIdMap<PreprocessWorkItemBuffers>>,
161160

162161
/// The uniform data inputs for the current frame.
163162
///
@@ -409,8 +408,8 @@ impl Default for LatePreprocessWorkItemIndirectParameters {
409408
/// You may need to call this function if you're implementing your own custom
410409
/// render phases. See the `specialized_mesh_pipeline` example.
411410
pub fn get_or_create_work_item_buffer<'a, I>(
412-
work_item_buffers: &'a mut EntityHashMap<TypeIdMap<PreprocessWorkItemBuffers>>,
413-
view: Entity,
411+
work_item_buffers: &'a mut HashMap<RetainedViewEntity, TypeIdMap<PreprocessWorkItemBuffers>>,
412+
view: RetainedViewEntity,
414413
no_indirect_drawing: bool,
415414
gpu_occlusion_culling: bool,
416415
late_indexed_indirect_parameters_buffer: &'_ mut RawBufferVec<
@@ -493,6 +492,28 @@ impl PreprocessWorkItemBuffers {
493492
}
494493
}
495494
}
495+
496+
/// Clears out the GPU work item buffers in preparation for a new frame.
497+
pub fn clear(&mut self) {
498+
match *self {
499+
PreprocessWorkItemBuffers::Direct(ref mut buffer) => {
500+
buffer.clear();
501+
}
502+
PreprocessWorkItemBuffers::Indirect {
503+
indexed: ref mut indexed_buffer,
504+
non_indexed: ref mut non_indexed_buffer,
505+
ref mut gpu_occlusion_culling,
506+
} => {
507+
indexed_buffer.clear();
508+
non_indexed_buffer.clear();
509+
510+
if let Some(ref mut gpu_occlusion_culling) = *gpu_occlusion_culling {
511+
gpu_occlusion_culling.late_indexed.clear();
512+
gpu_occlusion_culling.late_non_indexed.clear();
513+
}
514+
}
515+
}
516+
}
496517
}
497518

498519
/// One invocation of the preprocessing shader: i.e. one mesh instance in a
@@ -942,7 +963,7 @@ where
942963
pub fn new() -> Self {
943964
BatchedInstanceBuffers {
944965
data_buffer: UninitBufferVec::new(BufferUsages::STORAGE),
945-
work_item_buffers: EntityHashMap::default(),
966+
work_item_buffers: HashMap::default(),
946967
current_input_buffer: InstanceInputUniformBuffer::new(),
947968
previous_input_buffer: InstanceInputUniformBuffer::new(),
948969
late_indexed_indirect_parameters_buffer: RawBufferVec::new(
@@ -968,7 +989,14 @@ where
968989
self.data_buffer.clear();
969990
self.late_indexed_indirect_parameters_buffer.clear();
970991
self.late_non_indexed_indirect_parameters_buffer.clear();
971-
self.work_item_buffers.clear();
992+
993+
// Clear each individual set of buffers, but don't depopulate the hash
994+
// table. We want to avoid reallocating these vectors every frame.
995+
for view_work_item_buffers in self.work_item_buffers.values_mut() {
996+
for phase_work_item_buffers in view_work_item_buffers.values_mut() {
997+
phase_work_item_buffers.clear();
998+
}
999+
}
9721000
}
9731001
}
9741002

@@ -1074,13 +1102,17 @@ pub fn delete_old_work_item_buffers<GFBD>(
10741102
mut gpu_batched_instance_buffers: ResMut<
10751103
BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>,
10761104
>,
1077-
extracted_views: Query<Entity, With<ExtractedView>>,
1105+
extracted_views: Query<&ExtractedView>,
10781106
) where
10791107
GFBD: GetFullBatchData,
10801108
{
1109+
let retained_view_entities: HashSet<_> = extracted_views
1110+
.iter()
1111+
.map(|extracted_view| extracted_view.retained_view_entity)
1112+
.collect();
10811113
gpu_batched_instance_buffers
10821114
.work_item_buffers
1083-
.retain(|entity, _| extracted_views.contains(*entity));
1115+
.retain(|retained_view_entity, _| retained_view_entities.contains(retained_view_entity));
10841116
}
10851117

10861118
/// Batch the items in a sorted render phase, when GPU instance buffer building
@@ -1091,7 +1123,6 @@ pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
10911123
mut indirect_parameters_buffers: ResMut<IndirectParametersBuffers>,
10921124
mut sorted_render_phases: ResMut<ViewSortedRenderPhases<I>>,
10931125
mut views: Query<(
1094-
Entity,
10951126
&ExtractedView,
10961127
Has<NoIndirectDrawing>,
10971128
Has<OcclusionCulling>,
@@ -1110,15 +1141,15 @@ pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
11101141
..
11111142
} = gpu_array_buffer.into_inner();
11121143

1113-
for (view, extracted_view, no_indirect_drawing, gpu_occlusion_culling) in &mut views {
1144+
for (extracted_view, no_indirect_drawing, gpu_occlusion_culling) in &mut views {
11141145
let Some(phase) = sorted_render_phases.get_mut(&extracted_view.retained_view_entity) else {
11151146
continue;
11161147
};
11171148

11181149
// Create the work item buffer if necessary.
11191150
let work_item_buffer = get_or_create_work_item_buffer::<I>(
11201151
work_item_buffers,
1121-
view,
1152+
extracted_view.retained_view_entity,
11221153
no_indirect_drawing,
11231154
gpu_occlusion_culling,
11241155
late_indexed_indirect_parameters_buffer,
@@ -1248,7 +1279,6 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
12481279
mut binned_render_phases: ResMut<ViewBinnedRenderPhases<BPI>>,
12491280
mut views: Query<
12501281
(
1251-
Entity,
12521282
&ExtractedView,
12531283
Has<NoIndirectDrawing>,
12541284
Has<OcclusionCulling>,
@@ -1270,7 +1300,7 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
12701300
..
12711301
} = gpu_array_buffer.into_inner();
12721302

1273-
for (view, extracted_view, no_indirect_drawing, gpu_occlusion_culling) in &mut views {
1303+
for (extracted_view, no_indirect_drawing, gpu_occlusion_culling) in &mut views {
12741304
let Some(phase) = binned_render_phases.get_mut(&extracted_view.retained_view_entity) else {
12751305
continue;
12761306
};
@@ -1279,7 +1309,7 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
12791309
// used this frame.
12801310
let work_item_buffer = get_or_create_work_item_buffer::<BPI>(
12811311
work_item_buffers,
1282-
view,
1312+
extracted_view.retained_view_entity,
12831313
no_indirect_drawing,
12841314
gpu_occlusion_culling,
12851315
late_indexed_indirect_parameters_buffer,

examples/shader/specialized_mesh_pipeline.rs

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,6 @@ fn queue_custom_mesh_pipeline(
280280
),
281281
mut specialized_mesh_pipelines: ResMut<SpecializedMeshPipelines<CustomMeshPipeline>>,
282282
views: Query<(
283-
Entity,
284283
&RenderVisibleEntities,
285284
&ExtractedView,
286285
&Msaa,
@@ -318,14 +317,8 @@ fn queue_custom_mesh_pipeline(
318317
// Render phases are per-view, so we need to iterate over all views so that
319318
// the entity appears in them. (In this example, we have only one view, but
320319
// it's good practice to loop over all views anyway.)
321-
for (
322-
view_entity,
323-
view_visible_entities,
324-
view,
325-
msaa,
326-
no_indirect_drawing,
327-
gpu_occlusion_culling,
328-
) in views.iter()
320+
for (view_visible_entities, view, msaa, no_indirect_drawing, gpu_occlusion_culling) in
321+
views.iter()
329322
{
330323
let Some(opaque_phase) = opaque_render_phases.get_mut(&view.retained_view_entity) else {
331324
continue;
@@ -336,7 +329,7 @@ fn queue_custom_mesh_pipeline(
336329
// enabled.
337330
let work_item_buffer = gpu_preprocessing::get_or_create_work_item_buffer::<Opaque3d>(
338331
work_item_buffers,
339-
view_entity,
332+
view.retained_view_entity,
340333
no_indirect_drawing,
341334
gpu_occlusion_culling,
342335
late_indexed_indirect_parameters_buffer,

0 commit comments

Comments
 (0)