From f15437e4dc85afec53463c4f79e502eaa5838878 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Thu, 20 Feb 2025 03:45:47 -0800 Subject: [PATCH] Rewrite the multidrawable batch set builder for performance. (#17923) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit restructures the multidrawable batch set builder for better performance in various ways: * The bin traversal is optimized to make the best use of the CPU cache. * The inner loop that iterates over the bins, which is the hottest part of `batch_and_prepare_binned_render_phase`, has been shrunk as small as possible. * Where possible, multiple elements are added to or reserved from GPU buffers as a batch instead of one at a time. * Methods that LLVM wasn't inlining have been marked `#[inline]` where doing so would unlock optimizations. This code has also been refactored to avoid duplication between the logic for indexed and non-indexed meshes via the introduction of a `MultidrawableBatchSetPreparer` object. Together, these changes improved the `batch_and_prepare_binned_render_phase` time on Caldera by approximately 2×. Eventually, we should optimize the batchable-but-not-multidrawable and unbatchable logic as well, but these meshes are much rarer, so in the interests of keeping this patch relatively small I opted to leave those to a follow-up. --- crates/bevy_pbr/src/render/mesh.rs | 2 +- .../src/batching/gpu_preprocessing.rs | 290 ++++++++++++------ 2 files changed, 198 insertions(+), 94 deletions(-) diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index 637d4ff846..caaa5877f5 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -3138,7 +3138,7 @@ impl RenderCommand

for DrawMesh { indirect_parameters_buffer.get(&TypeId::of::

()) else { warn!( - "Not rendering mesh because indexed indirect parameters buffer \ + "Not rendering mesh because non-indexed indirect parameters buffer \ wasn't present for this phase", ); return RenderCommandResult::Skip; diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs index f1eb7b0e99..661bc901f4 100644 --- a/crates/bevy_render/src/batching/gpu_preprocessing.rs +++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs @@ -18,6 +18,7 @@ use bevy_platform_support::collections::{hash_map::Entry, HashMap, HashSet}; use bevy_utils::{default, TypeIdMap}; use bytemuck::{Pod, Zeroable}; use encase::{internal::WriteInto, ShaderSize}; +use indexmap::IndexMap; use nonmax::NonMaxU32; use tracing::error; use wgpu::{BindingResource, BufferUsages, DownlevelFlags, Features}; @@ -27,11 +28,13 @@ use crate::{ render_phase::{ BinnedPhaseItem, BinnedRenderPhaseBatch, BinnedRenderPhaseBatchSet, BinnedRenderPhaseBatchSets, CachedRenderPipelinePhaseItem, PhaseItem, - PhaseItemBatchSetKey as _, PhaseItemExtraIndex, SortedPhaseItem, SortedRenderPhase, - UnbatchableBinnedEntityIndices, ViewBinnedRenderPhases, ViewSortedRenderPhases, + PhaseItemBatchSetKey as _, PhaseItemExtraIndex, RenderBin, SortedPhaseItem, + SortedRenderPhase, UnbatchableBinnedEntityIndices, ViewBinnedRenderPhases, + ViewSortedRenderPhases, }, render_resource::{Buffer, GpuArrayBufferable, RawBufferVec, UninitBufferVec}, renderer::{RenderAdapter, RenderDevice, RenderQueue}, + sync_world::MainEntity, view::{ExtractedView, NoIndirectDrawing, RetainedViewEntity}, Render, RenderApp, RenderDebugFlags, RenderSet, }; @@ -902,6 +905,7 @@ impl UntypedPhaseIndirectParametersBuffers { /// to are indexed or not. `indirect_parameters_base` specifies the offset /// within `Self::indexed_data` or `Self::non_indexed_data` of the first /// batch in this batch set. 
+ #[inline] pub fn add_batch_set(&mut self, indexed: bool, indirect_parameters_base: u32) { if indexed { self.indexed.batch_sets.push(IndirectBatchSet { @@ -1472,7 +1476,7 @@ pub fn batch_and_prepare_sorted_render_phase( /// Creates batches for a render phase that uses bins. pub fn batch_and_prepare_binned_render_phase( mut phase_batched_instance_buffers: ResMut>, - mut phase_indirect_parameters_buffers: ResMut>, + phase_indirect_parameters_buffers: ResMut>, mut binned_render_phases: ResMut>, mut views: Query< ( @@ -1489,6 +1493,8 @@ pub fn batch_and_prepare_binned_render_phase( { let system_param_item = param.into_inner(); + let phase_indirect_parameters_buffers = phase_indirect_parameters_buffers.into_inner(); + let UntypedPhaseBatchedInstanceBuffers { ref mut data_buffer, ref mut work_item_buffers, @@ -1519,101 +1525,68 @@ pub fn batch_and_prepare_binned_render_phase( // Prepare multidrawables. - for (batch_set_key, bins) in &phase.multidrawable_meshes { - let mut batch_set = None; - let indirect_parameters_base = phase_indirect_parameters_buffers - .buffers - .batch_count(batch_set_key.indexed()) - as u32; - for (bin_key, bin) in bins { - let mut batch: Option = None; + if let ( + &mut BinnedRenderPhaseBatchSets::MultidrawIndirect(ref mut batch_sets), + &mut PreprocessWorkItemBuffers::Indirect { + indexed: ref mut indexed_work_item_buffer, + non_indexed: ref mut non_indexed_work_item_buffer, + gpu_occlusion_culling: ref mut gpu_occlusion_culling_buffers, + }, + ) = (&mut phase.batch_sets, &mut *work_item_buffer) + { + let mut output_index = data_buffer.len() as u32; - for (&main_entity, &input_index) in bin.entities() { - let output_index = data_buffer.add() as u32; + // Initialize the state for both indexed and non-indexed meshes. 
+ let mut indexed_preparer: MultidrawableBatchSetPreparer = + MultidrawableBatchSetPreparer::new( + phase_indirect_parameters_buffers.buffers.batch_count(true) as u32, + phase_indirect_parameters_buffers + .buffers + .indexed + .batch_sets + .len() as u32, + ); + let mut non_indexed_preparer: MultidrawableBatchSetPreparer = + MultidrawableBatchSetPreparer::new( + phase_indirect_parameters_buffers.buffers.batch_count(false) as u32, + phase_indirect_parameters_buffers + .buffers + .non_indexed + .batch_sets + .len() as u32, + ); - match batch { - Some(ref mut batch) => { - // Append to the current batch. - batch.instance_range.end = output_index + 1; - work_item_buffer.push( - batch_set_key.indexed(), - PreprocessWorkItem { - input_index: *input_index, - output_or_indirect_parameters_index: match batch.extra_index { - PhaseItemExtraIndex::IndirectParametersIndex { - ref range, - .. - } => range.start, - PhaseItemExtraIndex::DynamicOffset(_) - | PhaseItemExtraIndex::None => 0, - }, - }, - ); - } - - None => { - // Start a new batch, in indirect mode. 
- let indirect_parameters_index = phase_indirect_parameters_buffers - .buffers - .allocate(batch_set_key.indexed(), 1); - let batch_set_index = phase_indirect_parameters_buffers - .buffers - .get_next_batch_set_index(batch_set_key.indexed()); - - GFBD::write_batch_indirect_parameters_metadata( - batch_set_key.indexed(), - output_index, - batch_set_index, - &mut phase_indirect_parameters_buffers.buffers, - indirect_parameters_index, - ); - work_item_buffer.push( - batch_set_key.indexed(), - PreprocessWorkItem { - input_index: *input_index, - output_or_indirect_parameters_index: indirect_parameters_index, - }, - ); - batch = Some(BinnedRenderPhaseBatch { - representative_entity: (Entity::PLACEHOLDER, main_entity), - instance_range: output_index..output_index + 1, - extra_index: PhaseItemExtraIndex::maybe_indirect_parameters_index( - NonMaxU32::new(indirect_parameters_index), - ), - }); - } - } - } - - if let Some(batch) = batch { - match batch_set { - None => { - batch_set = Some(BinnedRenderPhaseBatchSet { - first_batch: batch, - batch_count: 1, - bin_key: bin_key.clone(), - index: phase_indirect_parameters_buffers - .buffers - .batch_set_count(batch_set_key.indexed()) - as u32, - }); - } - Some(ref mut batch_set) => { - batch_set.batch_count += 1; - } - } + // Prepare each batch set. 
+ for (batch_set_key, bins) in &phase.multidrawable_meshes { + if batch_set_key.indexed() { + indexed_preparer.prepare_multidrawable_binned_batch_set( + bins, + &mut output_index, + data_buffer, + indexed_work_item_buffer, + &mut phase_indirect_parameters_buffers.buffers.indexed, + batch_sets, + ); + } else { + non_indexed_preparer.prepare_multidrawable_binned_batch_set( + bins, + &mut output_index, + data_buffer, + non_indexed_work_item_buffer, + &mut phase_indirect_parameters_buffers.buffers.non_indexed, + batch_sets, + ); } } - if let BinnedRenderPhaseBatchSets::MultidrawIndirect(ref mut batch_sets) = - phase.batch_sets - { - if let Some(batch_set) = batch_set { - batch_sets.push(batch_set); - phase_indirect_parameters_buffers - .buffers - .add_batch_set(batch_set_key.indexed(), indirect_parameters_base); - } + // Reserve space in the occlusion culling buffers, if necessary. + if let Some(ref mut gpu_occlusion_culling_buffers) = gpu_occlusion_culling_buffers { + gpu_occlusion_culling_buffers + .late_indexed + .add_multiple(indexed_preparer.work_item_count); + gpu_occlusion_culling_buffers + .late_non_indexed + .add_multiple(non_indexed_preparer.work_item_count); } } @@ -1814,6 +1787,137 @@ pub fn batch_and_prepare_binned_render_phase( } } +/// The state that [`batch_and_prepare_binned_render_phase`] uses to construct +/// multidrawable batch sets. +/// +/// The [`batch_and_prepare_binned_render_phase`] system maintains two of these: +/// one for indexed meshes and one for non-indexed meshes. +struct MultidrawableBatchSetPreparer +where + BPI: BinnedPhaseItem, + GFBD: GetFullBatchData, +{ + /// The offset in the indirect parameters buffer at which the next indirect + /// parameters will be written. + indirect_parameters_index: u32, + /// The number of batch sets we've built so far for this mesh class. + batch_set_index: u32, + /// The number of work items we've emitted so far for this mesh class. 
+ work_item_count: usize, + phantom: PhantomData<(BPI, GFBD)>, +} + +impl MultidrawableBatchSetPreparer +where + BPI: BinnedPhaseItem, + GFBD: GetFullBatchData, +{ + /// Creates a new [`MultidrawableBatchSetPreparer`] that will start writing + /// indirect parameters and batch sets at the given indices. + #[inline] + fn new(initial_indirect_parameters_index: u32, initial_batch_set_index: u32) -> Self { + MultidrawableBatchSetPreparer { + indirect_parameters_index: initial_indirect_parameters_index, + batch_set_index: initial_batch_set_index, + work_item_count: 0, + phantom: PhantomData, + } + } + + /// Creates batch sets and writes the GPU data needed to draw all visible + /// entities of one mesh class in the given batch set. + /// + /// The *mesh class* represents whether the mesh has indices or not. + #[inline] + fn prepare_multidrawable_binned_batch_set( + &mut self, + bins: &IndexMap, + output_index: &mut u32, + data_buffer: &mut UninitBufferVec, + indexed_work_item_buffer: &mut RawBufferVec, + mesh_class_buffers: &mut MeshClassIndirectParametersBuffers, + batch_sets: &mut Vec>, + ) where + IP: Clone + ShaderSize + WriteInto, + { + let current_indexed_batch_set_index = self.batch_set_index; + let current_output_index = *output_index; + + let indirect_parameters_base = self.indirect_parameters_index; + + // We're going to write the first entity into the batch set. Do this + // here so that we can preload the bin into cache as a side effect. + let Some((first_bin_key, first_bin)) = bins.iter().next() else { + return; + }; + let first_bin_len = first_bin.entities().len(); + let first_bin_entity = first_bin + .entities() + .keys() + .next() + .copied() + .unwrap_or(MainEntity::from(Entity::PLACEHOLDER)); + + // Traverse the batch set, processing each bin. + for bin in bins.values() { + // Record the first output index for this batch, as well as its own + // index. 
+ mesh_class_buffers + .cpu_metadata + .push(IndirectParametersCpuMetadata { + base_output_index: *output_index, + batch_set_index: self.batch_set_index, + }); + + // Traverse the bin, pushing `PreprocessWorkItem`s for each entity + // within it. This is a hot loop, so make it as fast as possible. + for &input_index in bin.entities().values() { + indexed_work_item_buffer.push(PreprocessWorkItem { + input_index: *input_index, + output_or_indirect_parameters_index: self.indirect_parameters_index, + }); + } + + // Reserve space for the appropriate number of entities in the data + // buffer. Also, advance the output index and work item count. + let bin_entity_count = bin.entities().len(); + data_buffer.add_multiple(bin_entity_count); + *output_index += bin_entity_count as u32; + self.work_item_count += bin_entity_count; + + self.indirect_parameters_index += 1; + } + + // Reserve space for the bins in this batch set in the GPU buffers. + let bin_count = bins.len(); + mesh_class_buffers.gpu_metadata.add_multiple(bin_count); + mesh_class_buffers.data.add_multiple(bin_count); + + // Write the information the GPU will need about this batch set. + mesh_class_buffers.batch_sets.push(IndirectBatchSet { + indirect_parameters_base, + indirect_parameters_count: 0, + }); + + self.batch_set_index += 1; + + // Record the batch set. The render node later processes this record to + // render the batches. 
+ batch_sets.push(BinnedRenderPhaseBatchSet { + first_batch: BinnedRenderPhaseBatch { + representative_entity: (Entity::PLACEHOLDER, first_bin_entity), + instance_range: current_output_index..(current_output_index + first_bin_len as u32), + extra_index: PhaseItemExtraIndex::maybe_indirect_parameters_index(NonMaxU32::new( + indirect_parameters_base, + )), + }, + bin_key: (*first_bin_key).clone(), + batch_count: self.indirect_parameters_index - indirect_parameters_base, + index: current_indexed_batch_set_index, + }); + } +} + /// A system that gathers up the per-phase GPU buffers and inserts them into the /// [`BatchedInstanceBuffers`] and [`IndirectParametersBuffers`] tables. ///