Rewrite the multidrawable batch set builder for performance. (#17923)

This commit restructures the multidrawable batch set builder for better performance in various ways: * The bin traversal is optimized to make the best use of the CPU cache. * The inner loop that iterates over the bins, which is the hottest part of `batch_and_prepare_binned_render_phase`, has been shrunk as small as possible. * Where possible, multiple elements are added to or reserved from GPU buffers as a batch instead of one at a time. * Methods that LLVM wasn't inlining have been marked `#[inline]` where doing so would unlock optimizations. This code has also been refactored to avoid duplication between the logic for indexed and non-indexed meshes via the introduction of a `MultidrawableBatchSetPreparer` object. Together, this improved the `batch_and_prepare_binned_render_phase` time on Caldera by approximately 2×. Eventually, we should optimize the batchable-but-not-multidrawable and unbatchable logic as well, but these meshes are much rarer, so in the interests of keeping this patch relatively small I opted to leave those to a follow-up.
2025-02-20 03:45:47 -08:00 · 2025-02-20 03:45:47 -08:00 · f15437e4dc
commit f15437e4dc
parent 9e11e96a59
2 changed files with 198 additions and 94 deletions
--- a/crates/bevy_pbr/src/render/mesh.rs
+++ b/crates/bevy_pbr/src/render/mesh.rs
@ -3138,7 +3138,7 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
                        indirect_parameters_buffer.get(&TypeId::of::<P>())
                    else {
                        warn!(
-                            "Not rendering mesh because indexed indirect parameters buffer \
+                            "Not rendering mesh because non-indexed indirect parameters buffer \
                                 wasn't present for this phase",
                        );
                        return RenderCommandResult::Skip;
--- a/crates/bevy_render/src/batching/gpu_preprocessing.rs
+++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs
@ -18,6 +18,7 @@ use bevy_platform_support::collections::{hash_map::Entry, HashMap, HashSet};
 use bevy_utils::{default, TypeIdMap};
 use bytemuck::{Pod, Zeroable};
 use encase::{internal::WriteInto, ShaderSize};
+use indexmap::IndexMap;
 use nonmax::NonMaxU32;
 use tracing::error;
 use wgpu::{BindingResource, BufferUsages, DownlevelFlags, Features};
@ -27,11 +28,13 @@ use crate::{
    render_phase::{
        BinnedPhaseItem, BinnedRenderPhaseBatch, BinnedRenderPhaseBatchSet,
        BinnedRenderPhaseBatchSets, CachedRenderPipelinePhaseItem, PhaseItem,
-        PhaseItemBatchSetKey as _, PhaseItemExtraIndex, SortedPhaseItem, SortedRenderPhase,
-        UnbatchableBinnedEntityIndices, ViewBinnedRenderPhases, ViewSortedRenderPhases,
+        PhaseItemBatchSetKey as _, PhaseItemExtraIndex, RenderBin, SortedPhaseItem,
+        SortedRenderPhase, UnbatchableBinnedEntityIndices, ViewBinnedRenderPhases,
+        ViewSortedRenderPhases,
    },
    render_resource::{Buffer, GpuArrayBufferable, RawBufferVec, UninitBufferVec},
    renderer::{RenderAdapter, RenderDevice, RenderQueue},
+    sync_world::MainEntity,
    view::{ExtractedView, NoIndirectDrawing, RetainedViewEntity},
    Render, RenderApp, RenderDebugFlags, RenderSet,
 };
@ -902,6 +905,7 @@ impl UntypedPhaseIndirectParametersBuffers {
    /// to are indexed or not. `indirect_parameters_base` specifies the offset
    /// within `Self::indexed_data` or `Self::non_indexed_data` of the first
    /// batch in this batch set.
+    #[inline]
    pub fn add_batch_set(&mut self, indexed: bool, indirect_parameters_base: u32) {
        if indexed {
            self.indexed.batch_sets.push(IndirectBatchSet {
@ -1472,7 +1476,7 @@ pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
 /// Creates batches for a render phase that uses bins.
 pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
    mut phase_batched_instance_buffers: ResMut<PhaseBatchedInstanceBuffers<BPI, GFBD::BufferData>>,
-    mut phase_indirect_parameters_buffers: ResMut<PhaseIndirectParametersBuffers<BPI>>,
+    phase_indirect_parameters_buffers: ResMut<PhaseIndirectParametersBuffers<BPI>>,
    mut binned_render_phases: ResMut<ViewBinnedRenderPhases<BPI>>,
    mut views: Query<
        (
@ -1489,6 +1493,8 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
 {
    let system_param_item = param.into_inner();

+    let phase_indirect_parameters_buffers = phase_indirect_parameters_buffers.into_inner();
+
    let UntypedPhaseBatchedInstanceBuffers {
        ref mut data_buffer,
        ref mut work_item_buffers,
@ -1519,101 +1525,68 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(

        // Prepare multidrawables.

-        for (batch_set_key, bins) in &phase.multidrawable_meshes {
-            let mut batch_set = None;
-            let indirect_parameters_base = phase_indirect_parameters_buffers
-                .buffers
-                .batch_count(batch_set_key.indexed())
-                as u32;
-            for (bin_key, bin) in bins {
-                let mut batch: Option<BinnedRenderPhaseBatch> = None;
+        if let (
+            &mut BinnedRenderPhaseBatchSets::MultidrawIndirect(ref mut batch_sets),
+            &mut PreprocessWorkItemBuffers::Indirect {
+                indexed: ref mut indexed_work_item_buffer,
+                non_indexed: ref mut non_indexed_work_item_buffer,
+                gpu_occlusion_culling: ref mut gpu_occlusion_culling_buffers,
+            },
+        ) = (&mut phase.batch_sets, &mut *work_item_buffer)
+        {
+            let mut output_index = data_buffer.len() as u32;

-                for (&main_entity, &input_index) in bin.entities() {
-                    let output_index = data_buffer.add() as u32;
+            // Initialize the state for both indexed and non-indexed meshes.
+            let mut indexed_preparer: MultidrawableBatchSetPreparer<BPI, GFBD> =
+                MultidrawableBatchSetPreparer::new(
+                    phase_indirect_parameters_buffers.buffers.batch_count(true) as u32,
+                    phase_indirect_parameters_buffers
+                        .buffers
+                        .indexed
+                        .batch_sets
+                        .len() as u32,
+                );
+            let mut non_indexed_preparer: MultidrawableBatchSetPreparer<BPI, GFBD> =
+                MultidrawableBatchSetPreparer::new(
+                    phase_indirect_parameters_buffers.buffers.batch_count(false) as u32,
+                    phase_indirect_parameters_buffers
+                        .buffers
+                        .non_indexed
+                        .batch_sets
+                        .len() as u32,
+                );

-                    match batch {
-                        Some(ref mut batch) => {
-                            // Append to the current batch.
-                            batch.instance_range.end = output_index + 1;
-                            work_item_buffer.push(
-                                batch_set_key.indexed(),
-                                PreprocessWorkItem {
-                                    input_index: *input_index,
-                                    output_or_indirect_parameters_index: match batch.extra_index {
-                                        PhaseItemExtraIndex::IndirectParametersIndex {
-                                            ref range,
-                                            ..
-                                        } => range.start,
-                                        PhaseItemExtraIndex::DynamicOffset(_)
-                                        | PhaseItemExtraIndex::None => 0,
-                                    },
-                                },
-                            );
-                        }
-
-                        None => {
-                            // Start a new batch, in indirect mode.
-                            let indirect_parameters_index = phase_indirect_parameters_buffers
-                                .buffers
-                                .allocate(batch_set_key.indexed(), 1);
-                            let batch_set_index = phase_indirect_parameters_buffers
-                                .buffers
-                                .get_next_batch_set_index(batch_set_key.indexed());
-
-                            GFBD::write_batch_indirect_parameters_metadata(
-                                batch_set_key.indexed(),
-                                output_index,
-                                batch_set_index,
-                                &mut phase_indirect_parameters_buffers.buffers,
-                                indirect_parameters_index,
-                            );
-                            work_item_buffer.push(
-                                batch_set_key.indexed(),
-                                PreprocessWorkItem {
-                                    input_index: *input_index,
-                                    output_or_indirect_parameters_index: indirect_parameters_index,
-                                },
-                            );
-                            batch = Some(BinnedRenderPhaseBatch {
-                                representative_entity: (Entity::PLACEHOLDER, main_entity),
-                                instance_range: output_index..output_index + 1,
-                                extra_index: PhaseItemExtraIndex::maybe_indirect_parameters_index(
-                                    NonMaxU32::new(indirect_parameters_index),
-                                ),
-                            });
-                        }
-                    }
-                }
-
-                if let Some(batch) = batch {
-                    match batch_set {
-                        None => {
-                            batch_set = Some(BinnedRenderPhaseBatchSet {
-                                first_batch: batch,
-                                batch_count: 1,
-                                bin_key: bin_key.clone(),
-                                index: phase_indirect_parameters_buffers
-                                    .buffers
-                                    .batch_set_count(batch_set_key.indexed())
-                                    as u32,
-                            });
-                        }
-                        Some(ref mut batch_set) => {
-                            batch_set.batch_count += 1;
-                        }
-                    }
+            // Prepare each batch set.
+            for (batch_set_key, bins) in &phase.multidrawable_meshes {
+                if batch_set_key.indexed() {
+                    indexed_preparer.prepare_multidrawable_binned_batch_set(
+                        bins,
+                        &mut output_index,
+                        data_buffer,
+                        indexed_work_item_buffer,
+                        &mut phase_indirect_parameters_buffers.buffers.indexed,
+                        batch_sets,
+                    );
+                } else {
+                    non_indexed_preparer.prepare_multidrawable_binned_batch_set(
+                        bins,
+                        &mut output_index,
+                        data_buffer,
+                        non_indexed_work_item_buffer,
+                        &mut phase_indirect_parameters_buffers.buffers.non_indexed,
+                        batch_sets,
+                    );
                }
            }

-            if let BinnedRenderPhaseBatchSets::MultidrawIndirect(ref mut batch_sets) =
-                phase.batch_sets
-            {
-                if let Some(batch_set) = batch_set {
-                    batch_sets.push(batch_set);
-                    phase_indirect_parameters_buffers
-                        .buffers
-                        .add_batch_set(batch_set_key.indexed(), indirect_parameters_base);
-                }
+            // Reserve space in the occlusion culling buffers, if necessary.
+            if let Some(ref mut gpu_occlusion_culling_buffers) = gpu_occlusion_culling_buffers {
+                gpu_occlusion_culling_buffers
+                    .late_indexed
+                    .add_multiple(indexed_preparer.work_item_count);
+                gpu_occlusion_culling_buffers
+                    .late_non_indexed
+                    .add_multiple(non_indexed_preparer.work_item_count);
            }
        }

@ -1814,6 +1787,137 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
    }
 }

+/// The state that [`batch_and_prepare_binned_render_phase`] uses to construct
+/// multidrawable batch sets.
+///
+/// The [`batch_and_prepare_binned_render_phase`] system maintains two of these:
+/// one for indexed meshes and one for non-indexed meshes.
+struct MultidrawableBatchSetPreparer<BPI, GFBD>
+where
+    BPI: BinnedPhaseItem,
+    GFBD: GetFullBatchData,
+{
+    /// The offset in the indirect parameters buffer at which the next indirect
+    /// parameters will be written.
+    indirect_parameters_index: u32,
+    /// The number of batch sets we've built so far for this mesh class.
+    batch_set_index: u32,
+    /// The number of work items we've emitted so far for this mesh class.
+    work_item_count: usize,
+    phantom: PhantomData<(BPI, GFBD)>,
+}
+
+impl<BPI, GFBD> MultidrawableBatchSetPreparer<BPI, GFBD>
+where
+    BPI: BinnedPhaseItem,
+    GFBD: GetFullBatchData,
+{
+    /// Creates a new [`MultidrawableBatchSetPreparer`] that will start writing
+    /// indirect parameters and batch sets at the given indices.
+    #[inline]
+    fn new(initial_indirect_parameters_index: u32, initial_batch_set_index: u32) -> Self {
+        MultidrawableBatchSetPreparer {
+            indirect_parameters_index: initial_indirect_parameters_index,
+            batch_set_index: initial_batch_set_index,
+            work_item_count: 0,
+            phantom: PhantomData,
+        }
+    }
+
+    /// Creates batch sets and writes the GPU data needed to draw all visible
+    /// entities of one mesh class in the given batch set.
+    ///
+    /// The *mesh class* represents whether the mesh has indices or not.
+    #[inline]
+    fn prepare_multidrawable_binned_batch_set<IP>(
+        &mut self,
+        bins: &IndexMap<BPI::BinKey, RenderBin>,
+        output_index: &mut u32,
+        data_buffer: &mut UninitBufferVec<GFBD::BufferData>,
+        indexed_work_item_buffer: &mut RawBufferVec<PreprocessWorkItem>,
+        mesh_class_buffers: &mut MeshClassIndirectParametersBuffers<IP>,
+        batch_sets: &mut Vec<BinnedRenderPhaseBatchSet<BPI::BinKey>>,
+    ) where
+        IP: Clone + ShaderSize + WriteInto,
+    {
+        let current_indexed_batch_set_index = self.batch_set_index;
+        let current_output_index = *output_index;
+
+        let indirect_parameters_base = self.indirect_parameters_index;
+
+        // We're going to write the first entity into the batch set. Do this
+        // here so that we can preload the bin into cache as a side effect.
+        let Some((first_bin_key, first_bin)) = bins.iter().next() else {
+            return;
+        };
+        let first_bin_len = first_bin.entities().len();
+        let first_bin_entity = first_bin
+            .entities()
+            .keys()
+            .next()
+            .copied()
+            .unwrap_or(MainEntity::from(Entity::PLACEHOLDER));
+
+        // Traverse the batch set, processing each bin.
+        for bin in bins.values() {
+            // Record the first output index for this batch, as well as its own
+            // index.
+            mesh_class_buffers
+                .cpu_metadata
+                .push(IndirectParametersCpuMetadata {
+                    base_output_index: *output_index,
+                    batch_set_index: self.batch_set_index,
+                });
+
+            // Traverse the bin, pushing `PreprocessWorkItem`s for each entity
+            // within it. This is a hot loop, so make it as fast as possible.
+            for &input_index in bin.entities().values() {
+                indexed_work_item_buffer.push(PreprocessWorkItem {
+                    input_index: *input_index,
+                    output_or_indirect_parameters_index: self.indirect_parameters_index,
+                });
+            }
+
+            // Reserve space for the appropriate number of entities in the data
+            // buffer. Also, advance the output index and work item count.
+            let bin_entity_count = bin.entities().len();
+            data_buffer.add_multiple(bin_entity_count);
+            *output_index += bin_entity_count as u32;
+            self.work_item_count += bin_entity_count;
+
+            self.indirect_parameters_index += 1;
+        }
+
+        // Reserve space for the bins in this batch set in the GPU buffers.
+        let bin_count = bins.len();
+        mesh_class_buffers.gpu_metadata.add_multiple(bin_count);
+        mesh_class_buffers.data.add_multiple(bin_count);
+
+        // Write the information the GPU will need about this batch set.
+        mesh_class_buffers.batch_sets.push(IndirectBatchSet {
+            indirect_parameters_base,
+            indirect_parameters_count: 0,
+        });
+
+        self.batch_set_index += 1;
+
+        // Record the batch set. The render node later processes this record to
+        // render the batches.
+        batch_sets.push(BinnedRenderPhaseBatchSet {
+            first_batch: BinnedRenderPhaseBatch {
+                representative_entity: (Entity::PLACEHOLDER, first_bin_entity),
+                instance_range: current_output_index..(current_output_index + first_bin_len as u32),
+                extra_index: PhaseItemExtraIndex::maybe_indirect_parameters_index(NonMaxU32::new(
+                    indirect_parameters_base,
+                )),
+            },
+            bin_key: (*first_bin_key).clone(),
+            batch_count: self.indirect_parameters_index - indirect_parameters_base,
+            index: current_indexed_batch_set_index,
+        });
+    }
+}
+
 /// A system that gathers up the per-phase GPU buffers and inserts them into the
 /// [`BatchedInstanceBuffers`] and [`IndirectParametersBuffers`] tables.
 ///