Rewrite the multidrawable batch set builder for performance. (#17923)
This commit restructures the multidrawable batch set builder for better performance in various ways:

* The bin traversal is optimized to make the best use of the CPU cache.
* The inner loop that iterates over the bins, which is the hottest part of `batch_and_prepare_binned_render_phase`, has been shrunk as small as possible.
* Where possible, multiple elements are added to or reserved from GPU buffers as a batch instead of one at a time.
* Methods that LLVM wasn't inlining have been marked `#[inline]` where doing so would unlock optimizations.

This code has also been refactored to avoid duplication between the logic for indexed and non-indexed meshes via the introduction of a `MultidrawableBatchSetPreparer` object.

Together, these changes improved the `batch_and_prepare_binned_render_phase` time on Caldera by approximately 2×.

Eventually, we should optimize the batchable-but-not-multidrawable and unbatchable logic as well, but these meshes are much rarer, so in the interests of keeping this patch relatively small I opted to leave those to a follow-up.
This commit is contained in:
parent
9e11e96a59
commit
f15437e4dc
@ -3138,7 +3138,7 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
|
|||||||
indirect_parameters_buffer.get(&TypeId::of::<P>())
|
indirect_parameters_buffer.get(&TypeId::of::<P>())
|
||||||
else {
|
else {
|
||||||
warn!(
|
warn!(
|
||||||
"Not rendering mesh because indexed indirect parameters buffer \
|
"Not rendering mesh because non-indexed indirect parameters buffer \
|
||||||
wasn't present for this phase",
|
wasn't present for this phase",
|
||||||
);
|
);
|
||||||
return RenderCommandResult::Skip;
|
return RenderCommandResult::Skip;
|
||||||
|
|||||||
@ -18,6 +18,7 @@ use bevy_platform_support::collections::{hash_map::Entry, HashMap, HashSet};
|
|||||||
use bevy_utils::{default, TypeIdMap};
|
use bevy_utils::{default, TypeIdMap};
|
||||||
use bytemuck::{Pod, Zeroable};
|
use bytemuck::{Pod, Zeroable};
|
||||||
use encase::{internal::WriteInto, ShaderSize};
|
use encase::{internal::WriteInto, ShaderSize};
|
||||||
|
use indexmap::IndexMap;
|
||||||
use nonmax::NonMaxU32;
|
use nonmax::NonMaxU32;
|
||||||
use tracing::error;
|
use tracing::error;
|
||||||
use wgpu::{BindingResource, BufferUsages, DownlevelFlags, Features};
|
use wgpu::{BindingResource, BufferUsages, DownlevelFlags, Features};
|
||||||
@ -27,11 +28,13 @@ use crate::{
|
|||||||
render_phase::{
|
render_phase::{
|
||||||
BinnedPhaseItem, BinnedRenderPhaseBatch, BinnedRenderPhaseBatchSet,
|
BinnedPhaseItem, BinnedRenderPhaseBatch, BinnedRenderPhaseBatchSet,
|
||||||
BinnedRenderPhaseBatchSets, CachedRenderPipelinePhaseItem, PhaseItem,
|
BinnedRenderPhaseBatchSets, CachedRenderPipelinePhaseItem, PhaseItem,
|
||||||
PhaseItemBatchSetKey as _, PhaseItemExtraIndex, SortedPhaseItem, SortedRenderPhase,
|
PhaseItemBatchSetKey as _, PhaseItemExtraIndex, RenderBin, SortedPhaseItem,
|
||||||
UnbatchableBinnedEntityIndices, ViewBinnedRenderPhases, ViewSortedRenderPhases,
|
SortedRenderPhase, UnbatchableBinnedEntityIndices, ViewBinnedRenderPhases,
|
||||||
|
ViewSortedRenderPhases,
|
||||||
},
|
},
|
||||||
render_resource::{Buffer, GpuArrayBufferable, RawBufferVec, UninitBufferVec},
|
render_resource::{Buffer, GpuArrayBufferable, RawBufferVec, UninitBufferVec},
|
||||||
renderer::{RenderAdapter, RenderDevice, RenderQueue},
|
renderer::{RenderAdapter, RenderDevice, RenderQueue},
|
||||||
|
sync_world::MainEntity,
|
||||||
view::{ExtractedView, NoIndirectDrawing, RetainedViewEntity},
|
view::{ExtractedView, NoIndirectDrawing, RetainedViewEntity},
|
||||||
Render, RenderApp, RenderDebugFlags, RenderSet,
|
Render, RenderApp, RenderDebugFlags, RenderSet,
|
||||||
};
|
};
|
||||||
@ -902,6 +905,7 @@ impl UntypedPhaseIndirectParametersBuffers {
|
|||||||
/// to are indexed or not. `indirect_parameters_base` specifies the offset
|
/// to are indexed or not. `indirect_parameters_base` specifies the offset
|
||||||
/// within `Self::indexed_data` or `Self::non_indexed_data` of the first
|
/// within `Self::indexed_data` or `Self::non_indexed_data` of the first
|
||||||
/// batch in this batch set.
|
/// batch in this batch set.
|
||||||
|
#[inline]
|
||||||
pub fn add_batch_set(&mut self, indexed: bool, indirect_parameters_base: u32) {
|
pub fn add_batch_set(&mut self, indexed: bool, indirect_parameters_base: u32) {
|
||||||
if indexed {
|
if indexed {
|
||||||
self.indexed.batch_sets.push(IndirectBatchSet {
|
self.indexed.batch_sets.push(IndirectBatchSet {
|
||||||
@ -1472,7 +1476,7 @@ pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
|
|||||||
/// Creates batches for a render phase that uses bins.
|
/// Creates batches for a render phase that uses bins.
|
||||||
pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
|
pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
|
||||||
mut phase_batched_instance_buffers: ResMut<PhaseBatchedInstanceBuffers<BPI, GFBD::BufferData>>,
|
mut phase_batched_instance_buffers: ResMut<PhaseBatchedInstanceBuffers<BPI, GFBD::BufferData>>,
|
||||||
mut phase_indirect_parameters_buffers: ResMut<PhaseIndirectParametersBuffers<BPI>>,
|
phase_indirect_parameters_buffers: ResMut<PhaseIndirectParametersBuffers<BPI>>,
|
||||||
mut binned_render_phases: ResMut<ViewBinnedRenderPhases<BPI>>,
|
mut binned_render_phases: ResMut<ViewBinnedRenderPhases<BPI>>,
|
||||||
mut views: Query<
|
mut views: Query<
|
||||||
(
|
(
|
||||||
@ -1489,6 +1493,8 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
|
|||||||
{
|
{
|
||||||
let system_param_item = param.into_inner();
|
let system_param_item = param.into_inner();
|
||||||
|
|
||||||
|
let phase_indirect_parameters_buffers = phase_indirect_parameters_buffers.into_inner();
|
||||||
|
|
||||||
let UntypedPhaseBatchedInstanceBuffers {
|
let UntypedPhaseBatchedInstanceBuffers {
|
||||||
ref mut data_buffer,
|
ref mut data_buffer,
|
||||||
ref mut work_item_buffers,
|
ref mut work_item_buffers,
|
||||||
@ -1519,102 +1525,69 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
|
|||||||
|
|
||||||
// Prepare multidrawables.
|
// Prepare multidrawables.
|
||||||
|
|
||||||
for (batch_set_key, bins) in &phase.multidrawable_meshes {
|
if let (
|
||||||
let mut batch_set = None;
|
&mut BinnedRenderPhaseBatchSets::MultidrawIndirect(ref mut batch_sets),
|
||||||
let indirect_parameters_base = phase_indirect_parameters_buffers
|
&mut PreprocessWorkItemBuffers::Indirect {
|
||||||
.buffers
|
indexed: ref mut indexed_work_item_buffer,
|
||||||
.batch_count(batch_set_key.indexed())
|
non_indexed: ref mut non_indexed_work_item_buffer,
|
||||||
as u32;
|
gpu_occlusion_culling: ref mut gpu_occlusion_culling_buffers,
|
||||||
for (bin_key, bin) in bins {
|
|
||||||
let mut batch: Option<BinnedRenderPhaseBatch> = None;
|
|
||||||
|
|
||||||
for (&main_entity, &input_index) in bin.entities() {
|
|
||||||
let output_index = data_buffer.add() as u32;
|
|
||||||
|
|
||||||
match batch {
|
|
||||||
Some(ref mut batch) => {
|
|
||||||
// Append to the current batch.
|
|
||||||
batch.instance_range.end = output_index + 1;
|
|
||||||
work_item_buffer.push(
|
|
||||||
batch_set_key.indexed(),
|
|
||||||
PreprocessWorkItem {
|
|
||||||
input_index: *input_index,
|
|
||||||
output_or_indirect_parameters_index: match batch.extra_index {
|
|
||||||
PhaseItemExtraIndex::IndirectParametersIndex {
|
|
||||||
ref range,
|
|
||||||
..
|
|
||||||
} => range.start,
|
|
||||||
PhaseItemExtraIndex::DynamicOffset(_)
|
|
||||||
| PhaseItemExtraIndex::None => 0,
|
|
||||||
},
|
},
|
||||||
},
|
) = (&mut phase.batch_sets, &mut *work_item_buffer)
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
None => {
|
|
||||||
// Start a new batch, in indirect mode.
|
|
||||||
let indirect_parameters_index = phase_indirect_parameters_buffers
|
|
||||||
.buffers
|
|
||||||
.allocate(batch_set_key.indexed(), 1);
|
|
||||||
let batch_set_index = phase_indirect_parameters_buffers
|
|
||||||
.buffers
|
|
||||||
.get_next_batch_set_index(batch_set_key.indexed());
|
|
||||||
|
|
||||||
GFBD::write_batch_indirect_parameters_metadata(
|
|
||||||
batch_set_key.indexed(),
|
|
||||||
output_index,
|
|
||||||
batch_set_index,
|
|
||||||
&mut phase_indirect_parameters_buffers.buffers,
|
|
||||||
indirect_parameters_index,
|
|
||||||
);
|
|
||||||
work_item_buffer.push(
|
|
||||||
batch_set_key.indexed(),
|
|
||||||
PreprocessWorkItem {
|
|
||||||
input_index: *input_index,
|
|
||||||
output_or_indirect_parameters_index: indirect_parameters_index,
|
|
||||||
},
|
|
||||||
);
|
|
||||||
batch = Some(BinnedRenderPhaseBatch {
|
|
||||||
representative_entity: (Entity::PLACEHOLDER, main_entity),
|
|
||||||
instance_range: output_index..output_index + 1,
|
|
||||||
extra_index: PhaseItemExtraIndex::maybe_indirect_parameters_index(
|
|
||||||
NonMaxU32::new(indirect_parameters_index),
|
|
||||||
),
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(batch) = batch {
|
|
||||||
match batch_set {
|
|
||||||
None => {
|
|
||||||
batch_set = Some(BinnedRenderPhaseBatchSet {
|
|
||||||
first_batch: batch,
|
|
||||||
batch_count: 1,
|
|
||||||
bin_key: bin_key.clone(),
|
|
||||||
index: phase_indirect_parameters_buffers
|
|
||||||
.buffers
|
|
||||||
.batch_set_count(batch_set_key.indexed())
|
|
||||||
as u32,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
Some(ref mut batch_set) => {
|
|
||||||
batch_set.batch_count += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if let BinnedRenderPhaseBatchSets::MultidrawIndirect(ref mut batch_sets) =
|
|
||||||
phase.batch_sets
|
|
||||||
{
|
{
|
||||||
if let Some(batch_set) = batch_set {
|
let mut output_index = data_buffer.len() as u32;
|
||||||
batch_sets.push(batch_set);
|
|
||||||
|
// Initialize the state for both indexed and non-indexed meshes.
|
||||||
|
let mut indexed_preparer: MultidrawableBatchSetPreparer<BPI, GFBD> =
|
||||||
|
MultidrawableBatchSetPreparer::new(
|
||||||
|
phase_indirect_parameters_buffers.buffers.batch_count(true) as u32,
|
||||||
phase_indirect_parameters_buffers
|
phase_indirect_parameters_buffers
|
||||||
.buffers
|
.buffers
|
||||||
.add_batch_set(batch_set_key.indexed(), indirect_parameters_base);
|
.indexed
|
||||||
|
.batch_sets
|
||||||
|
.len() as u32,
|
||||||
|
);
|
||||||
|
let mut non_indexed_preparer: MultidrawableBatchSetPreparer<BPI, GFBD> =
|
||||||
|
MultidrawableBatchSetPreparer::new(
|
||||||
|
phase_indirect_parameters_buffers.buffers.batch_count(false) as u32,
|
||||||
|
phase_indirect_parameters_buffers
|
||||||
|
.buffers
|
||||||
|
.non_indexed
|
||||||
|
.batch_sets
|
||||||
|
.len() as u32,
|
||||||
|
);
|
||||||
|
|
||||||
|
// Prepare each batch set.
|
||||||
|
for (batch_set_key, bins) in &phase.multidrawable_meshes {
|
||||||
|
if batch_set_key.indexed() {
|
||||||
|
indexed_preparer.prepare_multidrawable_binned_batch_set(
|
||||||
|
bins,
|
||||||
|
&mut output_index,
|
||||||
|
data_buffer,
|
||||||
|
indexed_work_item_buffer,
|
||||||
|
&mut phase_indirect_parameters_buffers.buffers.indexed,
|
||||||
|
batch_sets,
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
non_indexed_preparer.prepare_multidrawable_binned_batch_set(
|
||||||
|
bins,
|
||||||
|
&mut output_index,
|
||||||
|
data_buffer,
|
||||||
|
non_indexed_work_item_buffer,
|
||||||
|
&mut phase_indirect_parameters_buffers.buffers.non_indexed,
|
||||||
|
batch_sets,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reserve space in the occlusion culling buffers, if necessary.
|
||||||
|
if let Some(ref mut gpu_occlusion_culling_buffers) = gpu_occlusion_culling_buffers {
|
||||||
|
gpu_occlusion_culling_buffers
|
||||||
|
.late_indexed
|
||||||
|
.add_multiple(indexed_preparer.work_item_count);
|
||||||
|
gpu_occlusion_culling_buffers
|
||||||
|
.late_non_indexed
|
||||||
|
.add_multiple(non_indexed_preparer.work_item_count);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Prepare batchables.
|
// Prepare batchables.
|
||||||
@ -1814,6 +1787,137 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// The state that [`batch_and_prepare_binned_render_phase`] uses to construct
|
||||||
|
/// multidrawable batch sets.
|
||||||
|
///
|
||||||
|
/// The [`batch_and_prepare_binned_render_phase`] system maintains two of these:
|
||||||
|
/// one for indexed meshes and one for non-indexed meshes.
|
||||||
|
struct MultidrawableBatchSetPreparer<BPI, GFBD>
|
||||||
|
where
|
||||||
|
BPI: BinnedPhaseItem,
|
||||||
|
GFBD: GetFullBatchData,
|
||||||
|
{
|
||||||
|
/// The offset in the indirect parameters buffer at which the next indirect
|
||||||
|
/// parameters will be written.
|
||||||
|
indirect_parameters_index: u32,
|
||||||
|
/// The number of batch sets we've built so far for this mesh class.
|
||||||
|
batch_set_index: u32,
|
||||||
|
/// The number of work items we've emitted so far for this mesh class.
|
||||||
|
work_item_count: usize,
|
||||||
|
phantom: PhantomData<(BPI, GFBD)>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<BPI, GFBD> MultidrawableBatchSetPreparer<BPI, GFBD>
|
||||||
|
where
|
||||||
|
BPI: BinnedPhaseItem,
|
||||||
|
GFBD: GetFullBatchData,
|
||||||
|
{
|
||||||
|
/// Creates a new [`MultidrawableBatchSetPreparer`] that will start writing
|
||||||
|
/// indirect parameters and batch sets at the given indices.
|
||||||
|
#[inline]
|
||||||
|
fn new(initial_indirect_parameters_index: u32, initial_batch_set_index: u32) -> Self {
|
||||||
|
MultidrawableBatchSetPreparer {
|
||||||
|
indirect_parameters_index: initial_indirect_parameters_index,
|
||||||
|
batch_set_index: initial_batch_set_index,
|
||||||
|
work_item_count: 0,
|
||||||
|
phantom: PhantomData,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates batch sets and writes the GPU data needed to draw all visible
|
||||||
|
/// entities of one mesh class in the given batch set.
|
||||||
|
///
|
||||||
|
/// The *mesh class* represents whether the mesh has indices or not.
|
||||||
|
#[inline]
|
||||||
|
fn prepare_multidrawable_binned_batch_set<IP>(
|
||||||
|
&mut self,
|
||||||
|
bins: &IndexMap<BPI::BinKey, RenderBin>,
|
||||||
|
output_index: &mut u32,
|
||||||
|
data_buffer: &mut UninitBufferVec<GFBD::BufferData>,
|
||||||
|
indexed_work_item_buffer: &mut RawBufferVec<PreprocessWorkItem>,
|
||||||
|
mesh_class_buffers: &mut MeshClassIndirectParametersBuffers<IP>,
|
||||||
|
batch_sets: &mut Vec<BinnedRenderPhaseBatchSet<BPI::BinKey>>,
|
||||||
|
) where
|
||||||
|
IP: Clone + ShaderSize + WriteInto,
|
||||||
|
{
|
||||||
|
let current_indexed_batch_set_index = self.batch_set_index;
|
||||||
|
let current_output_index = *output_index;
|
||||||
|
|
||||||
|
let indirect_parameters_base = self.indirect_parameters_index;
|
||||||
|
|
||||||
|
// We're going to write the first entity into the batch set. Do this
|
||||||
|
// here so that we can preload the bin into cache as a side effect.
|
||||||
|
let Some((first_bin_key, first_bin)) = bins.iter().next() else {
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
let first_bin_len = first_bin.entities().len();
|
||||||
|
let first_bin_entity = first_bin
|
||||||
|
.entities()
|
||||||
|
.keys()
|
||||||
|
.next()
|
||||||
|
.copied()
|
||||||
|
.unwrap_or(MainEntity::from(Entity::PLACEHOLDER));
|
||||||
|
|
||||||
|
// Traverse the batch set, processing each bin.
|
||||||
|
for bin in bins.values() {
|
||||||
|
// Record the first output index for this batch, as well as its own
|
||||||
|
// index.
|
||||||
|
mesh_class_buffers
|
||||||
|
.cpu_metadata
|
||||||
|
.push(IndirectParametersCpuMetadata {
|
||||||
|
base_output_index: *output_index,
|
||||||
|
batch_set_index: self.batch_set_index,
|
||||||
|
});
|
||||||
|
|
||||||
|
// Traverse the bin, pushing `PreprocessWorkItem`s for each entity
|
||||||
|
// within it. This is a hot loop, so make it as fast as possible.
|
||||||
|
for &input_index in bin.entities().values() {
|
||||||
|
indexed_work_item_buffer.push(PreprocessWorkItem {
|
||||||
|
input_index: *input_index,
|
||||||
|
output_or_indirect_parameters_index: self.indirect_parameters_index,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reserve space for the appropriate number of entities in the data
|
||||||
|
// buffer. Also, advance the output index and work item count.
|
||||||
|
let bin_entity_count = bin.entities().len();
|
||||||
|
data_buffer.add_multiple(bin_entity_count);
|
||||||
|
*output_index += bin_entity_count as u32;
|
||||||
|
self.work_item_count += bin_entity_count;
|
||||||
|
|
||||||
|
self.indirect_parameters_index += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reserve space for the bins in this batch set in the GPU buffers.
|
||||||
|
let bin_count = bins.len();
|
||||||
|
mesh_class_buffers.gpu_metadata.add_multiple(bin_count);
|
||||||
|
mesh_class_buffers.data.add_multiple(bin_count);
|
||||||
|
|
||||||
|
// Write the information the GPU will need about this batch set.
|
||||||
|
mesh_class_buffers.batch_sets.push(IndirectBatchSet {
|
||||||
|
indirect_parameters_base,
|
||||||
|
indirect_parameters_count: 0,
|
||||||
|
});
|
||||||
|
|
||||||
|
self.batch_set_index += 1;
|
||||||
|
|
||||||
|
// Record the batch set. The render node later processes this record to
|
||||||
|
// render the batches.
|
||||||
|
batch_sets.push(BinnedRenderPhaseBatchSet {
|
||||||
|
first_batch: BinnedRenderPhaseBatch {
|
||||||
|
representative_entity: (Entity::PLACEHOLDER, first_bin_entity),
|
||||||
|
instance_range: current_output_index..(current_output_index + first_bin_len as u32),
|
||||||
|
extra_index: PhaseItemExtraIndex::maybe_indirect_parameters_index(NonMaxU32::new(
|
||||||
|
indirect_parameters_base,
|
||||||
|
)),
|
||||||
|
},
|
||||||
|
bin_key: (*first_bin_key).clone(),
|
||||||
|
batch_count: self.indirect_parameters_index - indirect_parameters_base,
|
||||||
|
index: current_indexed_batch_set_index,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// A system that gathers up the per-phase GPU buffers and inserts them into the
|
/// A system that gathers up the per-phase GPU buffers and inserts them into the
|
||||||
/// [`BatchedInstanceBuffers`] and [`IndirectParametersBuffers`] tables.
|
/// [`BatchedInstanceBuffers`] and [`IndirectParametersBuffers`] tables.
|
||||||
///
|
///
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user