Cache MeshInputUniform indices in each RenderBin. (#17772)

Currently, every frame, we look up each `MeshInputUniform` index in a
hash table that maps the main entity ID to the index. This is inefficient,
cache-unfriendly, and unnecessary, as the `MeshInputUniform` index for
an entity remains the same from frame to frame (even if the input
uniform changes). This commit changes the `IndexSet` in the `RenderBin`
to an `IndexMap` that maps the `MainEntity` to `InputUniformIndex`
(a new type that this patch adds for more type safety).

On Caldera with parallel `batch_and_prepare_binned_render_phase`, this
patch improves that function from 3.18 ms to 2.42 ms, a 31% speedup.
This commit is contained in:
Patrick Walton 2025-02-11 14:38:52 -08:00 committed by GitHub
parent ce433955e6
commit 85b366a8a2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 99 additions and 61 deletions

View File

@ -1002,6 +1002,7 @@ pub fn queue_material_meshes<M: Material>(
batch_set_key,
bin_key,
(*render_entity, *visible_entity),
mesh_instance.current_uniform_index,
BinnedRenderPhaseType::mesh(
mesh_instance.should_batch(),
&gpu_preprocessing_support,
@ -1025,6 +1026,7 @@ pub fn queue_material_meshes<M: Material>(
batch_set_key,
bin_key,
(*render_entity, *visible_entity),
mesh_instance.current_uniform_index,
BinnedRenderPhaseType::mesh(
mesh_instance.should_batch(),
&gpu_preprocessing_support,

View File

@ -1145,6 +1145,7 @@ pub fn queue_prepass_material_meshes<M: Material>(
asset_id: mesh_instance.mesh_asset_id.into(),
},
(*render_entity, *visible_entity),
mesh_instance.current_uniform_index,
BinnedRenderPhaseType::mesh(
mesh_instance.should_batch(),
&gpu_preprocessing_support,
@ -1169,6 +1170,7 @@ pub fn queue_prepass_material_meshes<M: Material>(
asset_id: mesh_instance.mesh_asset_id.into(),
},
(*render_entity, *visible_entity),
mesh_instance.current_uniform_index,
BinnedRenderPhaseType::mesh(
mesh_instance.should_batch(),
&gpu_preprocessing_support,
@ -1195,6 +1197,7 @@ pub fn queue_prepass_material_meshes<M: Material>(
batch_set_key,
bin_key,
(*render_entity, *visible_entity),
mesh_instance.current_uniform_index,
BinnedRenderPhaseType::mesh(
mesh_instance.should_batch(),
&gpu_preprocessing_support,
@ -1218,6 +1221,7 @@ pub fn queue_prepass_material_meshes<M: Material>(
batch_set_key,
bin_key,
(*render_entity, *visible_entity),
mesh_instance.current_uniform_index,
BinnedRenderPhaseType::mesh(
mesh_instance.should_batch(),
&gpu_preprocessing_support,

View File

@ -1946,6 +1946,7 @@ pub fn queue_shadows<M: Material>(
asset_id: mesh_instance.mesh_asset_id.into(),
},
(entity, main_entity),
mesh_instance.current_uniform_index,
BinnedRenderPhaseType::mesh(
mesh_instance.should_batch(),
&gpu_preprocessing_support,

View File

@ -30,7 +30,7 @@ use bevy_render::{
primitives::Aabb,
render_asset::RenderAssets,
render_phase::{
BinnedRenderPhasePlugin, PhaseItem, PhaseItemExtraIndex, RenderCommand,
BinnedRenderPhasePlugin, InputUniformIndex, PhaseItem, PhaseItemExtraIndex, RenderCommand,
RenderCommandResult, SortedRenderPhasePlugin, TrackedRenderPass,
},
render_resource::*,
@ -958,6 +958,7 @@ impl RenderMeshInstancesCpu {
.map(|render_mesh_instance| RenderMeshQueueData {
shared: &render_mesh_instance.shared,
translation: render_mesh_instance.transforms.world_from_local.translation,
current_uniform_index: InputUniformIndex::default(),
})
}
@ -981,6 +982,9 @@ impl RenderMeshInstancesGpu {
.map(|render_mesh_instance| RenderMeshQueueData {
shared: &render_mesh_instance.shared,
translation: render_mesh_instance.translation,
current_uniform_index: InputUniformIndex(
render_mesh_instance.current_uniform_index.into(),
),
})
}
@ -1281,6 +1285,9 @@ pub struct RenderMeshQueueData<'a> {
pub shared: &'a RenderMeshInstanceShared,
/// The translation of the mesh instance.
pub translation: Vec3,
/// The index of the [`MeshInputUniform`] in the GPU buffer for this mesh
/// instance.
pub current_uniform_index: InputUniformIndex,
}
/// A [`SystemSet`] that encompasses both [`extract_meshes_for_cpu_building`]
@ -1945,7 +1952,7 @@ impl GetFullBatchData for MeshPipeline {
}
fn write_batch_indirect_parameters_metadata(
mesh_index: u32,
mesh_index: InputUniformIndex,
indexed: bool,
base_output_index: u32,
batch_set_index: Option<NonMaxU32>,
@ -1953,7 +1960,7 @@ impl GetFullBatchData for MeshPipeline {
indirect_parameters_offset: u32,
) {
let indirect_parameters = IndirectParametersMetadata {
mesh_index,
mesh_index: *mesh_index,
base_output_index,
batch_set_index: match batch_set_index {
Some(batch_set_index) => u32::from(batch_set_index),

View File

@ -24,9 +24,9 @@ use crate::{
experimental::occlusion_culling::OcclusionCulling,
render_phase::{
BinnedPhaseItem, BinnedRenderPhaseBatch, BinnedRenderPhaseBatchSet,
BinnedRenderPhaseBatchSets, CachedRenderPipelinePhaseItem, PhaseItemBatchSetKey as _,
PhaseItemExtraIndex, SortedPhaseItem, SortedRenderPhase, UnbatchableBinnedEntityIndices,
ViewBinnedRenderPhases, ViewSortedRenderPhases,
BinnedRenderPhaseBatchSets, CachedRenderPipelinePhaseItem, InputUniformIndex,
PhaseItemBatchSetKey as _, PhaseItemExtraIndex, SortedPhaseItem, SortedRenderPhase,
UnbatchableBinnedEntityIndices, ViewBinnedRenderPhases, ViewSortedRenderPhases,
},
render_resource::{Buffer, BufferVec, GpuArrayBufferable, RawBufferVec, UninitBufferVec},
renderer::{RenderAdapter, RenderDevice, RenderQueue},
@ -1271,7 +1271,7 @@ pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
// Start a new batch.
if let Some(indirect_parameters_index) = indirect_parameters_index {
GFBD::write_batch_indirect_parameters_metadata(
current_input_index.into(),
InputUniformIndex(current_input_index.into()),
item_is_indexed,
output_index,
None,
@ -1382,12 +1382,7 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
let first_output_index = data_buffer.len() as u32;
let mut batch: Option<BinnedRenderPhaseBatch> = None;
for main_entity in bin.entities() {
let Some(input_index) =
GFBD::get_binned_index(&system_param_item, *main_entity)
else {
continue;
};
for (&main_entity, &input_index) in bin.entities() {
let output_index = data_buffer.add() as u32;
match batch {
@ -1397,7 +1392,7 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
work_item_buffer.push(
batch_set_key.indexed(),
PreprocessWorkItem {
input_index: input_index.into(),
input_index: *input_index,
output_index: first_output_index,
indirect_parameters_index: match batch.extra_index {
PhaseItemExtraIndex::IndirectParametersIndex {
@ -1419,7 +1414,7 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
.get_next_batch_set_index(batch_set_key.indexed());
GFBD::write_batch_indirect_parameters_metadata(
input_index.into(),
input_index,
batch_set_key.indexed(),
output_index,
batch_set_index,
@ -1429,13 +1424,13 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
work_item_buffer.push(
batch_set_key.indexed(),
PreprocessWorkItem {
input_index: input_index.into(),
input_index: *input_index,
output_index: first_output_index,
indirect_parameters_index,
},
);
batch = Some(BinnedRenderPhaseBatch {
representative_entity: (Entity::PLACEHOLDER, *main_entity),
representative_entity: (Entity::PLACEHOLDER, main_entity),
instance_range: output_index..output_index + 1,
extra_index: PhaseItemExtraIndex::maybe_indirect_parameters_index(
NonMaxU32::new(indirect_parameters_index),
@ -1481,11 +1476,7 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
let first_output_index = data_buffer.len() as u32;
let mut batch: Option<BinnedRenderPhaseBatch> = None;
for main_entity in phase.batchable_mesh_values[key].entities() {
let Some(input_index) = GFBD::get_binned_index(&system_param_item, *main_entity)
else {
continue;
};
for (&main_entity, &input_index) in phase.batchable_mesh_values[key].entities() {
let output_index = data_buffer.add() as u32;
match batch {
@ -1502,7 +1493,7 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
work_item_buffer.push(
key.0.indexed(),
PreprocessWorkItem {
input_index: input_index.into(),
input_index: *input_index,
output_index: if no_indirect_drawing {
output_index
} else {
@ -1528,7 +1519,7 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
indirect_parameters_buffers.get_next_batch_set_index(key.0.indexed());
GFBD::write_batch_indirect_parameters_metadata(
input_index.into(),
input_index,
key.0.indexed(),
output_index,
batch_set_index,
@ -1538,13 +1529,13 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
work_item_buffer.push(
key.0.indexed(),
PreprocessWorkItem {
input_index: input_index.into(),
input_index: *input_index,
output_index: first_output_index,
indirect_parameters_index,
},
);
batch = Some(BinnedRenderPhaseBatch {
representative_entity: (Entity::PLACEHOLDER, *main_entity),
representative_entity: (Entity::PLACEHOLDER, main_entity),
instance_range: output_index..output_index + 1,
extra_index: PhaseItemExtraIndex::IndirectParametersIndex {
range: indirect_parameters_index..(indirect_parameters_index + 1),
@ -1558,13 +1549,13 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
work_item_buffer.push(
key.0.indexed(),
PreprocessWorkItem {
input_index: input_index.into(),
input_index: *input_index,
output_index,
indirect_parameters_index: 0,
},
);
batch = Some(BinnedRenderPhaseBatch {
representative_entity: (Entity::PLACEHOLDER, *main_entity),
representative_entity: (Entity::PLACEHOLDER, main_entity),
instance_range: output_index..output_index + 1,
extra_index: PhaseItemExtraIndex::None,
});
@ -1627,7 +1618,7 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
// We're in indirect mode, so add an indirect parameters
// index.
GFBD::write_batch_indirect_parameters_metadata(
input_index.into(),
InputUniformIndex(input_index.into()),
key.0.indexed(),
output_index,
None,

View File

@ -7,7 +7,6 @@ use bytemuck::Pod;
use nonmax::NonMaxU32;
use self::gpu_preprocessing::IndirectParametersBuffers;
use crate::{render_phase::PhaseItemExtraIndex, sync_world::MainEntity};
use crate::{
render_phase::{
BinnedPhaseItem, CachedRenderPipelinePhaseItem, DrawFunctionId, SortedPhaseItem,
@ -15,6 +14,10 @@ use crate::{
},
render_resource::{CachedRenderPipelineId, GpuArrayBufferable},
};
use crate::{
render_phase::{InputUniformIndex, PhaseItemExtraIndex},
sync_world::MainEntity,
};
pub mod gpu_preprocessing;
pub mod no_gpu_preprocessing;
@ -132,12 +135,17 @@ pub trait GetFullBatchData: GetBatchData {
) -> Option<(NonMaxU32, Option<Self::CompareData>)>;
/// Returns the index of the [`GetFullBatchData::BufferInputData`] that the
/// GPU preprocessing phase will use, for the binning path.
/// GPU preprocessing phase will use.
///
/// We already inserted the [`GetFullBatchData::BufferInputData`] during the
/// extraction phase before we got here, so this function shouldn't need to
/// look up any render data. If CPU instance buffer building is in use, this
/// function will never be called.
/// look up any render data.
///
/// This function is currently only called for unbatchable entities when GPU
/// instance buffer building is in use. For batchable entities, the uniform
/// index is written during queuing (e.g. in `queue_material_meshes`). In
/// the case of CPU instance buffer building, the CPU writes the uniforms,
/// so there's no index to return.
fn get_binned_index(
param: &SystemParamItem<Self::Param>,
query_item: MainEntity,
@ -167,7 +175,7 @@ pub trait GetFullBatchData: GetBatchData {
/// * `indirect_parameters_offset` is the index in that buffer at which to
/// write the metadata.
fn write_batch_indirect_parameters_metadata(
mesh_index: u32,
mesh_index: InputUniformIndex,
indexed: bool,
base_output_index: u32,
batch_set_index: Option<NonMaxU32>,

View File

@ -110,7 +110,7 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
for key in &phase.batchable_mesh_keys {
let mut batch_set: SmallVec<[BinnedRenderPhaseBatch; 1]> = smallvec![];
for main_entity in phase.batchable_mesh_values[key].entities() {
for main_entity in phase.batchable_mesh_values[key].entities().keys() {
let Some(buffer_data) =
GFBD::get_binned_batch_data(&system_param_item, *main_entity)
else {

View File

@ -38,7 +38,7 @@ pub use draw::*;
pub use draw_state::*;
use encase::{internal::WriteInto, ShaderSize};
use fixedbitset::{Block, FixedBitSet};
use indexmap::{IndexMap, IndexSet};
use indexmap::IndexMap;
use nonmax::NonMaxU32;
pub use rangefinder::*;
use wgpu::Features;
@ -191,8 +191,9 @@ where
/// a [`BinnedRenderPhase`].
#[derive(Default)]
pub struct RenderBin {
/// A list of the entities in each bin.
entities: IndexSet<MainEntity, EntityHash>,
/// A list of the entities in each bin, along with their cached
/// [`InputUniformIndex`].
entities: IndexMap<MainEntity, InputUniformIndex, EntityHash>,
}
/// Information that we track about an entity that was in one bin on the
@ -422,6 +423,19 @@ where
}
}
/// The index of the uniform describing this object in the GPU buffer, when GPU
/// preprocessing is enabled.
///
/// For example, for 3D meshes, this is the index of the `MeshInputUniform` in
/// the buffer.
///
/// This index is ignored if GPU preprocessing isn't in use, such as (currently)
/// in the case of 2D meshes. In that case, it can be safely set to
/// [`core::default::Default::default`].
///
/// This is a `#[repr(transparent)]` newtype around a `u32`; the wrapped value
/// is public and is also reachable through the derived `Deref`/`DerefMut`.
#[derive(Clone, Copy, PartialEq, Default, Deref, DerefMut)]
#[repr(transparent)]
pub struct InputUniformIndex(pub u32);
impl<BPI> BinnedRenderPhase<BPI>
where
BPI: BinnedPhaseItem,
@ -436,6 +450,7 @@ where
batch_set_key: BPI::BatchSetKey,
bin_key: BPI::BinKey,
(entity, main_entity): (Entity, MainEntity),
input_uniform_index: InputUniformIndex,
phase_type: BinnedRenderPhaseType,
change_tick: Tick,
) {
@ -447,11 +462,14 @@ where
.get_mut()
.entry(bin_key.clone())
.or_default()
.insert(main_entity);
.insert(main_entity, input_uniform_index);
}
Entry::Vacant(entry) => {
let mut new_batch_set = HashMap::default();
new_batch_set.insert(bin_key.clone(), RenderBin::from_entity(main_entity));
new_batch_set.insert(
bin_key.clone(),
RenderBin::from_entity(main_entity, input_uniform_index),
);
entry.insert(new_batch_set);
}
}
@ -463,10 +481,10 @@ where
.entry((batch_set_key.clone(), bin_key.clone()).clone())
{
Entry::Occupied(mut entry) => {
entry.get_mut().insert(main_entity);
entry.get_mut().insert(main_entity, input_uniform_index);
}
Entry::Vacant(entry) => {
entry.insert(RenderBin::from_entity(main_entity));
entry.insert(RenderBin::from_entity(main_entity, input_uniform_index));
}
}
}
@ -497,10 +515,10 @@ where
.entry((batch_set_key.clone(), bin_key.clone()).clone())
{
Entry::Occupied(mut entry) => {
entry.get_mut().insert(main_entity);
entry.get_mut().insert(main_entity, input_uniform_index);
}
Entry::Vacant(entry) => {
entry.insert(RenderBin::from_entity(main_entity));
entry.insert(RenderBin::from_entity(main_entity, input_uniform_index));
}
}
}
@ -753,7 +771,7 @@ where
let mut draw_functions = draw_functions.write();
for ((batch_set_key, bin_key), bin) in &self.non_mesh_items {
for &entity in &bin.entities {
for &entity in bin.entities.keys() {
// Come up with a fake batch range and extra index. The draw
// function is expected to manage any sort of batching logic itself.
let binned_phase_item = BPI::new(
@ -1631,15 +1649,15 @@ impl BinnedRenderPhaseType {
impl RenderBin {
/// Creates a [`RenderBin`] containing a single entity.
fn from_entity(entity: MainEntity) -> RenderBin {
let mut entities = IndexSet::default();
entities.insert(entity);
fn from_entity(entity: MainEntity, uniform_index: InputUniformIndex) -> RenderBin {
let mut entities = IndexMap::default();
entities.insert(entity, uniform_index);
RenderBin { entities }
}
/// Inserts an entity into the bin.
fn insert(&mut self, entity: MainEntity) {
self.entities.insert(entity);
fn insert(&mut self, entity: MainEntity, uniform_index: InputUniformIndex) {
self.entities.insert(entity, uniform_index);
}
/// Removes an entity from the bin.
@ -1652,9 +1670,10 @@ impl RenderBin {
self.entities.is_empty()
}
/// Returns the [`IndexSet`] containing all the entities in the bin.
/// Returns the [`IndexMap`] containing all the entities in the bin, along
/// with the cached [`InputUniformIndex`] of each.
#[inline]
pub fn entities(&self) -> &IndexSet<MainEntity, EntityHash> {
pub fn entities(&self) -> &IndexMap<MainEntity, InputUniformIndex, EntityHash> {
&self.entities
}
}

View File

@ -22,7 +22,7 @@ use bevy_ecs::{
use bevy_math::FloatOrd;
use bevy_platform_support::collections::HashMap;
use bevy_reflect::{prelude::ReflectDefault, Reflect};
use bevy_render::render_phase::DrawFunctionId;
use bevy_render::render_phase::{DrawFunctionId, InputUniformIndex};
use bevy_render::render_resource::CachedRenderPipelineId;
use bevy_render::view::RenderVisibleEntities;
use bevy_render::{
@ -809,6 +809,7 @@ pub fn queue_material2d_meshes<M: Material2d>(
},
bin_key,
(*render_entity, *visible_entity),
InputUniformIndex::default(),
binned_render_phase_type,
current_change_tick,
);
@ -826,6 +827,7 @@ pub fn queue_material2d_meshes<M: Material2d>(
},
bin_key,
(*render_entity, *visible_entity),
InputUniformIndex::default(),
binned_render_phase_type,
current_change_tick,
);

View File

@ -1,5 +1,6 @@
use bevy_app::Plugin;
use bevy_asset::{load_internal_asset, weak_handle, AssetId, Handle};
use bevy_render::render_phase::InputUniformIndex;
use crate::{tonemapping_pipeline_key, Material2dBindGroupId};
use bevy_core_pipeline::tonemapping::DebandDither;
@ -474,7 +475,7 @@ impl GetFullBatchData for Mesh2dPipeline {
}
fn write_batch_indirect_parameters_metadata(
input_index: u32,
input_index: InputUniformIndex,
indexed: bool,
base_output_index: u32,
batch_set_index: Option<NonMaxU32>,
@ -485,7 +486,7 @@ impl GetFullBatchData for Mesh2dPipeline {
// though they actually have distinct layouts. See the comment above that
// type for more information.
let indirect_parameters = IndirectParametersMetadata {
mesh_index: input_index,
mesh_index: *input_index,
base_output_index,
batch_set_index: match batch_set_index {
None => !0,

View File

@ -19,8 +19,9 @@ use bevy::{
extract_component::{ExtractComponent, ExtractComponentPlugin},
primitives::Aabb,
render_phase::{
AddRenderCommand, BinnedRenderPhaseType, DrawFunctions, PhaseItem, RenderCommand,
RenderCommandResult, SetItemPipeline, TrackedRenderPass, ViewBinnedRenderPhases,
AddRenderCommand, BinnedRenderPhaseType, DrawFunctions, InputUniformIndex, PhaseItem,
RenderCommand, RenderCommandResult, SetItemPipeline, TrackedRenderPass,
ViewBinnedRenderPhases,
},
render_resource::{
BufferUsages, ColorTargetState, ColorWrites, CompareFunction, DepthStencilState,
@ -277,6 +278,7 @@ fn queue_custom_phase_item(
asset_id: AssetId::<Mesh>::invalid().untyped(),
},
entity,
InputUniformIndex::default(),
BinnedRenderPhaseType::NonMesh,
*next_tick,
);

View File

@ -41,8 +41,8 @@ use bevy::{
},
render_phase::{
sort_phase_system, AddRenderCommand, CachedRenderPipelinePhaseItem, DrawFunctionId,
DrawFunctions, PhaseItem, PhaseItemExtraIndex, SetItemPipeline, SortedPhaseItem,
ViewSortedRenderPhases,
DrawFunctions, InputUniformIndex, PhaseItem, PhaseItemExtraIndex, SetItemPipeline,
SortedPhaseItem, ViewSortedRenderPhases,
},
render_resource::{
CachedRenderPipelineId, ColorTargetState, ColorWrites, Face, FragmentState, FrontFace,
@ -431,7 +431,7 @@ impl GetFullBatchData for StencilPipeline {
}
fn write_batch_indirect_parameters_metadata(
mesh_index: u32,
mesh_index: InputUniformIndex,
indexed: bool,
base_output_index: u32,
batch_set_index: Option<NonMaxU32>,
@ -442,7 +442,7 @@ impl GetFullBatchData for StencilPipeline {
// though they actually have distinct layouts. See the comment above that
// type for more information.
let indirect_parameters = IndirectParametersMetadata {
mesh_index,
mesh_index: *mesh_index,
base_output_index,
batch_set_index: match batch_set_index {
None => !0,

View File

@ -427,6 +427,7 @@ fn queue_custom_mesh_pipeline(
asset_id: AssetId::<Mesh>::invalid().untyped(),
},
(render_entity, visible_entity),
mesh_instance.current_uniform_index,
// This example supports batching, but if your pipeline doesn't
// support it you can use `BinnedRenderPhaseType::UnbatchableMesh`
BinnedRenderPhaseType::BatchableMesh,