Refactor batch_and_prepare_binned_render_phase in preparation for bin retention. (#16922)

This commit makes the following changes:

* `IndirectParametersBuffer` has been changed from a `BufferVec` to a `RawBufferVec`. This won about 20us or so on Bistro by avoiding `encase` overhead.

* The methods on the `GetFullBatchData` trait no longer have the `entity` parameter, as it was unused.

* `PreprocessWorkItem`, which specifies a transform-and-cull operation, now supplies the mesh instance uniform output index directly instead of having the shader look it up from the indirect draw parameters. Accordingly, the responsibility of writing the output index to the indirect draw parameters has been moved from the CPU to the GPU. This is in preparation for retained indirect instance draw commands, where the mesh instance uniform output index may change from frame to frame, while the indirect instance draw commands will be cached. We won't want the CPU to have to upload the same indirect draw parameters again and again if a batch didn't change from frame to frame.

* `batch_and_prepare_binned_render_phase` and `batch_and_prepare_sorted_render_phase` now allocate indirect draw commands for an entire batch set at a time when possible, instead of one batch at a time. This change will allow us to retain the indirect draw commands for whole batch sets.

* `GetFullBatchData::get_batch_indirect_parameters_index` has been replaced with `GetFullBatchData::write_batch_indirect_parameters`, which takes an offset and writes into it instead of allocating. This is necessary in order to use the optimization mentioned in the previous point.

* At the WGSL level, `IndirectParameters` has been factored out into `mesh_preprocess_types.wgsl`. This is because we'll need a new compute shader that zeroes out the instance counts in preparation for a new frame. That shader will need to access `IndirectParameters`, so it was moved to a separate file.

* Bins are no longer raw vectors but are instances of a separate type, `RenderBin`. This is so that the bin can eventually contain its retained batches.
2024-12-30 14:11:31 -06:00 · 2024-12-30 14:11:31 -06:00 · 7767a8d161
commit 7767a8d161
parent fde7968168
10 changed files with 293 additions and 138 deletions
--- a/crates/bevy_pbr/src/render/gpu_preprocess.rs
+++ b/crates/bevy_pbr/src/render/gpu_preprocess.rs
@ -47,6 +47,9 @@ use crate::{
 /// The handle to the `mesh_preprocess.wgsl` compute shader.
 pub const MESH_PREPROCESS_SHADER_HANDLE: Handle<Shader> =
    Handle::weak_from_u128(16991728318640779533);
+/// The handle to the `mesh_preprocess_types.wgsl` shader, which holds shared types.
+pub const MESH_PREPROCESS_TYPES_SHADER_HANDLE: Handle<Shader> =
+    Handle::weak_from_u128(2720440370122465935);

 /// The GPU workgroup size.
 const WORKGROUP_SIZE: usize = 64;
@ -127,6 +130,12 @@ impl Plugin for GpuMeshPreprocessPlugin {
            "mesh_preprocess.wgsl",
            Shader::from_wgsl
        );
+        load_internal_asset!(
+            app,
+            MESH_PREPROCESS_TYPES_SHADER_HANDLE,
+            "mesh_preprocess_types.wgsl",
+            Shader::from_wgsl
+        );
    }

    fn finish(&self, app: &mut App) {
--- a/crates/bevy_pbr/src/render/mesh.rs
+++ b/crates/bevy_pbr/src/render/mesh.rs
@ -1631,7 +1631,7 @@ impl GetFullBatchData for MeshPipeline {

    fn get_index_and_compare_data(
        (mesh_instances, lightmaps, _, _, _): &SystemParamItem<Self::Param>,
-        (_entity, main_entity): (Entity, MainEntity),
+        main_entity: MainEntity,
    ) -> Option<(NonMaxU32, Option<Self::CompareData>)> {
        // This should only be called during GPU building.
        let RenderMeshInstances::GpuBuilding(ref mesh_instances) = **mesh_instances else {
@ -1657,7 +1657,7 @@ impl GetFullBatchData for MeshPipeline {

    fn get_binned_batch_data(
        (mesh_instances, lightmaps, _, mesh_allocator, skin_indices): &SystemParamItem<Self::Param>,
-        (_entity, main_entity): (Entity, MainEntity),
+        main_entity: MainEntity,
    ) -> Option<Self::BufferData> {
        let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else {
            error!(
@ -1688,7 +1688,7 @@ impl GetFullBatchData for MeshPipeline {

    fn get_binned_index(
        (mesh_instances, _, _, _, _): &SystemParamItem<Self::Param>,
-        (_entity, main_entity): (Entity, MainEntity),
+        main_entity: MainEntity,
    ) -> Option<NonMaxU32> {
        // This should only be called during GPU building.
        let RenderMeshInstances::GpuBuilding(ref mesh_instances) = **mesh_instances else {
@ -1704,46 +1704,53 @@ impl GetFullBatchData for MeshPipeline {
            .map(|entity| entity.current_uniform_index)
    }

-    fn get_batch_indirect_parameters_index(
+    fn write_batch_indirect_parameters(
        (mesh_instances, _, meshes, mesh_allocator, _): &SystemParamItem<Self::Param>,
        indirect_parameters_buffer: &mut IndirectParametersBuffer,
-        entity: (Entity, MainEntity),
-        instance_index: u32,
-    ) -> Option<NonMaxU32> {
-        get_batch_indirect_parameters_index(
+        indirect_parameters_offset: u32,
+        main_entity: MainEntity,
+    ) {
+        write_batch_indirect_parameters(
            mesh_instances,
            meshes,
            mesh_allocator,
            indirect_parameters_buffer,
-            entity,
-            instance_index,
-        )
+            indirect_parameters_offset,
+            main_entity,
+        );
    }
 }

 /// Pushes a set of [`IndirectParameters`] onto the [`IndirectParametersBuffer`]
 /// for the given mesh instance, and returns the index of those indirect
 /// parameters.
-fn get_batch_indirect_parameters_index(
+fn write_batch_indirect_parameters(
    mesh_instances: &RenderMeshInstances,
    meshes: &RenderAssets<RenderMesh>,
    mesh_allocator: &MeshAllocator,
    indirect_parameters_buffer: &mut IndirectParametersBuffer,
-    (_entity, main_entity): (Entity, MainEntity),
-    instance_index: u32,
-) -> Option<NonMaxU32> {
+    indirect_parameters_offset: u32,
+    main_entity: MainEntity,
+) {
    // This should only be called during GPU building.
    let RenderMeshInstances::GpuBuilding(ref mesh_instances) = *mesh_instances else {
        error!(
-            "`get_batch_indirect_parameters_index` should never be called in CPU mesh uniform \
+            "`write_batch_indirect_parameters_index` should never be called in CPU mesh uniform \
                building mode"
        );
-        return None;
+        return;
    };

-    let mesh_instance = mesh_instances.get(&main_entity)?;
-    let mesh = meshes.get(mesh_instance.mesh_asset_id)?;
-    let vertex_buffer_slice = mesh_allocator.mesh_vertex_slice(&mesh_instance.mesh_asset_id)?;
+    let Some(mesh_instance) = mesh_instances.get(&main_entity) else {
+        return;
+    };
+    let Some(mesh) = meshes.get(mesh_instance.mesh_asset_id) else {
+        return;
+    };
+    let Some(vertex_buffer_slice) = mesh_allocator.mesh_vertex_slice(&mesh_instance.mesh_asset_id)
+    else {
+        return;
+    };

    // Note that `IndirectParameters` covers both of these structures, even
    // though they actually have distinct layouts. See the comment above that
@ -1752,28 +1759,31 @@ fn get_batch_indirect_parameters_index(
        RenderMeshBufferInfo::Indexed {
            count: index_count, ..
        } => {
-            let index_buffer_slice =
-                mesh_allocator.mesh_index_slice(&mesh_instance.mesh_asset_id)?;
+            let Some(index_buffer_slice) =
+                mesh_allocator.mesh_index_slice(&mesh_instance.mesh_asset_id)
+            else {
+                return;
+            };
            IndirectParameters {
                vertex_or_index_count: index_count,
                instance_count: 0,
                first_vertex_or_first_index: index_buffer_slice.range.start,
                base_vertex_or_first_instance: vertex_buffer_slice.range.start,
-                first_instance: instance_index,
+                first_instance: 0,
            }
        }
        RenderMeshBufferInfo::NonIndexed => IndirectParameters {
            vertex_or_index_count: mesh.vertex_count,
            instance_count: 0,
            first_vertex_or_first_index: vertex_buffer_slice.range.start,
-            base_vertex_or_first_instance: instance_index,
-            first_instance: instance_index,
+            base_vertex_or_first_instance: 0,
+            // Use `0xffffffff` as a placeholder to tell the mesh preprocessing
+            // shader that this is a non-indexed mesh.
+            first_instance: !0,
        },
    };

-    (indirect_parameters_buffer.push(indirect_parameters) as u32)
-        .try_into()
-        .ok()
+    indirect_parameters_buffer.set(indirect_parameters_offset, indirect_parameters);
 }

 bitflags::bitflags! {
--- a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl
+++ b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl
@ -8,6 +8,7 @@
 // so that TAA works.

 #import bevy_pbr::mesh_types::{Mesh, MESH_FLAGS_NO_FRUSTUM_CULLING_BIT}
+#import bevy_pbr::mesh_preprocess_types::IndirectParameters
 #import bevy_render::maths
 #import bevy_render::view::View

@ -47,26 +48,11 @@ struct PreprocessWorkItem {
    // The index of the `MeshInput` in the `current_input` buffer that we read
    // from.
    input_index: u32,
-    // In direct mode, the index of the `Mesh` in `output` that we write to. In
-    // indirect mode, the index of the `IndirectParameters` in
-    // `indirect_parameters` that we write to.
+    // The index of the `Mesh` in `output` that we write to.
    output_index: u32,
-}
-
-// The `wgpu` indirect parameters structure. This is a union of two structures.
-// For more information, see the corresponding comment in
-// `gpu_preprocessing.rs`.
-struct IndirectParameters {
-    // `vertex_count` or `index_count`.
-    data0: u32,
-    // `instance_count` in both structures.
-    instance_count: atomic<u32>,
-    // `first_vertex` in both structures.
-    first_vertex: u32,
-    // `first_instance` or `base_vertex`.
-    data1: u32,
-    // A read-only copy of `instance_index`.
-    instance_index: u32,
+    // The index of the `IndirectParameters` in `indirect_parameters` that we
+    // write to.
+    indirect_parameters_index: u32,
 }

 // The current frame's `MeshInput`.
@ -138,9 +124,12 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {
        return;
    }

-    // Unpack.
+    // Unpack the work item.
    let input_index = work_items[instance_index].input_index;
    let output_index = work_items[instance_index].output_index;
+    let indirect_parameters_index = work_items[instance_index].indirect_parameters_index;
+
+    // Unpack the input matrix.
    let world_from_local_affine_transpose = current_input[input_index].world_from_local;
    let world_from_local = maths::affine3_to_square(world_from_local_affine_transpose);

@ -181,11 +170,28 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {
    // instance index in the indirect parameters structure. Otherwise, this
    // index was directly supplied to us.
 #ifdef INDIRECT
-    let mesh_output_index = indirect_parameters[output_index].instance_index +
-        atomicAdd(&indirect_parameters[output_index].instance_count, 1u);
-#else
+    let batch_output_index =
+        atomicAdd(&indirect_parameters[indirect_parameters_index].instance_count, 1u);
+    let mesh_output_index = output_index + batch_output_index;
+
+    // If this is the first mesh in the batch, write the first instance index
+    // into the indirect parameters.
+    //
+    // We could have done this on CPU, but when we start retaining indirect
+    // parameters that will no longer be desirable, as the index of the first
+    // instance will change from frame to frame and we won't want the CPU to
+    // have to keep updating it.
+    if (batch_output_index == 0u) {
+        if (indirect_parameters[indirect_parameters_index].first_instance == 0xffffffffu) {
+            indirect_parameters[indirect_parameters_index].base_vertex_or_first_instance =
+                mesh_output_index;
+        } else {
+            indirect_parameters[indirect_parameters_index].first_instance = mesh_output_index;
+        }
+    }
+#else   // INDIRECT
    let mesh_output_index = output_index;
-#endif
+#endif  // INDIRECT

    // Write the output.
    output[mesh_output_index].world_from_local = world_from_local_affine_transpose;
--- a/crates/bevy_pbr/src/render/mesh_preprocess_types.wgsl
+++ b/crates/bevy_pbr/src/render/mesh_preprocess_types.wgsl
@ -0,0 +1,19 @@
+// Types needed for GPU mesh uniform building.
+
+#define_import_path bevy_pbr::mesh_preprocess_types
+
+// The `wgpu` indirect parameters structure. This is a union of two structures.
+// For more information, see the corresponding comment in
+// `gpu_preprocessing.rs`.
+struct IndirectParameters {
+    // `vertex_count` or `index_count`.
+    vertex_count_or_index_count: u32,
+    // `instance_count` in both structures.
+    instance_count: atomic<u32>,
+    // `first_vertex` or `first_index`.
+    first_vertex_or_first_index: u32,
+    // `base_vertex` or `first_instance`.
+    base_vertex_or_first_instance: u32,
+    // `first_instance` for indexed meshes; `0xffffffff` marks a non-indexed mesh.
+    first_instance: u32,
+}
--- a/crates/bevy_render/src/batching/gpu_preprocessing.rs
+++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs
@ -1,7 +1,6 @@
 //! Batching functionality when GPU preprocessing is in use.

 use bevy_app::{App, Plugin};
-use bevy_derive::{Deref, DerefMut};
 use bevy_ecs::{
    entity::{Entity, EntityHashMap},
    query::{Has, With},
@ -22,7 +21,7 @@ use crate::{
        SortedRenderPhase, UnbatchableBinnedEntityIndices, ViewBinnedRenderPhases,
        ViewSortedRenderPhases,
    },
-    render_resource::{BufferVec, GpuArrayBufferable, RawBufferVec, UninitBufferVec},
+    render_resource::{Buffer, BufferVec, GpuArrayBufferable, RawBufferVec, UninitBufferVec},
    renderer::{RenderAdapter, RenderDevice, RenderQueue},
    view::{ExtractedView, NoIndirectDrawing},
    Render, RenderApp, RenderSet,
@ -260,10 +259,13 @@ pub struct PreprocessWorkItem {
    /// The index of the batch input data in the input buffer that the shader
    /// reads from.
    pub input_index: u32,
-    /// In direct mode, this is the index of the `MeshUniform` in the output
-    /// buffer that we write to. In indirect mode, this is the index of the
-    /// [`IndirectParameters`].
+    /// The index of the `MeshUniform` in the output buffer that we write to.
+    /// In direct mode, this is the index of the uniform. In indirect mode, this
+    /// is the index of the first uniform in the batch set.
    pub output_index: u32,
+    /// The index of the [`IndirectParameters`] in the
+    /// [`IndirectParametersBuffer`].
+    pub indirect_parameters_index: u32,
 }

 /// The `wgpu` indirect parameters structure.
@ -331,15 +333,38 @@ pub struct IndirectParameters {
 }

 /// The buffer containing the list of [`IndirectParameters`], for draw commands.
-#[derive(Resource, Deref, DerefMut)]
-pub struct IndirectParametersBuffer(pub BufferVec<IndirectParameters>);
+#[derive(Resource)]
+pub struct IndirectParametersBuffer {
+    /// The actual buffer.
+    buffer: RawBufferVec<IndirectParameters>,
+}

 impl IndirectParametersBuffer {
    /// Creates the indirect parameters buffer.
    pub fn new() -> IndirectParametersBuffer {
-        IndirectParametersBuffer(BufferVec::new(
-            BufferUsages::STORAGE | BufferUsages::INDIRECT,
-        ))
+        IndirectParametersBuffer {
+            buffer: RawBufferVec::new(BufferUsages::STORAGE | BufferUsages::INDIRECT),
+        }
+    }
+
+    /// Returns the underlying GPU buffer.
+    #[inline]
+    pub fn buffer(&self) -> Option<&Buffer> {
+        self.buffer.buffer()
+    }
+
+    /// Appends `count` zeroed indirect parameters and returns the index of the first.
+    pub fn allocate(&mut self, count: u32) -> u32 {
+        let length = self.buffer.len();
+        self.buffer.reserve_internal(count as usize);
+        for _ in 0..count {
+            self.buffer.push(Zeroable::zeroed());
+        }
+        length as u32
+    }
+
+    pub fn set(&mut self, index: u32, value: IndirectParameters) {
+        self.buffer.set(index, value);
    }
 }

@ -362,7 +387,8 @@ impl FromWorld for GpuPreprocessingSupport {
            crate::get_adreno_model(adapter).is_some_and(|model| model != 720 && model <= 730)
        }

-        let max_supported_mode = if device.limits().max_compute_workgroup_size_x == 0 || is_non_supported_android_device(adapter)
+        let max_supported_mode = if device.limits().max_compute_workgroup_size_x == 0 ||
+            is_non_supported_android_device(adapter)
        {
            GpuPreprocessingMode::None
        } else if !device
@ -543,11 +569,21 @@ pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(

        // Walk through the list of phase items, building up batches as we go.
        let mut batch: Option<SortedRenderBatch<GFBD>> = None;
+
+        // Allocate the indirect parameters if necessary.
+        let mut indirect_parameters_offset = if no_indirect_drawing {
+            None
+        } else {
+            Some(indirect_parameters_buffer.allocate(phase.items.len() as u32))
+        };
+
+        let mut first_output_index = data_buffer.len() as u32;
+
        for current_index in 0..phase.items.len() {
            // Get the index of the input data, and comparison metadata, for
            // this entity.
            let item = &phase.items[current_index];
-            let entity = (item.entity(), item.main_entity());
+            let entity = item.main_entity();
            let current_batch_input_index =
                GFBD::get_index_and_compare_data(&system_param_item, entity);

@ -578,7 +614,7 @@ pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(

            // Make space in the data buffer for this instance.
            let item = &phase.items[current_index];
-            let entity = (item.entity(), item.main_entity());
+            let entity = item.main_entity();
            let output_index = data_buffer.add() as u32;

            // If we can't batch, break the existing batch and make a new one.
@ -589,22 +625,27 @@ pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
                }

                // Start a new batch.
-                let indirect_parameters_index = if !no_indirect_drawing {
-                    GFBD::get_batch_indirect_parameters_index(
+                if let Some(indirect_parameters_offset) = indirect_parameters_offset {
+                    GFBD::write_batch_indirect_parameters(
                        &system_param_item,
                        &mut indirect_parameters_buffer,
+                        indirect_parameters_offset,
                        entity,
-                        output_index,
-                    )
-                } else {
-                    None
+                    );
                };
+
                batch = Some(SortedRenderBatch {
                    phase_item_start_index: current_index as u32,
                    instance_start_index: output_index,
-                    indirect_parameters_index,
+                    indirect_parameters_index: indirect_parameters_offset.and_then(NonMaxU32::new),
                    meta: current_meta,
                });
+
+                if let Some(ref mut indirect_parameters_offset) = indirect_parameters_offset {
+                    *indirect_parameters_offset += 1;
+                }
+
+                first_output_index = output_index;
            }

            // Add a new preprocessing work item so that the preprocessing
@ -612,9 +653,14 @@ pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
            if let Some(batch) = batch.as_ref() {
                work_item_buffer.buffer.push(PreprocessWorkItem {
                    input_index: current_input_index.into(),
-                    output_index: match batch.indirect_parameters_index {
+                    output_index: if no_indirect_drawing {
+                        output_index
+                    } else {
+                        first_output_index
+                    },
+                    indirect_parameters_index: match batch.indirect_parameters_index {
                        Some(indirect_parameters_index) => indirect_parameters_index.into(),
-                        None => output_index,
+                        None => 0,
                    },
                });
            }
@ -671,10 +717,11 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
        let mut maybe_last_multidraw_key = None;

        for key in &phase.batchable_mesh_keys {
+            let first_output_index = data_buffer.len() as u32;
+
            let mut batch: Option<BinnedRenderPhaseBatch> = None;
-            for &(entity, main_entity) in &phase.batchable_mesh_values[key] {
-                let Some(input_index) =
-                    GFBD::get_binned_index(&system_param_item, (entity, main_entity))
+            for &(entity, main_entity) in &phase.batchable_mesh_values[key].entities {
+                let Some(input_index) = GFBD::get_binned_index(&system_param_item, main_entity)
                else {
                    continue;
                };
@ -682,37 +729,51 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(

                match batch {
                    Some(ref mut batch) => {
-                        // Append to the current batch.
                        batch.instance_range.end = output_index + 1;
+
+                        // Append to the current batch.
+                        //
+                        // If we're in indirect mode, then we write the first
+                        // output index of this batch, so that we have a
+                        // tightly-packed buffer if GPU culling discards some of
+                        // the instances. Otherwise, we can just write the
+                        // output index directly.
                        work_item_buffer.buffer.push(PreprocessWorkItem {
                            input_index: input_index.into(),
-                            output_index: match batch.extra_index {
+                            output_index: if no_indirect_drawing {
+                                output_index
+                            } else {
+                                first_output_index
+                            },
+                            indirect_parameters_index: match batch.extra_index {
                                PhaseItemExtraIndex::IndirectParametersIndex(ref range) => {
                                    range.start
                                }
                                PhaseItemExtraIndex::DynamicOffset(_)
-                                | PhaseItemExtraIndex::None => output_index,
+                                | PhaseItemExtraIndex::None => 0,
                            },
                        });
                    }

                    None if !no_indirect_drawing => {
                        // Start a new batch, in indirect mode.
-                        let indirect_parameters_index = GFBD::get_batch_indirect_parameters_index(
+                        let indirect_parameters_index = indirect_parameters_buffer.allocate(1);
+                        GFBD::write_batch_indirect_parameters(
                            &system_param_item,
                            &mut indirect_parameters_buffer,
-                            (entity, main_entity),
-                            output_index,
+                            indirect_parameters_index,
+                            main_entity,
                        );
                        work_item_buffer.buffer.push(PreprocessWorkItem {
                            input_index: input_index.into(),
-                            output_index: indirect_parameters_index.unwrap_or_default().into(),
+                            output_index: first_output_index,
+                            indirect_parameters_index,
                        });
                        batch = Some(BinnedRenderPhaseBatch {
                            representative_entity: (entity, main_entity),
                            instance_range: output_index..output_index + 1,
                            extra_index: PhaseItemExtraIndex::maybe_indirect_parameters_index(
-                                indirect_parameters_index,
+                                NonMaxU32::new(indirect_parameters_index),
                            ),
                        });
                    }
@ -722,6 +783,7 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
                        work_item_buffer.buffer.push(PreprocessWorkItem {
                            input_index: input_index.into(),
                            output_index,
+                            indirect_parameters_index: 0,
                        });
                        batch = Some(BinnedRenderPhaseBatch {
                            representative_entity: (entity, main_entity),
@ -763,41 +825,49 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
        // Prepare unbatchables.
        for key in &phase.unbatchable_mesh_keys {
            let unbatchables = phase.unbatchable_mesh_values.get_mut(key).unwrap();
-            for &(entity, main_entity) in &unbatchables.entities {
-                let Some(input_index) =
-                    GFBD::get_binned_index(&system_param_item, (entity, main_entity))
+
+            // Allocate the indirect parameters if necessary.
+            let mut indirect_parameters_offset = if no_indirect_drawing {
+                None
+            } else {
+                Some(indirect_parameters_buffer.allocate(unbatchables.entities.len() as u32))
+            };
+
+            for &(_, main_entity) in &unbatchables.entities {
+                let Some(input_index) = GFBD::get_binned_index(&system_param_item, main_entity)
                else {
                    continue;
                };
                let output_index = data_buffer.add() as u32;

-                if !no_indirect_drawing {
+                if let Some(ref mut indirect_parameters_index) = indirect_parameters_offset {
                    // We're in indirect mode, so add an indirect parameters
                    // index.
-                    let indirect_parameters_index = GFBD::get_batch_indirect_parameters_index(
+                    GFBD::write_batch_indirect_parameters(
                        &system_param_item,
                        &mut indirect_parameters_buffer,
-                        (entity, main_entity),
-                        output_index,
-                    )
-                    .unwrap_or_default();
+                        *indirect_parameters_index,
+                        main_entity,
+                    );
                    work_item_buffer.buffer.push(PreprocessWorkItem {
                        input_index: input_index.into(),
-                        output_index: indirect_parameters_index.into(),
+                        output_index,
+                        indirect_parameters_index: *indirect_parameters_index,
                    });
                    unbatchables
                        .buffer_indices
                        .add(UnbatchableBinnedEntityIndices {
-                            instance_index: indirect_parameters_index.into(),
+                            instance_index: *indirect_parameters_index,
                            extra_index: PhaseItemExtraIndex::IndirectParametersIndex(
-                                u32::from(indirect_parameters_index)
-                                    ..(u32::from(indirect_parameters_index) + 1),
+                                *indirect_parameters_index..(*indirect_parameters_index + 1),
                            ),
                        });
+                    *indirect_parameters_index += 1;
                } else {
                    work_item_buffer.buffer.push(PreprocessWorkItem {
                        input_index: input_index.into(),
                        output_index,
+                        indirect_parameters_index: 0,
                    });
                    unbatchables
                        .buffer_indices
@ -846,6 +916,8 @@ pub fn write_indirect_parameters_buffer(
    render_queue: Res<RenderQueue>,
    mut indirect_parameters_buffer: ResMut<IndirectParametersBuffer>,
 ) {
-    indirect_parameters_buffer.write_buffer(&render_device, &render_queue);
-    indirect_parameters_buffer.clear();
+    indirect_parameters_buffer
+        .buffer
+        .write_buffer(&render_device, &render_queue);
+    indirect_parameters_buffer.buffer.clear();
 }
--- a/crates/bevy_render/src/batching/mod.rs
+++ b/crates/bevy_render/src/batching/mod.rs
@ -114,7 +114,7 @@ pub trait GetFullBatchData: GetBatchData {
    /// [`GetFullBatchData::get_index_and_compare_data`] instead.
    fn get_binned_batch_data(
        param: &SystemParamItem<Self::Param>,
-        query_item: (Entity, MainEntity),
+        query_item: MainEntity,
    ) -> Option<Self::BufferData>;

    /// Returns the index of the [`GetFullBatchData::BufferInputData`] that the
@ -126,7 +126,7 @@ pub trait GetFullBatchData: GetBatchData {
    /// function will never be called.
    fn get_index_and_compare_data(
        param: &SystemParamItem<Self::Param>,
-        query_item: (Entity, MainEntity),
+        query_item: MainEntity,
    ) -> Option<(NonMaxU32, Option<Self::CompareData>)>;

    /// Returns the index of the [`GetFullBatchData::BufferInputData`] that the
@ -138,21 +138,21 @@ pub trait GetFullBatchData: GetBatchData {
    /// function will never be called.
    fn get_binned_index(
        param: &SystemParamItem<Self::Param>,
-        query_item: (Entity, MainEntity),
+        query_item: MainEntity,
    ) -> Option<NonMaxU32>;

-    /// Pushes [`gpu_preprocessing::IndirectParameters`] necessary to draw this
-    /// batch onto the given [`IndirectParametersBuffer`], and returns its
+    /// Writes the [`gpu_preprocessing::IndirectParameters`] necessary to draw
+    /// this batch into the given [`IndirectParametersBuffer`] at the given
    /// index.
    ///
    /// This is only used if GPU culling is enabled (which requires GPU
    /// preprocessing).
-    fn get_batch_indirect_parameters_index(
+    fn write_batch_indirect_parameters(
        param: &SystemParamItem<Self::Param>,
        indirect_parameters_buffer: &mut IndirectParametersBuffer,
-        entity: (Entity, MainEntity),
-        instance_index: u32,
-    ) -> Option<NonMaxU32>;
+        indirect_parameters_offset: u32,
+        entity: MainEntity,
+    );
 }

 /// Sorts a render phase that uses bins.
--- a/crates/bevy_render/src/batching/no_gpu_preprocessing.rs
+++ b/crates/bevy_render/src/batching/no_gpu_preprocessing.rs
@ -108,9 +108,9 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(

        for key in &phase.batchable_mesh_keys {
            let mut batch_set: SmallVec<[BinnedRenderPhaseBatch; 1]> = smallvec![];
-            for &(entity, main_entity) in &phase.batchable_mesh_values[key] {
+            for &(entity, main_entity) in &phase.batchable_mesh_values[key].entities {
                let Some(buffer_data) =
-                    GFBD::get_binned_batch_data(&system_param_item, (entity, main_entity))
+                    GFBD::get_binned_batch_data(&system_param_item, main_entity)
                else {
                    continue;
                };
@ -156,8 +156,9 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
        // Prepare unbatchables.
        for key in &phase.unbatchable_mesh_keys {
            let unbatchables = phase.unbatchable_mesh_values.get_mut(key).unwrap();
-            for &entity in &unbatchables.entities {
-                let Some(buffer_data) = GFBD::get_binned_batch_data(&system_param_item, entity)
+            for &(_, main_entity) in &unbatchables.entities {
+                let Some(buffer_data) =
+                    GFBD::get_binned_batch_data(&system_param_item, main_entity)
                else {
                    continue;
                };
--- a/crates/bevy_render/src/render_phase/mod.rs
+++ b/crates/bevy_render/src/render_phase/mod.rs
@ -95,7 +95,7 @@ where
    ///
    /// Each bin corresponds to a single batch set. For unbatchable entities,
    /// prefer `unbatchable_values` instead.
-    pub batchable_mesh_values: HashMap<BPI::BinKey, Vec<(Entity, MainEntity)>>,
+    pub batchable_mesh_values: HashMap<BPI::BinKey, RenderBin>,

    /// A list of `BinKey`s for unbatchable items.
    ///
@ -130,6 +130,13 @@ where
    pub(crate) batch_sets: BinnedRenderPhaseBatchSets,
 }

+/// All entities that share a mesh and a material and can be batched as part of
+/// a [`BinnedRenderPhase`].
+pub struct RenderBin {
+    /// A list of the entities in this bin.
+    pub entities: Vec<(Entity, MainEntity)>,
+}
+
 /// How we store and render the batch sets.
 ///
 /// Each one of these corresponds to a [`GpuPreprocessingMode`].
@ -304,27 +311,33 @@ where
    pub fn add(
        &mut self,
        key: BPI::BinKey,
-        entity: (Entity, MainEntity),
+        (entity, main_entity): (Entity, MainEntity),
        phase_type: BinnedRenderPhaseType,
    ) {
        match phase_type {
            BinnedRenderPhaseType::BatchableMesh => {
                match self.batchable_mesh_values.entry(key.clone()) {
-                    Entry::Occupied(mut entry) => entry.get_mut().push(entity),
+                    Entry::Occupied(mut entry) => {
+                        entry.get_mut().entities.push((entity, main_entity));
+                    }
                    Entry::Vacant(entry) => {
                        self.batchable_mesh_keys.push(key);
-                        entry.insert(vec![entity]);
+                        entry.insert(RenderBin {
+                            entities: vec![(entity, main_entity)],
+                        });
                    }
                }
            }

            BinnedRenderPhaseType::UnbatchableMesh => {
                match self.unbatchable_mesh_values.entry(key.clone()) {
-                    Entry::Occupied(mut entry) => entry.get_mut().entities.push(entity),
+                    Entry::Occupied(mut entry) => {
+                        entry.get_mut().entities.push((entity, main_entity));
+                    }
                    Entry::Vacant(entry) => {
                        self.unbatchable_mesh_keys.push(key);
                        entry.insert(UnbatchableBinnedEntities {
-                            entities: vec![entity],
+                            entities: vec![(entity, main_entity)],
                            buffer_indices: default(),
                        });
                    }
@ -333,7 +346,7 @@ where

            BinnedRenderPhaseType::NonMesh => {
                // We don't process these items further.
-                self.non_mesh_items.push((key, entity));
+                self.non_mesh_items.push((key, (entity, main_entity)));
            }
        }
    }
--- a/crates/bevy_render/src/render_resource/buffer_vec.rs
+++ b/crates/bevy_render/src/render_resource/buffer_vec.rs
@ -103,6 +103,20 @@ impl<T: NoUninit> RawBufferVec<T> {
        self.values.append(&mut other.values);
    }

+    /// Overwrites the value at the given index in the CPU-side values.
+    ///
+    /// The index must be less than [`RawBufferVec::len`]; indexing past the end panics.
+    pub fn set(&mut self, index: u32, value: T) {
+        self.values[index as usize] = value;
+    }
+
+    /// Preallocates space for `count` additional elements in the internal CPU-side buffer.
+    ///
+    /// Unlike [`RawBufferVec::reserve`], this doesn't have any effect on the GPU buffer.
+    pub fn reserve_internal(&mut self, count: usize) {
+        self.values.reserve(count);
+    }
+
    /// Changes the debugging label of the buffer.
    ///
    /// The next time the buffer is updated (via [`reserve`](Self::reserve)), Bevy will inform
--- a/crates/bevy_sprite/src/mesh2d/mesh.rs
+++ b/crates/bevy_sprite/src/mesh2d/mesh.rs
@ -375,7 +375,7 @@ impl GetFullBatchData for Mesh2dPipeline {

    fn get_binned_batch_data(
        (mesh_instances, _, _): &SystemParamItem<Self::Param>,
-        (_entity, main_entity): (Entity, MainEntity),
+        main_entity: MainEntity,
    ) -> Option<Self::BufferData> {
        let mesh_instance = mesh_instances.get(&main_entity)?;
        Some((&mesh_instance.transforms).into())
@ -383,7 +383,7 @@ impl GetFullBatchData for Mesh2dPipeline {

    fn get_index_and_compare_data(
        _: &SystemParamItem<Self::Param>,
-        _query_item: (Entity, MainEntity),
+        _query_item: MainEntity,
    ) -> Option<(NonMaxU32, Option<Self::CompareData>)> {
        error!(
            "`get_index_and_compare_data` is only intended for GPU mesh uniform building, \
@ -394,7 +394,7 @@ impl GetFullBatchData for Mesh2dPipeline {

    fn get_binned_index(
        _: &SystemParamItem<Self::Param>,
-        _query_item: (Entity, MainEntity),
+        _query_item: MainEntity,
    ) -> Option<NonMaxU32> {
        error!(
            "`get_binned_index` is only intended for GPU mesh uniform building, \
@ -403,15 +403,23 @@ impl GetFullBatchData for Mesh2dPipeline {
        None
    }

-    fn get_batch_indirect_parameters_index(
+    fn write_batch_indirect_parameters(
        (mesh_instances, meshes, mesh_allocator): &SystemParamItem<Self::Param>,
        indirect_parameters_buffer: &mut bevy_render::batching::gpu_preprocessing::IndirectParametersBuffer,
-        (_entity, main_entity): (Entity, MainEntity),
-        instance_index: u32,
-    ) -> Option<NonMaxU32> {
-        let mesh_instance = mesh_instances.get(&main_entity)?;
-        let mesh = meshes.get(mesh_instance.mesh_asset_id)?;
-        let vertex_buffer_slice = mesh_allocator.mesh_vertex_slice(&mesh_instance.mesh_asset_id)?;
+        indirect_parameters_offset: u32,
+        main_entity: MainEntity,
+    ) {
+        let Some(mesh_instance) = mesh_instances.get(&main_entity) else {
+            return;
+        };
+        let Some(mesh) = meshes.get(mesh_instance.mesh_asset_id) else {
+            return;
+        };
+        let Some(vertex_buffer_slice) =
+            mesh_allocator.mesh_vertex_slice(&mesh_instance.mesh_asset_id)
+        else {
+            return;
+        };

        // Note that `IndirectParameters` covers both of these structures, even
        // though they actually have distinct layouts. See the comment above that
@ -420,28 +428,31 @@ impl GetFullBatchData for Mesh2dPipeline {
            RenderMeshBufferInfo::Indexed {
                count: index_count, ..
            } => {
-                let index_buffer_slice =
-                    mesh_allocator.mesh_index_slice(&mesh_instance.mesh_asset_id)?;
+                let Some(index_buffer_slice) =
+                    mesh_allocator.mesh_index_slice(&mesh_instance.mesh_asset_id)
+                else {
+                    return;
+                };
                IndirectParameters {
                    vertex_or_index_count: index_count,
                    instance_count: 0,
                    first_vertex_or_first_index: index_buffer_slice.range.start,
                    base_vertex_or_first_instance: vertex_buffer_slice.range.start,
-                    first_instance: instance_index,
+                    first_instance: 0,
                }
            }
            RenderMeshBufferInfo::NonIndexed => IndirectParameters {
                vertex_or_index_count: mesh.vertex_count,
                instance_count: 0,
                first_vertex_or_first_index: vertex_buffer_slice.range.start,
-                base_vertex_or_first_instance: instance_index,
-                first_instance: instance_index,
+                base_vertex_or_first_instance: 0,
+                // Use `0xffffffff` as a placeholder to tell the mesh
+                // preprocessing shader that this is a non-indexed mesh.
+                first_instance: !0,
            },
        };

-        (indirect_parameters_buffer.push(indirect_parameters) as u32)
-            .try_into()
-            .ok()
+        indirect_parameters_buffer.set(indirect_parameters_offset, indirect_parameters);
    }
 }