Build batches across phases in parallel. (#17764)

Currently, invocations of `batch_and_prepare_binned_render_phase` and `batch_and_prepare_sorted_render_phase` can't run in parallel because they write to scene-global GPU buffers. After PR #17698, `batch_and_prepare_binned_render_phase` started accounting for the lion's share of the CPU time, causing us to be strongly CPU-bound on scenes like Caldera when occlusion culling was on (because of the overhead of batching for the Z-prepass). Although I eventually plan to optimize `batch_and_prepare_binned_render_phase`, we can obtain significant wins now by parallelizing that system across phases. This commit splits all GPU buffers that `batch_and_prepare_binned_render_phase` and `batch_and_prepare_sorted_render_phase` touch into separate buffers for each phase so that the scheduler will run those phases in parallel. At the end of batch preparation, we gather the render phases up into a single resource with a new *collection* phase. Because we already run mesh preprocessing separately for each phase in order to make occlusion culling work, this is actually a cleaner separation. For example, mesh output indices (the unique IDs that identify each mesh instance on the GPU) are now guaranteed to be sequential starting from 0, which will simplify the forthcoming work to remove them in favor of the compute dispatch ID. On Caldera, this brings the frame time down to approximately 9.1 ms with occlusion culling on. ![Screenshot 2025-02-08 210720](https://github.com/user-attachments/assets/44bed500-e323-4786-b40c-828b75bc7d3f)
2025-02-12 16:02:20 -08:00 · 2025-02-12 16:02:20 -08:00 · 0ede857103
commit 0ede857103
parent 62c1812e72
15 changed files with 967 additions and 454 deletions
--- a/crates/bevy_pbr/src/decal/forward.rs
+++ b/crates/bevy_pbr/src/decal/forward.rs
@ -14,6 +14,7 @@ use bevy_render::{
        AsBindGroup, CompareFunction, RenderPipelineDescriptor, Shader,
        SpecializedMeshPipelineError,
    },
    RenderDebugFlags,
 };
 const FORWARD_DECAL_MESH_HANDLE: Handle<Mesh> =
@ -48,6 +49,7 @@ impl Plugin for ForwardDecalPlugin {
        app.add_plugins(MaterialPlugin::<ForwardDecalMaterial<StandardMaterial>> {
            prepass_enabled: false,
            shadows_enabled: false,
            debug_flags: RenderDebugFlags::default(),
            ..Default::default()
        });
    }
--- a/crates/bevy_pbr/src/lib.rs
+++ b/crates/bevy_pbr/src/lib.rs
@ -125,7 +125,7 @@ use bevy_render::{
    sync_component::SyncComponentPlugin,
    texture::GpuImage,
    view::VisibilitySystems,
-    ExtractSchedule, Render, RenderApp, RenderSet,
+    ExtractSchedule, Render, RenderApp, RenderDebugFlags, RenderSet,
 };
 use bevy_transform::TransformSystem;
@ -182,6 +182,8 @@ pub struct PbrPlugin {
    /// This requires compute shader support and so will be forcibly disabled if
    /// the platform doesn't support those.
    pub use_gpu_instance_buffer_builder: bool,
    /// Debugging flags that can optionally be set when constructing the renderer.
    pub debug_flags: RenderDebugFlags,
 }
 impl Default for PbrPlugin {
@ -190,6 +192,7 @@ impl Default for PbrPlugin {
            prepass_enabled: true,
            add_default_deferred_lighting_plugin: true,
            use_gpu_instance_buffer_builder: true,
            debug_flags: RenderDebugFlags::default(),
        }
    }
 }
@ -333,9 +336,11 @@ impl Plugin for PbrPlugin {
            .add_plugins((
                MeshRenderPlugin {
                    use_gpu_instance_buffer_builder: self.use_gpu_instance_buffer_builder,
                    debug_flags: self.debug_flags,
                },
                MaterialPlugin::<StandardMaterial> {
                    prepass_enabled: self.prepass_enabled,
                    debug_flags: self.debug_flags,
                    ..Default::default()
                },
                ScreenSpaceAmbientOcclusionPlugin,
--- a/crates/bevy_pbr/src/material.rs
+++ b/crates/bevy_pbr/src/material.rs
@ -252,6 +252,8 @@ pub struct MaterialPlugin<M: Material> {
    pub prepass_enabled: bool,
    /// Controls if shadows are enabled for the Material.
    pub shadows_enabled: bool,
    /// Debugging flags that can optionally be set when constructing the renderer.
    pub debug_flags: RenderDebugFlags,
    pub _marker: PhantomData<M>,
 }
@ -260,6 +262,7 @@ impl<M: Material> Default for MaterialPlugin<M> {
        Self {
            prepass_enabled: true,
            shadows_enabled: true,
            debug_flags: RenderDebugFlags::default(),
            _marker: Default::default(),
        }
    }
@ -374,7 +377,7 @@ where
        }
        if self.prepass_enabled {
-            app.add_plugins(PrepassPlugin::<M>::default());
+            app.add_plugins(PrepassPlugin::<M>::new(self.debug_flags));
        }
    }
--- a/crates/bevy_pbr/src/prepass/mod.rs
+++ b/crates/bevy_pbr/src/prepass/mod.rs
@ -19,7 +19,7 @@ use bevy_render::{
    renderer::RenderAdapter,
    sync_world::RenderEntity,
    view::{RenderVisibilityRanges, VISIBILITY_RANGES_STORAGE_BUFFER_COUNT},
-    ExtractSchedule, Render, RenderApp, RenderSet,
+    ExtractSchedule, Render, RenderApp, RenderDebugFlags, RenderSet,
 };
 pub use prepass_bindings::*;
@ -146,11 +146,19 @@ where
 /// Sets up the prepasses for a [`Material`].
 ///
 /// This depends on the [`PrepassPipelinePlugin`].
-pub struct PrepassPlugin<M: Material>(PhantomData<M>);
+pub struct PrepassPlugin<M: Material> {
    /// Debugging flags that can optionally be set when constructing the renderer.
    pub debug_flags: RenderDebugFlags,
    pub phantom: PhantomData<M>,
 }
-impl<M: Material> Default for PrepassPlugin<M> {
+impl<M: Material> PrepassPlugin<M> {
-    fn default() -> Self {
+    /// Creates a new [`PrepassPlugin`] with the given debug flags.
-        Self(Default::default())
+    pub fn new(debug_flags: RenderDebugFlags) -> Self {
        PrepassPlugin {
            debug_flags,
            phantom: PhantomData,
        }
    }
 }
@ -176,8 +184,10 @@ where
                    ),
                )
                .add_plugins((
-                    BinnedRenderPhasePlugin::<Opaque3dPrepass, MeshPipeline>::default(),
+                    BinnedRenderPhasePlugin::<Opaque3dPrepass, MeshPipeline>::new(self.debug_flags),
-                    BinnedRenderPhasePlugin::<AlphaMask3dPrepass, MeshPipeline>::default(),
+                    BinnedRenderPhasePlugin::<AlphaMask3dPrepass, MeshPipeline>::new(
                        self.debug_flags,
                    ),
                ));
        }
--- a/crates/bevy_pbr/src/render/gpu_preprocess.rs
+++ b/crates/bevy_pbr/src/render/gpu_preprocess.rs
@ -29,12 +29,14 @@ use bevy_ecs::{
    system::{lifetimeless::Read, Commands, Query, Res, ResMut},
    world::{FromWorld, World},
 };
 use bevy_render::batching::gpu_preprocessing::UntypedPhaseIndirectParametersBuffers;
 use bevy_render::{
    batching::gpu_preprocessing::{
        BatchedInstanceBuffers, GpuOcclusionCullingWorkItemBuffers, GpuPreprocessingSupport,
        IndirectBatchSet, IndirectParametersBuffers, IndirectParametersIndexed,
        IndirectParametersMetadata, IndirectParametersNonIndexed,
        LatePreprocessWorkItemIndirectParameters, PreprocessWorkItem, PreprocessWorkItemBuffers,
        UntypedPhaseBatchedInstanceBuffers,
    },
    experimental::occlusion_culling::OcclusionCulling,
    render_graph::{Node, NodeRunError, RenderGraphApp, RenderGraphContext},
@ -393,8 +395,22 @@ pub enum PhasePreprocessBindGroups {
 /// The bind groups for the compute shaders that reset indirect draw counts and
 /// build indirect parameters.
-#[derive(Resource)]
+///
-pub struct BuildIndirectParametersBindGroups {
+/// There's one set of bind groups for each phase. Phases are keyed off their
 /// [`core::any::TypeId`].
 #[derive(Resource, Default, Deref, DerefMut)]
 pub struct BuildIndirectParametersBindGroups(pub TypeIdMap<PhaseBuildIndirectParametersBindGroups>);
 impl BuildIndirectParametersBindGroups {
    /// Creates a new, empty [`BuildIndirectParametersBindGroups`] table.
    pub fn new() -> BuildIndirectParametersBindGroups {
        Self::default()
    }
 }
 /// The per-phase set of bind groups for the compute shaders that reset indirect
 /// draw counts and build indirect parameters.
 pub struct PhaseBuildIndirectParametersBindGroups {
    /// The bind group for the `reset_indirect_batch_sets.wgsl` shader, for
    /// indexed meshes.
    reset_indexed_indirect_batch_sets: Option<BindGroup>,
@ -470,9 +486,10 @@ impl Plugin for GpuMeshPreprocessPlugin {
                (
                    prepare_preprocess_pipelines.in_set(RenderSet::Prepare),
                    prepare_preprocess_bind_groups
-                        .run_if(
+                        .run_if(resource_exists::<BatchedInstanceBuffers<
-                            resource_exists::<BatchedInstanceBuffers<MeshUniform, MeshInputUniform>>,
+                            MeshUniform,
-                        )
+                            MeshInputUniform
                        >>)
                        .in_set(RenderSet::PrepareBindGroups),
                    write_mesh_culling_data_buffer.in_set(RenderSet::PrepareResourcesFlush),
                ),
@ -511,7 +528,7 @@ impl Plugin for GpuMeshPreprocessPlugin {
            .add_render_graph_edge(
                Core3d,
                NodePbr::MainBuildIndirectParameters,
-                Node3d::DeferredPrepass
+                Node3d::DeferredPrepass,
            );
    }
 }
@ -538,10 +555,8 @@ impl Node for EarlyGpuPreprocessNode {
        world: &'w World,
    ) -> Result<(), NodeRunError> {
        // Grab the [`BatchedInstanceBuffers`].
-        let BatchedInstanceBuffers {
+        let batched_instance_buffers =
-            work_item_buffers: ref index_buffers,
+            world.resource::<BatchedInstanceBuffers<MeshUniform, MeshInputUniform>>();
            ..
        } = world.resource::<BatchedInstanceBuffers<MeshUniform, MeshInputUniform>>();
        let pipeline_cache = world.resource::<PipelineCache>();
        let preprocess_pipelines = world.resource::<PreprocessPipelines>();
@ -583,13 +598,6 @@ impl Node for EarlyGpuPreprocessNode {
                continue;
            };
            // Grab the work item buffers for this view.
            let Some(phase_work_item_buffers) = index_buffers.get(&view.retained_view_entity)
            else {
                warn!("The preprocessing index buffer wasn't present");
                continue;
            };
            // Select the right pipeline, depending on whether GPU culling is in
            // use.
            let maybe_pipeline_id = if no_indirect_drawing {
@ -620,7 +628,17 @@ impl Node for EarlyGpuPreprocessNode {
            compute_pass.set_pipeline(preprocess_pipeline);
            // Loop over each render phase.
-            for (phase_type_id, work_item_buffers) in phase_work_item_buffers {
+            for (phase_type_id, batched_phase_instance_buffers) in
                &batched_instance_buffers.phase_instance_buffers
            {
                // Grab the work item buffers for this view.
                let Some(work_item_buffers) = batched_phase_instance_buffers
                    .work_item_buffers
                    .get(&view.retained_view_entity)
                else {
                    continue;
                };
                // Fetch the bind group for the render phase.
                let Some(phase_bind_groups) = bind_groups.get(phase_type_id) else {
                    continue;
@ -775,12 +793,8 @@ impl Node for LateGpuPreprocessNode {
        world: &'w World,
    ) -> Result<(), NodeRunError> {
        // Grab the [`BatchedInstanceBuffers`].
-        let BatchedInstanceBuffers {
+        let batched_instance_buffers =
-            ref work_item_buffers,
+            world.resource::<BatchedInstanceBuffers<MeshUniform, MeshInputUniform>>();
            ref late_indexed_indirect_parameters_buffer,
            ref late_non_indexed_indirect_parameters_buffer,
            ..
        } = world.resource::<BatchedInstanceBuffers<MeshUniform, MeshInputUniform>>();
        let pipeline_cache = world.resource::<PipelineCache>();
        let preprocess_pipelines = world.resource::<PreprocessPipelines>();
@ -795,13 +809,6 @@ impl Node for LateGpuPreprocessNode {
        // Run the compute passes.
        for (view, bind_groups, view_uniform_offset) in self.view_query.iter_manual(world) {
            // Grab the work item buffers for this view.
            let Some(phase_work_item_buffers) = work_item_buffers.get(&view.retained_view_entity)
            else {
                warn!("The preprocessing index buffer wasn't present");
                continue;
            };
            let maybe_pipeline_id = preprocess_pipelines
                .late_gpu_occlusion_culling_preprocess
                .pipeline_id;
@ -821,7 +828,25 @@ impl Node for LateGpuPreprocessNode {
            compute_pass.set_pipeline(preprocess_pipeline);
-            for (phase_type_id, work_item_buffers) in phase_work_item_buffers {
+            // Loop over each phase. Because we built the phases in parallel,
            // each phase has a separate set of instance buffers.
            for (phase_type_id, batched_phase_instance_buffers) in
                &batched_instance_buffers.phase_instance_buffers
            {
                let UntypedPhaseBatchedInstanceBuffers {
                    ref work_item_buffers,
                    ref late_indexed_indirect_parameters_buffer,
                    ref late_non_indexed_indirect_parameters_buffer,
                    ..
                } = *batched_phase_instance_buffers;
                // Grab the work item buffers for this view.
                let Some(phase_work_item_buffers) =
                    work_item_buffers.get(&view.retained_view_entity)
                else {
                    continue;
                };
                let (
                    PreprocessWorkItemBuffers::Indirect {
                        gpu_occlusion_culling:
@ -840,7 +865,7 @@ impl Node for LateGpuPreprocessNode {
                    Some(late_indexed_indirect_parameters_buffer),
                    Some(late_non_indexed_indirect_parameters_buffer),
                ) = (
-                    work_item_buffers,
+                    phase_work_item_buffers,
                    bind_groups.get(phase_type_id),
                    late_indexed_indirect_parameters_buffer.buffer(),
                    late_non_indexed_indirect_parameters_buffer.buffer(),
@ -1029,57 +1054,69 @@ fn run_build_indirect_parameters_node(
        return Ok(());
    };
-    // Build indexed indirect parameters.
+    // Loop over each phase. As each has a separate set of buffers, we need to
-    if let (
+    // build indirect parameters individually for each phase.
-        Some(reset_indexed_indirect_batch_sets_bind_group),
+    for (phase_type_id, phase_build_indirect_params_bind_groups) in
-        Some(build_indirect_indexed_params_bind_group),
+        build_indirect_params_bind_groups.iter()
-    ) = (
+    {
-        &build_indirect_params_bind_groups.reset_indexed_indirect_batch_sets,
+        let Some(phase_indirect_parameters_buffers) =
-        &build_indirect_params_bind_groups.build_indexed_indirect,
+            indirect_parameters_buffers.get(phase_type_id)
-    ) {
+        else {
-        compute_pass.set_pipeline(reset_indirect_batch_sets_pipeline);
+            continue;
-        compute_pass.set_bind_group(0, reset_indexed_indirect_batch_sets_bind_group, &[]);
+        };
-        let workgroup_count = indirect_parameters_buffers
+
-            .batch_set_count(true)
+        // Build indexed indirect parameters.
-            .div_ceil(WORKGROUP_SIZE);
+        if let (
-        if workgroup_count > 0 {
+            Some(reset_indexed_indirect_batch_sets_bind_group),
-            compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1);
+            Some(build_indirect_indexed_params_bind_group),
        ) = (
            &phase_build_indirect_params_bind_groups.reset_indexed_indirect_batch_sets,
            &phase_build_indirect_params_bind_groups.build_indexed_indirect,
        ) {
            compute_pass.set_pipeline(reset_indirect_batch_sets_pipeline);
            compute_pass.set_bind_group(0, reset_indexed_indirect_batch_sets_bind_group, &[]);
            let workgroup_count = phase_indirect_parameters_buffers
                .batch_set_count(true)
                .div_ceil(WORKGROUP_SIZE);
            if workgroup_count > 0 {
                compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1);
            }
            compute_pass.set_pipeline(build_indexed_indirect_params_pipeline);
            compute_pass.set_bind_group(0, build_indirect_indexed_params_bind_group, &[]);
            let workgroup_count = phase_indirect_parameters_buffers
                .indexed_batch_count()
                .div_ceil(WORKGROUP_SIZE);
            if workgroup_count > 0 {
                compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1);
            }
        }
-        compute_pass.set_pipeline(build_indexed_indirect_params_pipeline);
+        // Build non-indexed indirect parameters.
-        compute_pass.set_bind_group(0, build_indirect_indexed_params_bind_group, &[]);
+        if let (
-        let workgroup_count = indirect_parameters_buffers
+            Some(reset_non_indexed_indirect_batch_sets_bind_group),
-            .indexed_batch_count()
+            Some(build_indirect_non_indexed_params_bind_group),
-            .div_ceil(WORKGROUP_SIZE);
+        ) = (
-        if workgroup_count > 0 {
+            &phase_build_indirect_params_bind_groups.reset_non_indexed_indirect_batch_sets,
-            compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1);
+            &phase_build_indirect_params_bind_groups.build_non_indexed_indirect,
-        }
+        ) {
-    }
+            compute_pass.set_pipeline(reset_indirect_batch_sets_pipeline);
            compute_pass.set_bind_group(0, reset_non_indexed_indirect_batch_sets_bind_group, &[]);
            let workgroup_count = phase_indirect_parameters_buffers
                .batch_set_count(false)
                .div_ceil(WORKGROUP_SIZE);
            if workgroup_count > 0 {
                compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1);
            }
-    // Build non-indexed indirect parameters.
+            compute_pass.set_pipeline(build_non_indexed_indirect_params_pipeline);
-    if let (
+            compute_pass.set_bind_group(0, build_indirect_non_indexed_params_bind_group, &[]);
-        Some(reset_non_indexed_indirect_batch_sets_bind_group),
+            let workgroup_count = phase_indirect_parameters_buffers
-        Some(build_indirect_non_indexed_params_bind_group),
+                .non_indexed_batch_count()
-    ) = (
+                .div_ceil(WORKGROUP_SIZE);
-        &build_indirect_params_bind_groups.reset_non_indexed_indirect_batch_sets,
+            if workgroup_count > 0 {
-        &build_indirect_params_bind_groups.build_non_indexed_indirect,
+                compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1);
-    ) {
+            }
        compute_pass.set_pipeline(reset_indirect_batch_sets_pipeline);
        compute_pass.set_bind_group(0, reset_non_indexed_indirect_batch_sets_bind_group, &[]);
        let workgroup_count = indirect_parameters_buffers
            .batch_set_count(false)
            .div_ceil(WORKGROUP_SIZE);
        if workgroup_count > 0 {
            compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1);
        }
        compute_pass.set_pipeline(build_non_indexed_indirect_params_pipeline);
        compute_pass.set_bind_group(0, build_indirect_non_indexed_params_bind_group, &[]);
        let workgroup_count = indirect_parameters_buffers
            .non_indexed_batch_count()
            .div_ceil(WORKGROUP_SIZE);
        if workgroup_count > 0 {
            compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1);
        }
    }
@ -1637,18 +1674,14 @@ pub fn prepare_preprocess_bind_groups(
 ) {
    // Grab the `BatchedInstanceBuffers`.
    let BatchedInstanceBuffers {
        data_buffer: ref data_buffer_vec,
        ref work_item_buffers,
        current_input_buffer: ref current_input_buffer_vec,
        previous_input_buffer: ref previous_input_buffer_vec,
-        ref late_indexed_indirect_parameters_buffer,
+        ref phase_instance_buffers,
        ref late_non_indexed_indirect_parameters_buffer,
    } = batched_instance_buffers.into_inner();
-    let (Some(current_input_buffer), Some(previous_input_buffer), Some(data_buffer)) = (
+    let (Some(current_input_buffer), Some(previous_input_buffer)) = (
        current_input_buffer_vec.buffer().buffer(),
        previous_input_buffer_vec.buffer().buffer(),
        data_buffer_vec.buffer(),
    ) else {
        return;
    };
@ -1659,22 +1692,39 @@ pub fn prepare_preprocess_bind_groups(
    // Loop over each view.
    for (view_entity, view) in &views {
        let Some(phase_work_item_buffers) = work_item_buffers.get(&view.retained_view_entity)
        else {
            continue;
        };
        let mut bind_groups = TypeIdMap::default();
        // Loop over each phase.
-        for (&phase_id, work_item_buffers) in phase_work_item_buffers {
+        for (phase_type_id, phase_instance_buffers) in phase_instance_buffers {
            let UntypedPhaseBatchedInstanceBuffers {
                data_buffer: ref data_buffer_vec,
                ref work_item_buffers,
                ref late_indexed_indirect_parameters_buffer,
                ref late_non_indexed_indirect_parameters_buffer,
            } = *phase_instance_buffers;
            let Some(data_buffer) = data_buffer_vec.buffer() else {
                continue;
            };
            // Grab the indirect parameters buffers for this phase.
            let Some(phase_indirect_parameters_buffers) =
                indirect_parameters_buffers.get(phase_type_id)
            else {
                continue;
            };
            let Some(work_item_buffers) = work_item_buffers.get(&view.retained_view_entity) else {
                continue;
            };
            // Create the `PreprocessBindGroupBuilder`.
            let preprocess_bind_group_builder = PreprocessBindGroupBuilder {
                view: view_entity,
                late_indexed_indirect_parameters_buffer,
                late_non_indexed_indirect_parameters_buffer,
                render_device: &render_device,
-                indirect_parameters_buffers: &indirect_parameters_buffers,
+                phase_indirect_parameters_buffers,
                mesh_culling_data_buffer: &mesh_culling_data_buffer,
                view_uniforms: &view_uniforms,
                previous_view_uniforms: &previous_view_uniforms,
@ -1725,7 +1775,7 @@ pub fn prepare_preprocess_bind_groups(
            // Write that bind group in.
            if let Some(bind_group) = bind_group {
                any_indirect = any_indirect || was_indirect;
-                bind_groups.insert(phase_id, bind_group);
+                bind_groups.insert(*phase_type_id, bind_group);
            }
        }
@ -1764,7 +1814,7 @@ struct PreprocessBindGroupBuilder<'a> {
    /// The device.
    render_device: &'a RenderDevice,
    /// The buffers that store indirect draw parameters.
-    indirect_parameters_buffers: &'a IndirectParametersBuffers,
+    phase_indirect_parameters_buffers: &'a UntypedPhaseIndirectParametersBuffers,
    /// The GPU buffer that stores the information needed to cull each mesh.
    mesh_culling_data_buffer: &'a MeshCullingDataBuffer,
    /// The GPU buffer that stores information about the view.
@ -1884,7 +1934,8 @@ impl<'a> PreprocessBindGroupBuilder<'a> {
        let previous_view_buffer = self.previous_view_uniforms.uniforms.buffer()?;
        match (
-            self.indirect_parameters_buffers.indexed_metadata_buffer(),
+            self.phase_indirect_parameters_buffers
                .indexed_metadata_buffer(),
            indexed_work_item_buffer.buffer(),
            late_indexed_work_item_buffer.buffer(),
            self.late_indexed_indirect_parameters_buffer.buffer(),
@ -1975,7 +2026,7 @@ impl<'a> PreprocessBindGroupBuilder<'a> {
        let previous_view_buffer = self.previous_view_uniforms.uniforms.buffer()?;
        match (
-            self.indirect_parameters_buffers
+            self.phase_indirect_parameters_buffers
                .non_indexed_metadata_buffer(),
            non_indexed_work_item_buffer.buffer(),
            late_non_indexed_work_item_buffer.buffer(),
@ -2066,7 +2117,8 @@ impl<'a> PreprocessBindGroupBuilder<'a> {
        let previous_view_buffer = self.previous_view_uniforms.uniforms.buffer()?;
        match (
-            self.indirect_parameters_buffers.indexed_metadata_buffer(),
+            self.phase_indirect_parameters_buffers
                .indexed_metadata_buffer(),
            late_indexed_work_item_buffer.buffer(),
            self.late_indexed_indirect_parameters_buffer.buffer(),
        ) {
@ -2146,7 +2198,7 @@ impl<'a> PreprocessBindGroupBuilder<'a> {
        let previous_view_buffer = self.previous_view_uniforms.uniforms.buffer()?;
        match (
-            self.indirect_parameters_buffers
+            self.phase_indirect_parameters_buffers
                .non_indexed_metadata_buffer(),
            late_non_indexed_work_item_buffer.buffer(),
            self.late_non_indexed_indirect_parameters_buffer.buffer(),
@ -2240,7 +2292,8 @@ impl<'a> PreprocessBindGroupBuilder<'a> {
        let view_uniforms_binding = self.view_uniforms.uniforms.binding()?;
        match (
-            self.indirect_parameters_buffers.indexed_metadata_buffer(),
+            self.phase_indirect_parameters_buffers
                .indexed_metadata_buffer(),
            indexed_work_item_buffer.buffer(),
        ) {
            (Some(indexed_metadata_buffer), Some(indexed_work_item_gpu_buffer)) => {
@ -2293,7 +2346,7 @@ impl<'a> PreprocessBindGroupBuilder<'a> {
        let view_uniforms_binding = self.view_uniforms.uniforms.binding()?;
        match (
-            self.indirect_parameters_buffers
+            self.phase_indirect_parameters_buffers
                .non_indexed_metadata_buffer(),
            non_indexed_work_item_buffer.buffer(),
        ) {
@ -2346,121 +2399,134 @@ fn create_build_indirect_parameters_bind_groups(
    render_device: &RenderDevice,
    pipelines: &PreprocessPipelines,
    current_input_buffer: &Buffer,
-    indirect_parameters_buffer: &IndirectParametersBuffers,
+    indirect_parameters_buffers: &IndirectParametersBuffers,
 ) {
-    commands.insert_resource(BuildIndirectParametersBindGroups {
+    let mut build_indirect_parameters_bind_groups = BuildIndirectParametersBindGroups::new();
        reset_indexed_indirect_batch_sets: match (
            indirect_parameters_buffer.indexed_batch_sets_buffer(),
        ) {
            (Some(indexed_batch_sets_buffer),) => Some(
                render_device.create_bind_group(
                    "reset_indexed_indirect_batch_sets_bind_group",
                    // The early bind group is good for the main phase and late
                    // phase too. They bind the same buffers.
                    &pipelines
                        .early_phase
                        .reset_indirect_batch_sets
                        .bind_group_layout,
                    &BindGroupEntries::sequential((indexed_batch_sets_buffer.as_entire_binding(),)),
                ),
            ),
            _ => None,
        },
-        reset_non_indexed_indirect_batch_sets: match (
+    for (phase_type_id, phase_indirect_parameters_buffer) in indirect_parameters_buffers.iter() {
-            indirect_parameters_buffer.non_indexed_batch_sets_buffer(),
+        build_indirect_parameters_bind_groups.insert(
-        ) {
+            *phase_type_id,
-            (Some(non_indexed_batch_sets_buffer),) => Some(
+            PhaseBuildIndirectParametersBindGroups {
-                render_device.create_bind_group(
+                reset_indexed_indirect_batch_sets: match (
-                    "reset_non_indexed_indirect_batch_sets_bind_group",
+                    phase_indirect_parameters_buffer.indexed_batch_sets_buffer(),
-                    // The early bind group is good for the main phase and late
+                ) {
-                    // phase too. They bind the same buffers.
+                    (Some(indexed_batch_sets_buffer),) => Some(
-                    &pipelines
+                        render_device.create_bind_group(
-                        .early_phase
+                            "reset_indexed_indirect_batch_sets_bind_group",
-                        .reset_indirect_batch_sets
+                            // The early bind group is good for the main phase and late
-                        .bind_group_layout,
+                            // phase too. They bind the same buffers.
-                    &BindGroupEntries::sequential((
+                            &pipelines
-                        non_indexed_batch_sets_buffer.as_entire_binding(),
+                                .early_phase
-                    )),
+                                .reset_indirect_batch_sets
-                ),
+                                .bind_group_layout,
-            ),
+                            &BindGroupEntries::sequential((
-            _ => None,
+                                indexed_batch_sets_buffer.as_entire_binding(),
-        },
+                            )),
                        ),
                    ),
                    _ => None,
                },
-        build_indexed_indirect: match (
+                reset_non_indexed_indirect_batch_sets: match (
-            indirect_parameters_buffer.indexed_metadata_buffer(),
+                    phase_indirect_parameters_buffer.non_indexed_batch_sets_buffer(),
-            indirect_parameters_buffer.indexed_data_buffer(),
+                ) {
-            indirect_parameters_buffer.indexed_batch_sets_buffer(),
+                    (Some(non_indexed_batch_sets_buffer),) => Some(
-        ) {
+                        render_device.create_bind_group(
-            (
+                            "reset_non_indexed_indirect_batch_sets_bind_group",
-                Some(indexed_indirect_parameters_metadata_buffer),
+                            // The early bind group is good for the main phase and late
-                Some(indexed_indirect_parameters_data_buffer),
+                            // phase too. They bind the same buffers.
-                Some(indexed_batch_sets_buffer),
+                            &pipelines
-            ) => Some(
+                                .early_phase
-                render_device.create_bind_group(
+                                .reset_indirect_batch_sets
-                    "build_indexed_indirect_parameters_bind_group",
+                                .bind_group_layout,
-                    // The frustum culling bind group is good for occlusion culling
+                            &BindGroupEntries::sequential((
-                    // too. They bind the same buffers.
+                                non_indexed_batch_sets_buffer.as_entire_binding(),
-                    &pipelines
+                            )),
-                        .gpu_frustum_culling_build_indexed_indirect_params
+                        ),
-                        .bind_group_layout,
+                    ),
-                    &BindGroupEntries::sequential((
+                    _ => None,
-                        current_input_buffer.as_entire_binding(),
+                },
                        // Don't use `as_entire_binding` here; the shader reads
                        // the length and `RawBufferVec` overallocates.
                        BufferBinding {
                            buffer: indexed_indirect_parameters_metadata_buffer,
                            offset: 0,
                            size: NonZeroU64::new(
                                indirect_parameters_buffer.indexed_batch_count() as u64
                                    * size_of::<IndirectParametersMetadata>() as u64,
                            ),
                        },
                        indexed_batch_sets_buffer.as_entire_binding(),
                        indexed_indirect_parameters_data_buffer.as_entire_binding(),
                    )),
                ),
            ),
            _ => None,
        },
-        build_non_indexed_indirect: match (
+                build_indexed_indirect: match (
-            indirect_parameters_buffer.non_indexed_metadata_buffer(),
+                    phase_indirect_parameters_buffer.indexed_metadata_buffer(),
-            indirect_parameters_buffer.non_indexed_data_buffer(),
+                    phase_indirect_parameters_buffer.indexed_data_buffer(),
-            indirect_parameters_buffer.non_indexed_batch_sets_buffer(),
+                    phase_indirect_parameters_buffer.indexed_batch_sets_buffer(),
-        ) {
+                ) {
-            (
+                    (
-                Some(non_indexed_indirect_parameters_metadata_buffer),
+                        Some(indexed_indirect_parameters_metadata_buffer),
-                Some(non_indexed_indirect_parameters_data_buffer),
+                        Some(indexed_indirect_parameters_data_buffer),
-                Some(non_indexed_batch_sets_buffer),
+                        Some(indexed_batch_sets_buffer),
-            ) => Some(
+                    ) => Some(
-                render_device.create_bind_group(
+                        render_device.create_bind_group(
-                    "build_non_indexed_indirect_parameters_bind_group",
+                            "build_indexed_indirect_parameters_bind_group",
-                    // The frustum culling bind group is good for occlusion culling
+                            // The frustum culling bind group is good for occlusion culling
-                    // too. They bind the same buffers.
+                            // too. They bind the same buffers.
-                    &pipelines
+                            &pipelines
-                        .gpu_frustum_culling_build_non_indexed_indirect_params
+                                .gpu_frustum_culling_build_indexed_indirect_params
-                        .bind_group_layout,
+                                .bind_group_layout,
-                    &BindGroupEntries::sequential((
+                            &BindGroupEntries::sequential((
-                        current_input_buffer.as_entire_binding(),
+                                current_input_buffer.as_entire_binding(),
-                        // Don't use `as_entire_binding` here; the shader reads
+                                // Don't use `as_entire_binding` here; the shader reads
-                        // the length and `RawBufferVec` overallocates.
+                                // the length and `RawBufferVec` overallocates.
-                        BufferBinding {
+                                BufferBinding {
-                            buffer: non_indexed_indirect_parameters_metadata_buffer,
+                                    buffer: indexed_indirect_parameters_metadata_buffer,
-                            offset: 0,
+                                    offset: 0,
-                            size: NonZeroU64::new(
+                                    size: NonZeroU64::new(
-                                indirect_parameters_buffer.non_indexed_batch_count() as u64
+                                        phase_indirect_parameters_buffer.indexed_batch_count()
-                                    * size_of::<IndirectParametersMetadata>() as u64,
+                                            as u64
-                            ),
+                                            * size_of::<IndirectParametersMetadata>() as u64,
-                        },
+                                    ),
-                        non_indexed_batch_sets_buffer.as_entire_binding(),
+                                },
-                        non_indexed_indirect_parameters_data_buffer.as_entire_binding(),
+                                indexed_batch_sets_buffer.as_entire_binding(),
-                    )),
+                                indexed_indirect_parameters_data_buffer.as_entire_binding(),
-                ),
+                            )),
-            ),
+                        ),
-            _ => None,
+                    ),
-        },
+                    _ => None,
-    });
+                },
                build_non_indexed_indirect: match (
                    phase_indirect_parameters_buffer.non_indexed_metadata_buffer(),
                    phase_indirect_parameters_buffer.non_indexed_data_buffer(),
                    phase_indirect_parameters_buffer.non_indexed_batch_sets_buffer(),
                ) {
                    (
                        Some(non_indexed_indirect_parameters_metadata_buffer),
                        Some(non_indexed_indirect_parameters_data_buffer),
                        Some(non_indexed_batch_sets_buffer),
                    ) => Some(
                        render_device.create_bind_group(
                            "build_non_indexed_indirect_parameters_bind_group",
                            // The frustum culling bind group is good for occlusion culling
                            // too. They bind the same buffers.
                            &pipelines
                                .gpu_frustum_culling_build_non_indexed_indirect_params
                                .bind_group_layout,
                            &BindGroupEntries::sequential((
                                current_input_buffer.as_entire_binding(),
                                // Don't use `as_entire_binding` here; the shader reads
                                // the length and `RawBufferVec` overallocates.
                                BufferBinding {
                                    buffer: non_indexed_indirect_parameters_metadata_buffer,
                                    offset: 0,
                                    size: NonZeroU64::new(
                                        phase_indirect_parameters_buffer.non_indexed_batch_count()
                                            as u64
                                            * size_of::<IndirectParametersMetadata>() as u64,
                                    ),
                                },
                                non_indexed_batch_sets_buffer.as_entire_binding(),
                                non_indexed_indirect_parameters_data_buffer.as_entire_binding(),
                            )),
                        ),
                    ),
                    _ => None,
                },
            },
        );
    }
    commands.insert_resource(build_indirect_parameters_bind_groups);
 }
 /// Writes the information needed to do GPU mesh culling to the GPU.
--- a/crates/bevy_pbr/src/render/mesh.rs
+++ b/crates/bevy_pbr/src/render/mesh.rs
@ -21,7 +21,7 @@ use bevy_render::{
        gpu_preprocessing::{
            self, GpuPreprocessingSupport, IndirectBatchSet, IndirectParametersBuffers,
            IndirectParametersIndexed, IndirectParametersMetadata, IndirectParametersNonIndexed,
-            InstanceInputUniformBuffer,
+            InstanceInputUniformBuffer, UntypedPhaseIndirectParametersBuffers,
        },
        no_gpu_preprocessing, GetBatchData, GetFullBatchData, NoAutomaticBatching,
    },
@ -43,7 +43,8 @@ use bevy_render::{
    Extract,
 };
 use bevy_transform::components::GlobalTransform;
-use bevy_utils::{default, Parallel};
+use bevy_utils::{default, Parallel, TypeIdMap};
 use core::any::TypeId;
 use core::mem::size_of;
 use material_bind_groups::MaterialBindingId;
 use render::skin::{self, SkinIndex};
@ -79,13 +80,24 @@ use smallvec::{smallvec, SmallVec};
 use static_assertions::const_assert_eq;
 /// Provides support for rendering 3D meshes.
 ///
 /// Use [`MeshRenderPlugin::new`] to build one from a set of
 /// [`RenderDebugFlags`]; the derived `Default` leaves every field at its
 /// default value.
 #[derive(Default)]
 pub struct MeshRenderPlugin {
    /// Whether we're building [`MeshUniform`]s on GPU.
    ///
    /// This requires compute shader support and so will be forcibly disabled if
    /// the platform doesn't support those.
    pub use_gpu_instance_buffer_builder: bool,
    /// Debugging flags that can optionally be set when constructing the renderer.
    ///
    /// These are passed to each binned and sorted render phase plugin that
    /// this plugin adds.
    pub debug_flags: RenderDebugFlags,
 }
 impl MeshRenderPlugin {
    /// Creates a new [`MeshRenderPlugin`] with the given debug flags.
    pub fn new(debug_flags: RenderDebugFlags) -> MeshRenderPlugin {
        MeshRenderPlugin {
            use_gpu_instance_buffer_builder: false,
            debug_flags,
        }
    }
 }
 pub const FORWARD_IO_HANDLE: Handle<Shader> = weak_handle!("38111de1-6e35-4dbb-877b-7b6f9334baf6");
@ -166,18 +178,17 @@ impl Plugin for MeshRenderPlugin {
            (no_automatic_skin_batching, no_automatic_morph_batching),
        )
        .add_plugins((
-            BinnedRenderPhasePlugin::<Opaque3d, MeshPipeline>::default(),
+            BinnedRenderPhasePlugin::<Opaque3d, MeshPipeline>::new(self.debug_flags),
-            BinnedRenderPhasePlugin::<AlphaMask3d, MeshPipeline>::default(),
+            BinnedRenderPhasePlugin::<AlphaMask3d, MeshPipeline>::new(self.debug_flags),
-            BinnedRenderPhasePlugin::<Shadow, MeshPipeline>::default(),
+            BinnedRenderPhasePlugin::<Shadow, MeshPipeline>::new(self.debug_flags),
-            BinnedRenderPhasePlugin::<Opaque3dDeferred, MeshPipeline>::default(),
+            BinnedRenderPhasePlugin::<Opaque3dDeferred, MeshPipeline>::new(self.debug_flags),
-            BinnedRenderPhasePlugin::<AlphaMask3dDeferred, MeshPipeline>::default(),
+            BinnedRenderPhasePlugin::<AlphaMask3dDeferred, MeshPipeline>::new(self.debug_flags),
-            SortedRenderPhasePlugin::<Transmissive3d, MeshPipeline>::default(),
+            SortedRenderPhasePlugin::<Transmissive3d, MeshPipeline>::new(self.debug_flags),
-            SortedRenderPhasePlugin::<Transparent3d, MeshPipeline>::default(),
+            SortedRenderPhasePlugin::<Transparent3d, MeshPipeline>::new(self.debug_flags),
        ));
        if let Some(render_app) = app.get_sub_app_mut(RenderApp) {
            render_app
                .init_resource::<MeshBindGroups>()
                .init_resource::<SkinIndices>()
                .init_resource::<MorphUniforms>()
                .init_resource::<MorphIndices>()
@ -202,7 +213,7 @@ impl Plugin for MeshRenderPlugin {
                        set_mesh_motion_vector_flags.in_set(RenderSet::PrepareMeshes),
                        prepare_skins.in_set(RenderSet::PrepareResources),
                        prepare_morphs.in_set(RenderSet::PrepareResources),
-                        prepare_mesh_bind_group.in_set(RenderSet::PrepareBindGroups),
+                        prepare_mesh_bind_groups.in_set(RenderSet::PrepareBindGroups),
                        prepare_mesh_view_bind_groups
                            .in_set(RenderSet::PrepareBindGroups)
                            .after(prepare_oit_buffers),
@ -238,12 +249,14 @@ impl Plugin for MeshRenderPlugin {
            if use_gpu_instance_buffer_builder {
                render_app
-                    .init_resource::<gpu_preprocessing::BatchedInstanceBuffers<MeshUniform, MeshInputUniform>>()
+                    .init_resource::<gpu_preprocessing::BatchedInstanceBuffers<
                        MeshUniform,
                        MeshInputUniform
                    >>()
                    .init_resource::<RenderMeshInstanceGpuQueues>()
                    .add_systems(
                        ExtractSchedule,
-                        extract_meshes_for_gpu_building
+                        extract_meshes_for_gpu_building.in_set(ExtractMeshesSet),
                            .in_set(ExtractMeshesSet),
                    )
                    .add_systems(
                        Render,
@ -1956,7 +1969,7 @@ impl GetFullBatchData for MeshPipeline {
        indexed: bool,
        base_output_index: u32,
        batch_set_index: Option<NonMaxU32>,
-        indirect_parameters_buffer: &mut IndirectParametersBuffers,
+        phase_indirect_parameters_buffers: &mut UntypedPhaseIndirectParametersBuffers,
        indirect_parameters_offset: u32,
    ) {
        let indirect_parameters = IndirectParametersMetadata {
@ -1971,9 +1984,10 @@ impl GetFullBatchData for MeshPipeline {
        };
        if indexed {
-            indirect_parameters_buffer.set_indexed(indirect_parameters_offset, indirect_parameters);
+            phase_indirect_parameters_buffers
                .set_indexed(indirect_parameters_offset, indirect_parameters);
        } else {
-            indirect_parameters_buffer
+            phase_indirect_parameters_buffers
                .set_non_indexed(indirect_parameters_offset, indirect_parameters);
        }
    }
@ -2567,9 +2581,12 @@ impl SpecializedMeshPipeline for MeshPipeline {
    }
 }
-/// Bind groups for meshes currently loaded.
+/// The bind groups for meshes currently loaded.
-#[derive(Resource, Default)]
+///
-pub struct MeshBindGroups {
+/// If GPU mesh preprocessing isn't in use, these are global to the scene. If
 /// GPU mesh preprocessing is in use, these are specific to a single phase.
 #[derive(Default)]
 pub struct MeshPhaseBindGroups {
    model_only: Option<BindGroup>,
    skinned: Option<MeshBindGroupPair>,
    morph_targets: HashMap<AssetId<Mesh>, MeshBindGroupPair>,
@ -2581,7 +2598,18 @@ pub struct MeshBindGroupPair {
    no_motion_vectors: BindGroup,
 }
-impl MeshBindGroups {
 /// All bind groups for meshes currently loaded.
 ///
 /// The variant reflects which mesh preprocessing path produced the bind
 /// groups: a single scene-wide set, or one set per render phase.
 #[derive(Resource)]
 pub enum MeshBindGroups {
    /// The bind groups for the meshes for the entire scene, if GPU mesh
    /// preprocessing isn't in use.
    CpuPreprocessing(MeshPhaseBindGroups),
    /// A mapping from the type ID of a phase (e.g. [`Opaque3d`]) to the mesh
    /// bind groups for that phase.
    ///
    /// A phase may have no entry in this map; consumers must handle a
    /// missing key.
    GpuPreprocessing(TypeIdMap<MeshPhaseBindGroups>),
 }
 impl MeshPhaseBindGroups {
    pub fn reset(&mut self) {
        self.model_only = None;
        self.skinned = None;
@ -2623,9 +2651,10 @@ impl MeshBindGroupPair {
    }
 }
-pub fn prepare_mesh_bind_group(
+/// Creates the per-mesh bind groups for each type of mesh and each phase.
 pub fn prepare_mesh_bind_groups(
    mut commands: Commands,
    meshes: Res<RenderAssets<RenderMesh>>,
    mut groups: ResMut<MeshBindGroups>,
    mesh_pipeline: Res<MeshPipeline>,
    render_device: Res<RenderDevice>,
    cpu_batched_instance_buffer: Option<
@ -2638,24 +2667,80 @@ pub fn prepare_mesh_bind_group(
    weights_uniform: Res<MorphUniforms>,
    mut render_lightmaps: ResMut<RenderLightmaps>,
 ) {
-    groups.reset();
+    // CPU mesh preprocessing path.
    if let Some(cpu_batched_instance_buffer) = cpu_batched_instance_buffer {
        if let Some(instance_data_binding) = cpu_batched_instance_buffer
            .into_inner()
            .instance_data_binding()
        {
            // In this path, we only have a single set of bind groups for all phases.
            let cpu_preprocessing_mesh_bind_groups = prepare_mesh_bind_groups_for_phase(
                instance_data_binding,
                &meshes,
                &mesh_pipeline,
                &render_device,
                &skins_uniform,
                &weights_uniform,
                &mut render_lightmaps,
            );
            commands.insert_resource(MeshBindGroups::CpuPreprocessing(
                cpu_preprocessing_mesh_bind_groups,
            ));
            return;
        }
    }
    // GPU mesh preprocessing path.
    if let Some(gpu_batched_instance_buffers) = gpu_batched_instance_buffers {
        let mut gpu_preprocessing_mesh_bind_groups = TypeIdMap::default();
        // Loop over each phase.
        for (phase_type_id, batched_phase_instance_buffers) in
            &gpu_batched_instance_buffers.phase_instance_buffers
        {
            let Some(instance_data_binding) =
                batched_phase_instance_buffers.instance_data_binding()
            else {
                continue;
            };
            let mesh_phase_bind_groups = prepare_mesh_bind_groups_for_phase(
                instance_data_binding,
                &meshes,
                &mesh_pipeline,
                &render_device,
                &skins_uniform,
                &weights_uniform,
                &mut render_lightmaps,
            );
            gpu_preprocessing_mesh_bind_groups.insert(*phase_type_id, mesh_phase_bind_groups);
        }
        commands.insert_resource(MeshBindGroups::GpuPreprocessing(
            gpu_preprocessing_mesh_bind_groups,
        ));
    }
 }
 /// Creates the per-mesh bind groups for each type of mesh, for a single phase.
 fn prepare_mesh_bind_groups_for_phase(
    model: BindingResource,
    meshes: &RenderAssets<RenderMesh>,
    mesh_pipeline: &MeshPipeline,
    render_device: &RenderDevice,
    skins_uniform: &SkinUniforms,
    weights_uniform: &MorphUniforms,
    render_lightmaps: &mut RenderLightmaps,
 ) -> MeshPhaseBindGroups {
    let layouts = &mesh_pipeline.mesh_layouts;
-    let model = if let Some(cpu_batched_instance_buffer) = cpu_batched_instance_buffer {
+    // TODO: Reuse allocations.
-        cpu_batched_instance_buffer
+    let mut groups = MeshPhaseBindGroups {
-            .into_inner()
+        model_only: Some(layouts.model_only(render_device, &model)),
-            .instance_data_binding()
+        ..default()
    } else if let Some(gpu_batched_instance_buffers) = gpu_batched_instance_buffers {
        gpu_batched_instance_buffers
            .into_inner()
            .instance_data_binding()
    } else {
        return;
    };
    let Some(model) = model else { return };
    groups.model_only = Some(layouts.model_only(&render_device, &model));
    // Create the skinned mesh bind group with the current and previous buffers
    // (the latter being for motion vector computation). If there's no previous
@ -2664,8 +2749,8 @@ pub fn prepare_mesh_bind_group(
    if let Some(skin) = skin {
        let prev_skin = skins_uniform.prev_buffer.buffer().unwrap_or(skin);
        groups.skinned = Some(MeshBindGroupPair {
-            motion_vectors: layouts.skinned_motion(&render_device, &model, skin, prev_skin),
+            motion_vectors: layouts.skinned_motion(render_device, &model, skin, prev_skin),
-            no_motion_vectors: layouts.skinned(&render_device, &model, skin),
+            no_motion_vectors: layouts.skinned(render_device, &model, skin),
        });
    }
@ -2680,7 +2765,7 @@ pub fn prepare_mesh_bind_group(
                        let prev_skin = skins_uniform.prev_buffer.buffer().unwrap_or(skin);
                        MeshBindGroupPair {
                            motion_vectors: layouts.morphed_skinned_motion(
-                                &render_device,
+                                render_device,
                                &model,
                                skin,
                                weights,
@ -2689,7 +2774,7 @@ pub fn prepare_mesh_bind_group(
                                prev_weights,
                            ),
                            no_motion_vectors: layouts.morphed_skinned(
-                                &render_device,
+                                render_device,
                                &model,
                                skin,
                                weights,
@ -2699,18 +2784,13 @@ pub fn prepare_mesh_bind_group(
                    }
                    None => MeshBindGroupPair {
                        motion_vectors: layouts.morphed_motion(
-                            &render_device,
+                            render_device,
                            &model,
                            weights,
                            targets,
                            prev_weights,
                        ),
-                        no_motion_vectors: layouts.morphed(
+                        no_motion_vectors: layouts.morphed(render_device, &model, weights, targets),
                            &render_device,
                            &model,
                            weights,
                            targets,
                        ),
                    },
                };
                groups.morph_targets.insert(id, bind_group_pair);
@ -2723,9 +2803,11 @@ pub fn prepare_mesh_bind_group(
    for (lightmap_slab_id, lightmap_slab) in render_lightmaps.slabs.iter_mut().enumerate() {
        groups.lightmaps.insert(
            LightmapSlabIndex(NonMaxU32::new(lightmap_slab_id as u32).unwrap()),
-            layouts.lightmapped(&render_device, &model, lightmap_slab, bindless_supported),
+            layouts.lightmapped(render_device, &model, lightmap_slab, bindless_supported),
        );
    }
    groups
 }
 pub struct SetMeshViewBindGroup<const I: usize>;
@ -2829,7 +2911,20 @@ impl<P: PhaseItem, const I: usize> RenderCommand<P> for SetMeshBindGroup<I> {
            .get(entity)
            .map(|render_lightmap| render_lightmap.slab_index);
-        let Some(bind_group) = bind_groups.get(
+        let Some(mesh_phase_bind_groups) = (match *bind_groups {
            MeshBindGroups::CpuPreprocessing(ref mesh_phase_bind_groups) => {
                Some(mesh_phase_bind_groups)
            }
            MeshBindGroups::GpuPreprocessing(ref mesh_phase_bind_groups) => {
                mesh_phase_bind_groups.get(&TypeId::of::<P>())
            }
        }) else {
            // This is harmless if e.g. we're rendering the `Shadow` phase and
            // there weren't any shadows.
            return RenderCommandResult::Success;
        };
        let Some(bind_group) = mesh_phase_bind_groups.get(
            mesh_asset_id,
            lightmap_slab_index,
            is_skinned,
@ -2981,9 +3076,18 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
                        // Look up the indirect parameters buffer, as well as
                        // the buffer we're going to use for
                        // `multi_draw_indexed_indirect_count` (if available).
                        let Some(phase_indirect_parameters_buffers) =
                            indirect_parameters_buffer.get(&TypeId::of::<P>())
                        else {
                            warn!(
                                "Not rendering mesh because indexed indirect parameters buffer \
                                 wasn't present for this phase",
                            );
                            return RenderCommandResult::Skip;
                        };
                        let (Some(indirect_parameters_buffer), Some(batch_sets_buffer)) = (
-                            indirect_parameters_buffer.indexed_data_buffer(),
+                            phase_indirect_parameters_buffers.indexed_data_buffer(),
-                            indirect_parameters_buffer.indexed_batch_sets_buffer(),
+                            phase_indirect_parameters_buffers.indexed_batch_sets_buffer(),
                        ) else {
                            warn!(
                                "Not rendering mesh because indexed indirect parameters buffer \
@ -3038,9 +3142,18 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
                    // Look up the indirect parameters buffer, as well as the
                    // buffer we're going to use for
                    // `multi_draw_indirect_count` (if available).
                    let Some(phase_indirect_parameters_buffers) =
                        indirect_parameters_buffer.get(&TypeId::of::<P>())
                    else {
                        warn!(
                            "Not rendering mesh because indexed indirect parameters buffer \
                                 wasn't present for this phase",
                        );
                        return RenderCommandResult::Skip;
                    };
                    let (Some(indirect_parameters_buffer), Some(batch_sets_buffer)) = (
-                        indirect_parameters_buffer.non_indexed_data_buffer(),
+                        phase_indirect_parameters_buffers.non_indexed_data_buffer(),
-                        indirect_parameters_buffer.non_indexed_batch_sets_buffer(),
+                        phase_indirect_parameters_buffers.non_indexed_batch_sets_buffer(),
                    ) else {
                        warn!(
                            "Not rendering mesh because non-indexed indirect parameters buffer \
--- a/crates/bevy_render/Cargo.toml
+++ b/crates/bevy_render/Cargo.toml
@ -101,6 +101,7 @@ variadics_please = "1.1"
 tracing = { version = "0.1", default-features = false, features = ["std"] }
 indexmap = { version = "2" }
 fixedbitset = { version = "0.5" }
 bitflags = "2"
 [target.'cfg(not(target_arch = "wasm32"))'.dependencies]
 # Omit the `glsl` feature in non-WebAssembly by default.
--- a/crates/bevy_render/src/batching/gpu_preprocessing.rs
+++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs
@ -1,8 +1,9 @@
 //! Batching functionality when GPU preprocessing is in use.
-use core::any::TypeId;
+use core::{any::TypeId, marker::PhantomData, mem};
 use bevy_app::{App, Plugin};
 use bevy_derive::{Deref, DerefMut};
 use bevy_ecs::{
    prelude::Entity,
    query::{Has, With},
@ -24,26 +25,22 @@ use crate::{
    experimental::occlusion_culling::OcclusionCulling,
    render_phase::{
        BinnedPhaseItem, BinnedRenderPhaseBatch, BinnedRenderPhaseBatchSet,
-        BinnedRenderPhaseBatchSets, CachedRenderPipelinePhaseItem, InputUniformIndex,
+        BinnedRenderPhaseBatchSets, CachedRenderPipelinePhaseItem, InputUniformIndex, PhaseItem,
        PhaseItemBatchSetKey as _, PhaseItemExtraIndex, SortedPhaseItem, SortedRenderPhase,
        UnbatchableBinnedEntityIndices, ViewBinnedRenderPhases, ViewSortedRenderPhases,
    },
    render_resource::{Buffer, BufferVec, GpuArrayBufferable, RawBufferVec, UninitBufferVec},
    renderer::{RenderAdapter, RenderDevice, RenderQueue},
    view::{ExtractedView, NoIndirectDrawing, RetainedViewEntity},
-    Render, RenderApp, RenderSet,
+    Render, RenderApp, RenderDebugFlags, RenderSet,
 };
 use super::{BatchMeta, GetBatchData, GetFullBatchData};
 #[derive(Default)]
 pub struct BatchingPlugin {
-    /// If true, this sets the `COPY_SRC` flag on indirect draw parameters so
+    /// Debugging flags that can optionally be set when constructing the renderer.
-    /// that they can be read back to CPU.
+    pub debug_flags: RenderDebugFlags,
    ///
    /// This is a debugging feature that may reduce performance. It primarily
    /// exists for the `occlusion_culling` example.
    pub allow_copies_from_indirect_parameters: bool,
 }
 impl Plugin for BatchingPlugin {
@ -54,7 +51,8 @@ impl Plugin for BatchingPlugin {
        render_app
            .insert_resource(IndirectParametersBuffers::new(
-                self.allow_copies_from_indirect_parameters,
+                self.debug_flags
                    .contains(RenderDebugFlags::ALLOW_COPIES_FROM_INDIRECT_PARAMETERS),
            ))
            .add_systems(
                Render,
@ -147,18 +145,6 @@ where
    BD: GpuArrayBufferable + Sync + Send + 'static,
    BDI: Pod + Default,
 {
    /// A storage area for the buffer data that the GPU compute shader is
    /// expected to write to.
    ///
    /// There will be one entry for each index.
    pub data_buffer: UninitBufferVec<BD>,
    /// The index of the buffer data in the current input buffer that
    /// corresponds to each instance.
    ///
    /// This is keyed off each view. Each view has a separate buffer.
    pub work_item_buffers: HashMap<RetainedViewEntity, TypeIdMap<PreprocessWorkItemBuffers>>,
    /// The uniform data inputs for the current frame.
    ///
    /// These are uploaded during the extraction phase.
@ -173,6 +159,81 @@ where
    /// corresponding buffer data input uniform in this list.
    pub previous_input_buffer: InstanceInputUniformBuffer<BDI>,
    /// The data needed to render buffers for each phase.
    ///
    /// The keys of this map are the type IDs of each phase: e.g. `Opaque3d`,
    /// `AlphaMask3d`, etc.
    pub phase_instance_buffers: TypeIdMap<UntypedPhaseBatchedInstanceBuffers<BD>>,
 }
 impl<BD, BDI> Default for BatchedInstanceBuffers<BD, BDI>
 where
    BD: GpuArrayBufferable + Sync + Send + 'static,
    BDI: Pod + Sync + Send + Default + 'static,
 {
    fn default() -> Self {
        BatchedInstanceBuffers {
            current_input_buffer: InstanceInputUniformBuffer::new(),
            previous_input_buffer: InstanceInputUniformBuffer::new(),
            phase_instance_buffers: HashMap::default(),
        }
    }
 }
 /// The GPU buffers holding the data needed to render batches for a single
 /// phase.
 ///
 /// This is the version of the structure that carries the phase item type `PI`
 /// as a type parameter. Because each phase therefore gets its own distinct
 /// resource type, Bevy's scheduler can run the batching operations for the
 /// different phases in parallel.
 ///
 /// See the documentation for [`BatchedInstanceBuffers`] for more information.
 #[derive(Resource)]
 pub struct PhaseBatchedInstanceBuffers<PI, BD>
 where
    PI: PhaseItem,
    BD: GpuArrayBufferable + Sync + Send + 'static,
 {
    /// The buffers for this phase.
    pub buffers: UntypedPhaseBatchedInstanceBuffers<BD>,
    /// Marker that ties this resource to the phase item type `PI`; no `PI`
    /// value is stored.
    phantom: PhantomData<PI>,
 }
 /// Builds a [`PhaseBatchedInstanceBuffers`] wrapping a default set of untyped
 /// per-phase buffers.
 impl<PI, BD> Default for PhaseBatchedInstanceBuffers<PI, BD>
 where
    PI: PhaseItem,
    BD: GpuArrayBufferable + Sync + Send + 'static,
 {
    fn default() -> Self {
        let buffers = UntypedPhaseBatchedInstanceBuffers::default();
        Self {
            buffers,
            phantom: PhantomData,
        }
    }
 }
 /// The GPU buffers holding the data needed to render batches for a single
 /// phase, without a type parameter for that phase.
 ///
 /// Since this structure doesn't have a type parameter, it can be placed in
 /// [`BatchedInstanceBuffers::phase_instance_buffers`].
 pub struct UntypedPhaseBatchedInstanceBuffers<BD>
 where
    BD: GpuArrayBufferable + Sync + Send + 'static,
 {
    /// A storage area for the buffer data that the GPU compute shader is
    /// expected to write to.
    ///
    /// There will be one entry for each index.
    pub data_buffer: UninitBufferVec<BD>,
    /// The index of the buffer data in the current input buffer that
    /// corresponds to each instance.
    ///
    /// This is keyed off each view. Each view has a separate buffer.
    pub work_item_buffers: HashMap<RetainedViewEntity, PreprocessWorkItemBuffers>,
    /// A buffer that holds the number of indexed meshes that weren't visible in
    /// the previous frame, when GPU occlusion culling is in use.
    ///
@ -351,11 +412,11 @@ pub struct GpuOcclusionCullingWorkItemBuffers {
    /// The buffer of work items corresponding to non-indexed meshes.
    pub late_non_indexed: UninitBufferVec<PreprocessWorkItem>,
    /// The offset into the
-    /// [`BatchedInstanceBuffers::late_indexed_indirect_parameters_buffer`]
+    /// [`UntypedPhaseBatchedInstanceBuffers::late_indexed_indirect_parameters_buffer`]
    /// where this view's indirect dispatch counts for indexed meshes live.
    pub late_indirect_parameters_indexed_offset: u32,
    /// The offset into the
-    /// [`BatchedInstanceBuffers::late_non_indexed_indirect_parameters_buffer`]
+    /// [`UntypedPhaseBatchedInstanceBuffers::late_non_indexed_indirect_parameters_buffer`]
    /// where this view's indirect dispatch counts for non-indexed meshes live.
    pub late_indirect_parameters_non_indexed_offset: u32,
 }
@ -409,7 +470,7 @@ impl Default for LatePreprocessWorkItemIndirectParameters {
 /// You may need to call this function if you're implementing your own custom
 /// render phases. See the `specialized_mesh_pipeline` example.
 pub fn get_or_create_work_item_buffer<'a, I>(
-    work_item_buffers: &'a mut HashMap<RetainedViewEntity, TypeIdMap<PreprocessWorkItemBuffers>>,
+    work_item_buffers: &'a mut HashMap<RetainedViewEntity, PreprocessWorkItemBuffers>,
    view: RetainedViewEntity,
    no_indirect_drawing: bool,
    enable_gpu_occlusion_culling: bool,
@ -417,11 +478,7 @@ pub fn get_or_create_work_item_buffer<'a, I>(
 where
    I: 'static,
 {
-    let preprocess_work_item_buffers = match work_item_buffers
+    let preprocess_work_item_buffers = match work_item_buffers.entry(view) {
        .entry(view)
        .or_default()
        .entry(TypeId::of::<I>())
    {
        Entry::Occupied(occupied_entry) => occupied_entry.into_mut(),
        Entry::Vacant(vacant_entry) => {
            if no_indirect_drawing {
@ -700,8 +757,71 @@ pub struct IndirectBatchSet {
 /// pass can determine how many meshes are actually to be drawn.
 ///
 /// These buffers will remain empty if indirect drawing isn't in use.
-#[derive(Resource)]
+#[derive(Resource, Deref, DerefMut)]
 pub struct IndirectParametersBuffers {
    /// A mapping from a phase type ID to the indirect parameters buffers for
    /// that phase.
    ///
    /// Examples of phase type IDs are `Opaque3d` and `AlphaMask3d`.
    #[deref]
    pub buffers: TypeIdMap<UntypedPhaseIndirectParametersBuffers>,
    /// If true, this sets the `COPY_SRC` flag on indirect draw parameters so
    /// that they can be read back to CPU.
    ///
    /// This is a debugging feature that may reduce performance. It primarily
    /// exists for the `occlusion_culling` example.
    pub allow_copies_from_indirect_parameter_buffers: bool,
 }
 impl IndirectParametersBuffers {
    /// Creates an [`IndirectParametersBuffers`] resource with an empty
    /// per-phase table.
    ///
    /// `allow_copies_from_indirect_parameter_buffers` is stored as-is in the
    /// field of the same name.
    pub fn new(allow_copies_from_indirect_parameter_buffers: bool) -> IndirectParametersBuffers {
        let buffers = TypeIdMap::default();
        Self {
            buffers,
            allow_copies_from_indirect_parameter_buffers,
        }
    }
 }
 /// The buffers containing all the information that indirect draw commands use
 /// to draw the scene, for a single phase.
 ///
 /// This is the version of the structure that carries the phase item type `PI`
 /// as a type parameter, so that the batching for different phases can run in
 /// parallel.
 ///
 /// See the [`IndirectParametersBuffers`] documentation for more information.
 #[derive(Resource)]
 pub struct PhaseIndirectParametersBuffers<PI>
 where
    PI: PhaseItem,
 {
    /// The indirect draw buffers for the phase.
    pub buffers: UntypedPhaseIndirectParametersBuffers,
    /// Marker that ties this resource to the phase item type `PI`; no `PI`
    /// value is stored.
    phantom: PhantomData<PI>,
 }
 impl<PI> PhaseIndirectParametersBuffers<PI>
 where
    PI: PhaseItem,
 {
    /// Creates the indirect parameters buffers for the phase `PI`.
    ///
    /// `allow_copies_from_indirect_parameter_buffers` is forwarded unchanged to
    /// [`UntypedPhaseIndirectParametersBuffers::new`].
    pub fn new(allow_copies_from_indirect_parameter_buffers: bool) -> Self {
        let buffers = UntypedPhaseIndirectParametersBuffers::new(
            allow_copies_from_indirect_parameter_buffers,
        );
        Self {
            buffers,
            phantom: PhantomData,
        }
    }
 }
 /// The buffers containing all the information that indirect draw commands use
 /// to draw the scene, for a single phase.
 ///
 /// This is the version of the structure that doesn't have a type parameter, so
 /// that it can be inserted into [`IndirectParametersBuffers::buffers`].
 ///
 /// See the [`IndirectParametersBuffers`] documentation for more information.
 pub struct UntypedPhaseIndirectParametersBuffers {
    /// The GPU buffer that stores the indirect draw parameters for non-indexed
    /// meshes.
    ///
@ -751,15 +871,17 @@ pub struct IndirectParametersBuffers {
    indexed_batch_sets: RawBufferVec<IndirectBatchSet>,
 }
-impl IndirectParametersBuffers {
+impl UntypedPhaseIndirectParametersBuffers {
    /// Creates the indirect parameters buffers.
-    pub fn new(allow_copies_from_indirect_parameter_buffers: bool) -> IndirectParametersBuffers {
+    pub fn new(
        allow_copies_from_indirect_parameter_buffers: bool,
    ) -> UntypedPhaseIndirectParametersBuffers {
        let mut indirect_parameter_buffer_usages = BufferUsages::STORAGE | BufferUsages::INDIRECT;
        if allow_copies_from_indirect_parameter_buffers {
            indirect_parameter_buffer_usages |= BufferUsages::COPY_SRC;
        }
-        IndirectParametersBuffers {
+        UntypedPhaseIndirectParametersBuffers {
            non_indexed_data: UninitBufferVec::new(indirect_parameter_buffer_usages),
            non_indexed_metadata: RawBufferVec::new(BufferUsages::STORAGE),
            non_indexed_batch_sets: RawBufferVec::new(indirect_parameter_buffer_usages),
@ -952,6 +1074,15 @@ impl IndirectParametersBuffers {
    pub fn get_next_batch_set_index(&self, indexed: bool) -> Option<NonMaxU32> {
        NonMaxU32::new(self.batch_set_count(indexed) as u32)
    }
    /// Calls `clear` on all six buffers held for this phase: the data,
    /// metadata, and batch-set buffers for both non-indexed and indexed meshes.
    pub fn clear(&mut self) {
        // Non-indexed meshes.
        self.non_indexed_data.clear();
        self.non_indexed_metadata.clear();
        self.non_indexed_batch_sets.clear();

        // Indexed meshes.
        self.indexed_data.clear();
        self.indexed_metadata.clear();
        self.indexed_batch_sets.clear();
    }
 }
 impl Default for IndirectParametersBuffers {
@ -1007,11 +1138,24 @@ where
 {
    /// Creates new buffers.
    pub fn new() -> Self {
-        BatchedInstanceBuffers {
+        Self::default()
    }
    /// Clears out the buffers in preparation for a new frame.
    pub fn clear(&mut self) {
        // TODO: Don't do this.
        self.phase_instance_buffers.clear();
    }
 }
 impl<BD> UntypedPhaseBatchedInstanceBuffers<BD>
 where
    BD: GpuArrayBufferable + Sync + Send + 'static,
 {
    pub fn new() -> Self {
        UntypedPhaseBatchedInstanceBuffers {
            data_buffer: UninitBufferVec::new(BufferUsages::STORAGE),
            work_item_buffers: HashMap::default(),
            current_input_buffer: InstanceInputUniformBuffer::new(),
            previous_input_buffer: InstanceInputUniformBuffer::new(),
            late_indexed_indirect_parameters_buffer: RawBufferVec::new(
                BufferUsages::STORAGE | BufferUsages::INDIRECT,
            ),
@ -1039,17 +1183,14 @@ where
        // Clear each individual set of buffers, but don't depopulate the hash
        // table. We want to avoid reallocating these vectors every frame.
        for view_work_item_buffers in self.work_item_buffers.values_mut() {
-            for phase_work_item_buffers in view_work_item_buffers.values_mut() {
+            view_work_item_buffers.clear();
                phase_work_item_buffers.clear();
            }
        }
    }
 }
-impl<BD, BDI> Default for BatchedInstanceBuffers<BD, BDI>
+impl<BD> Default for UntypedPhaseBatchedInstanceBuffers<BD>
 where
    BD: GpuArrayBufferable + Sync + Send + 'static,
    BDI: Pod + Default + Sync + Send + 'static,
 {
    fn default() -> Self {
        Self::new()
@ -1098,7 +1239,7 @@ where
        self,
        instance_end_index: u32,
        phase: &mut SortedRenderPhase<I>,
-        indirect_parameters_buffers: &mut IndirectParametersBuffers,
+        phase_indirect_parameters_buffers: &mut UntypedPhaseIndirectParametersBuffers,
    ) where
        I: CachedRenderPipelinePhaseItem + SortedPhaseItem,
    {
@ -1114,7 +1255,7 @@ where
            None => PhaseItemExtraIndex::None,
        };
        if let Some(indirect_parameters_index) = self.indirect_parameters_index {
-            indirect_parameters_buffers
+            phase_indirect_parameters_buffers
                .add_batch_set(self.indexed, indirect_parameters_index.into());
        }
    }
@ -1156,17 +1297,23 @@ pub fn delete_old_work_item_buffers<GFBD>(
        .iter()
        .map(|extracted_view| extracted_view.retained_view_entity)
        .collect();
-    gpu_batched_instance_buffers
+    for phase_instance_buffers in gpu_batched_instance_buffers
-        .work_item_buffers
+        .phase_instance_buffers
-        .retain(|retained_view_entity, _| retained_view_entities.contains(retained_view_entity));
+        .values_mut()
    {
        phase_instance_buffers
            .work_item_buffers
            .retain(|retained_view_entity, _| {
                retained_view_entities.contains(retained_view_entity)
            });
    }
 }
 /// Batch the items in a sorted render phase, when GPU instance buffer building
 /// is in use. This means comparing metadata needed to draw each phase item and
 /// trying to combine the draws into a batch.
 pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
-    gpu_array_buffer: ResMut<BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>>,
+    indirect_parameters_buffers: Res<IndirectParametersBuffers>,
    mut indirect_parameters_buffers: ResMut<IndirectParametersBuffers>,
    mut sorted_render_phases: ResMut<ViewSortedRenderPhases<I>>,
    mut views: Query<(
        &ExtractedView,
@ -1178,14 +1325,19 @@ pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
    I: CachedRenderPipelinePhaseItem + SortedPhaseItem,
    GFBD: GetFullBatchData,
 {
    let mut phase_batched_instance_buffers =
        UntypedPhaseBatchedInstanceBuffers::<GFBD::BufferData>::new();
    let mut phase_indirect_parameters_buffers = UntypedPhaseIndirectParametersBuffers::new(
        indirect_parameters_buffers.allow_copies_from_indirect_parameter_buffers,
    );
    // We only process GPU-built batch data in this function.
-    let BatchedInstanceBuffers {
+    let UntypedPhaseBatchedInstanceBuffers {
        ref mut data_buffer,
        ref mut work_item_buffers,
        ref mut late_indexed_indirect_parameters_buffer,
        ref mut late_non_indexed_indirect_parameters_buffer,
-        ..
+    } = phase_batched_instance_buffers;
    } = gpu_array_buffer.into_inner();
    for (extracted_view, no_indirect_drawing, gpu_occlusion_culling) in &mut views {
        let Some(phase) = sorted_render_phases.get_mut(&extracted_view.retained_view_entity) else {
@ -1231,7 +1383,7 @@ pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
                    batch.flush(
                        data_buffer.len() as u32,
                        phase,
-                        &mut indirect_parameters_buffers,
+                        &mut phase_indirect_parameters_buffers,
                    );
                }
@ -1257,15 +1409,15 @@ pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
            if !can_batch {
                // Break a batch if we need to.
                if let Some(batch) = batch.take() {
-                    batch.flush(output_index, phase, &mut indirect_parameters_buffers);
+                    batch.flush(output_index, phase, &mut phase_indirect_parameters_buffers);
                }
                let indirect_parameters_index = if no_indirect_drawing {
                    None
                } else if item_is_indexed {
-                    Some(indirect_parameters_buffers.allocate_indexed(1))
+                    Some(phase_indirect_parameters_buffers.allocate_indexed(1))
                } else {
-                    Some(indirect_parameters_buffers.allocate_non_indexed(1))
+                    Some(phase_indirect_parameters_buffers.allocate_non_indexed(1))
                };
                // Start a new batch.
@ -1275,7 +1427,7 @@ pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
                        item_is_indexed,
                        output_index,
                        None,
-                        &mut indirect_parameters_buffers,
+                        &mut phase_indirect_parameters_buffers,
                        indirect_parameters_index,
                    );
                };
@ -1317,7 +1469,7 @@ pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
            batch.flush(
                data_buffer.len() as u32,
                phase,
-                &mut indirect_parameters_buffers,
+                &mut phase_indirect_parameters_buffers,
            );
        }
    }
@ -1325,8 +1477,8 @@ pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
 /// Creates batches for a render phase that uses bins.
 pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
-    gpu_array_buffer: ResMut<BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>>,
+    mut phase_batched_instance_buffers: ResMut<PhaseBatchedInstanceBuffers<BPI, GFBD::BufferData>>,
-    mut indirect_parameters_buffers: ResMut<IndirectParametersBuffers>,
+    mut phase_indirect_parameters_buffers: ResMut<PhaseIndirectParametersBuffers<BPI>>,
    mut binned_render_phases: ResMut<ViewBinnedRenderPhases<BPI>>,
    mut views: Query<
        (
@ -1343,13 +1495,12 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
 {
    let system_param_item = param.into_inner();
-    let BatchedInstanceBuffers {
+    let UntypedPhaseBatchedInstanceBuffers {
        ref mut data_buffer,
        ref mut work_item_buffers,
        ref mut late_indexed_indirect_parameters_buffer,
        ref mut late_non_indexed_indirect_parameters_buffer,
-        ..
+    } = phase_batched_instance_buffers.buffers;
    } = gpu_array_buffer.into_inner();
    for (extracted_view, no_indirect_drawing, gpu_occlusion_culling) in &mut views {
        let Some(phase) = binned_render_phases.get_mut(&extracted_view.retained_view_entity) else {
@ -1376,8 +1527,10 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
        for (batch_set_key, bins) in &phase.multidrawable_meshes {
            let mut batch_set = None;
-            let indirect_parameters_base =
+            let indirect_parameters_base = phase_indirect_parameters_buffers
-                indirect_parameters_buffers.batch_count(batch_set_key.indexed()) as u32;
+                .buffers
                .batch_count(batch_set_key.indexed())
                as u32;
            for (bin_key, bin) in bins {
                let first_output_index = data_buffer.len() as u32;
                let mut batch: Option<BinnedRenderPhaseBatch> = None;
@ -1408,9 +1561,11 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
                        None => {
                            // Start a new batch, in indirect mode.
-                            let indirect_parameters_index =
+                            let indirect_parameters_index = phase_indirect_parameters_buffers
-                                indirect_parameters_buffers.allocate(batch_set_key.indexed(), 1);
+                                .buffers
-                            let batch_set_index = indirect_parameters_buffers
+                                .allocate(batch_set_key.indexed(), 1);
                            let batch_set_index = phase_indirect_parameters_buffers
                                .buffers
                                .get_next_batch_set_index(batch_set_key.indexed());
                            GFBD::write_batch_indirect_parameters_metadata(
@ -1418,7 +1573,7 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
                                batch_set_key.indexed(),
                                output_index,
                                batch_set_index,
-                                &mut indirect_parameters_buffers,
+                                &mut phase_indirect_parameters_buffers.buffers,
                                indirect_parameters_index,
                            );
                            work_item_buffer.push(
@ -1447,7 +1602,8 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
                                first_batch: batch,
                                batch_count: 1,
                                bin_key: bin_key.clone(),
-                                index: indirect_parameters_buffers
+                                index: phase_indirect_parameters_buffers
                                    .buffers
                                    .batch_set_count(batch_set_key.indexed())
                                    as u32,
                            });
@ -1464,7 +1620,8 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
            {
                if let Some(batch_set) = batch_set {
                    batch_sets.push(batch_set);
-                    indirect_parameters_buffers
+                    phase_indirect_parameters_buffers
                        .buffers
                        .add_batch_set(batch_set_key.indexed(), indirect_parameters_base);
                }
            }
@ -1513,17 +1670,19 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
                    None if !no_indirect_drawing => {
                        // Start a new batch, in indirect mode.
-                        let indirect_parameters_index =
+                        let indirect_parameters_index = phase_indirect_parameters_buffers
-                            indirect_parameters_buffers.allocate(key.0.indexed(), 1);
+                            .buffers
-                        let batch_set_index =
+                            .allocate(key.0.indexed(), 1);
-                            indirect_parameters_buffers.get_next_batch_set_index(key.0.indexed());
+                        let batch_set_index = phase_indirect_parameters_buffers
                            .buffers
                            .get_next_batch_set_index(key.0.indexed());
                        GFBD::write_batch_indirect_parameters_metadata(
                            input_index,
                            key.0.indexed(),
                            output_index,
                            batch_set_index,
-                            &mut indirect_parameters_buffers,
+                            &mut phase_indirect_parameters_buffers.buffers,
                            indirect_parameters_index,
                        );
                        work_item_buffer.push(
@ -1580,7 +1739,9 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
                            first_batch: batch,
                            batch_count: 1,
                            bin_key: key.1.clone(),
-                            index: indirect_parameters_buffers.batch_set_count(key.0.indexed())
+                            index: phase_indirect_parameters_buffers
                                .buffers
                                .batch_set_count(key.0.indexed())
                                as u32,
                        });
                    }
@ -1595,12 +1756,14 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
                None
            } else if key.0.indexed() {
                Some(
-                    indirect_parameters_buffers
+                    phase_indirect_parameters_buffers
                        .buffers
                        .allocate_indexed(unbatchables.entities.len() as u32),
                )
            } else {
                Some(
-                    indirect_parameters_buffers
+                    phase_indirect_parameters_buffers
                        .buffers
                        .allocate_non_indexed(unbatchables.entities.len() as u32),
                )
            };
@ -1620,7 +1783,7 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
                        key.0.indexed(),
                        output_index,
                        None,
-                        &mut indirect_parameters_buffers,
+                        &mut phase_indirect_parameters_buffers.buffers,
                        *indirect_parameters_index,
                    );
                    work_item_buffer.push(
@ -1640,7 +1803,8 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
                                batch_set_index: None,
                            },
                        });
-                    indirect_parameters_buffers
+                    phase_indirect_parameters_buffers
                        .buffers
                        .add_batch_set(key.0.indexed(), *indirect_parameters_index);
                    *indirect_parameters_index += 1;
                } else {
@ -1664,6 +1828,64 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
    }
 }
 /// A system that publishes one phase's GPU buffers into the global
 /// [`BatchedInstanceBuffers`] and [`IndirectParametersBuffers`] tables.
 ///
 /// This runs after the [`batch_and_prepare_binned_render_phase`] or
 /// [`batch_and_prepare_sorted_render_phase`] systems. It moves the contents of
 /// the per-phase [`PhaseBatchedInstanceBuffers`] and
 /// [`PhaseIndirectParametersBuffers`] resources into the global tables, keyed
 /// by the type ID of the phase `PI`.
 ///
 /// This system exists so that the [`batch_and_prepare_binned_render_phase`]
 /// and [`batch_and_prepare_sorted_render_phase`] systems can run in parallel
 /// with one another. If those systems wrote to [`BatchedInstanceBuffers`] and
 /// [`IndirectParametersBuffers`] directly, they wouldn't be able to run in
 /// parallel.
 pub fn collect_buffers_for_phase<PI, GFBD>(
    mut phase_batched_instance_buffers: ResMut<PhaseBatchedInstanceBuffers<PI, GFBD::BufferData>>,
    mut phase_indirect_parameters_buffers: ResMut<PhaseIndirectParametersBuffers<PI>>,
    mut batched_instance_buffers: ResMut<
        BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>,
    >,
    mut indirect_parameters_buffers: ResMut<IndirectParametersBuffers>,
 ) where
    PI: PhaseItem,
    GFBD: GetFullBatchData + Send + Sync + 'static,
 {
    // Both global tables are keyed by the phase's type ID.
    let phase_type_id = TypeId::of::<PI>();

    // Move this frame's instance buffers into the global table, leaving a
    // default value behind in the per-phase resource.
    let fresh_instance_buffers = mem::take(&mut phase_batched_instance_buffers.buffers);
    let displaced_instance_buffers = batched_instance_buffers
        .phase_instance_buffers
        .insert(phase_type_id, fresh_instance_buffers);

    // If the table already held buffers for this phase, clear them and hand
    // them back to the per-phase resource in order to reuse allocations.
    if let Some(mut recycled) = displaced_instance_buffers {
        recycled.clear();
        phase_batched_instance_buffers.buffers = recycled;
    }

    // Do the same for the indirect parameters buffers. There is no `Default`
    // to fall back on here, so the placeholder is built explicitly.
    let allow_copies = indirect_parameters_buffers.allow_copies_from_indirect_parameter_buffers;
    let fresh_indirect_parameters_buffers = mem::replace(
        &mut phase_indirect_parameters_buffers.buffers,
        UntypedPhaseIndirectParametersBuffers::new(allow_copies),
    );
    let displaced_indirect_parameters_buffers =
        indirect_parameters_buffers.insert(phase_type_id, fresh_indirect_parameters_buffers);

    if let Some(mut recycled) = displaced_indirect_parameters_buffers {
        recycled.clear();
        phase_indirect_parameters_buffers.buffers = recycled;
    }
 }
 /// A system that writes all instance buffers to the GPU.
 pub fn write_batched_instance_buffers<GFBD>(
    render_device: Res<RenderDevice>,
@ -1673,26 +1895,31 @@ pub fn write_batched_instance_buffers<GFBD>(
    GFBD: GetFullBatchData,
 {
    let BatchedInstanceBuffers {
        ref mut data_buffer,
        ref mut work_item_buffers,
        ref mut current_input_buffer,
        ref mut previous_input_buffer,
-        ref mut late_indexed_indirect_parameters_buffer,
+        ref mut phase_instance_buffers,
        ref mut late_non_indexed_indirect_parameters_buffer,
    } = gpu_array_buffer.into_inner();
    data_buffer.write_buffer(&render_device);
    current_input_buffer
        .buffer
        .write_buffer(&render_device, &render_queue);
    previous_input_buffer
        .buffer
        .write_buffer(&render_device, &render_queue);
    late_indexed_indirect_parameters_buffer.write_buffer(&render_device, &render_queue);
    late_non_indexed_indirect_parameters_buffer.write_buffer(&render_device, &render_queue);
-    for view_work_item_buffers in work_item_buffers.values_mut() {
+    for phase_instance_buffers in phase_instance_buffers.values_mut() {
-        for phase_work_item_buffers in view_work_item_buffers.values_mut() {
+        let UntypedPhaseBatchedInstanceBuffers {
            ref mut data_buffer,
            ref mut work_item_buffers,
            ref mut late_indexed_indirect_parameters_buffer,
            ref mut late_non_indexed_indirect_parameters_buffer,
        } = *phase_instance_buffers;
        data_buffer.write_buffer(&render_device);
        late_indexed_indirect_parameters_buffer.write_buffer(&render_device, &render_queue);
        late_non_indexed_indirect_parameters_buffer.write_buffer(&render_device, &render_queue);
        for phase_work_item_buffers in work_item_buffers.values_mut() {
            match *phase_work_item_buffers {
                PreprocessWorkItemBuffers::Direct(ref mut buffer_vec) => {
                    buffer_vec.write_buffer(&render_device, &render_queue);
@ -1728,12 +1955,9 @@ pub fn write_batched_instance_buffers<GFBD>(
 pub fn clear_indirect_parameters_buffers(
    mut indirect_parameters_buffers: ResMut<IndirectParametersBuffers>,
 ) {
-    indirect_parameters_buffers.indexed_data.clear();
+    for phase_indirect_parameters_buffers in indirect_parameters_buffers.values_mut() {
-    indirect_parameters_buffers.indexed_metadata.clear();
+        phase_indirect_parameters_buffers.clear();
-    indirect_parameters_buffers.indexed_batch_sets.clear();
+    }
    indirect_parameters_buffers.non_indexed_data.clear();
    indirect_parameters_buffers.non_indexed_metadata.clear();
    indirect_parameters_buffers.non_indexed_batch_sets.clear();
 }
 pub fn write_indirect_parameters_buffers(
@ -1741,26 +1965,28 @@ pub fn write_indirect_parameters_buffers(
    render_queue: Res<RenderQueue>,
    mut indirect_parameters_buffers: ResMut<IndirectParametersBuffers>,
 ) {
-    indirect_parameters_buffers
+    for phase_indirect_parameters_buffers in indirect_parameters_buffers.values_mut() {
-        .indexed_data
+        phase_indirect_parameters_buffers
-        .write_buffer(&render_device);
+            .indexed_data
-    indirect_parameters_buffers
+            .write_buffer(&render_device);
-        .non_indexed_data
+        phase_indirect_parameters_buffers
-        .write_buffer(&render_device);
+            .non_indexed_data
            .write_buffer(&render_device);
-    indirect_parameters_buffers
+        phase_indirect_parameters_buffers
-        .indexed_metadata
+            .indexed_metadata
-        .write_buffer(&render_device, &render_queue);
+            .write_buffer(&render_device, &render_queue);
-    indirect_parameters_buffers
+        phase_indirect_parameters_buffers
-        .non_indexed_metadata
+            .non_indexed_metadata
-        .write_buffer(&render_device, &render_queue);
+            .write_buffer(&render_device, &render_queue);
-    indirect_parameters_buffers
+        phase_indirect_parameters_buffers
-        .indexed_batch_sets
+            .indexed_batch_sets
-        .write_buffer(&render_device, &render_queue);
+            .write_buffer(&render_device, &render_queue);
-    indirect_parameters_buffers
+        phase_indirect_parameters_buffers
-        .non_indexed_batch_sets
+            .non_indexed_batch_sets
-        .write_buffer(&render_device, &render_queue);
+            .write_buffer(&render_device, &render_queue);
    }
 }
 #[cfg(test)]
--- a/crates/bevy_render/src/batching/mod.rs
+++ b/crates/bevy_render/src/batching/mod.rs
@ -4,18 +4,15 @@ use bevy_ecs::{
    system::{ResMut, SystemParam, SystemParamItem},
 };
 use bytemuck::Pod;
 use gpu_preprocessing::UntypedPhaseIndirectParametersBuffers;
 use nonmax::NonMaxU32;
 use self::gpu_preprocessing::IndirectParametersBuffers;
 use crate::{
    render_phase::{
-        BinnedPhaseItem, CachedRenderPipelinePhaseItem, DrawFunctionId, SortedPhaseItem,
+        BinnedPhaseItem, CachedRenderPipelinePhaseItem, DrawFunctionId, InputUniformIndex,
-        SortedRenderPhase, ViewBinnedRenderPhases,
+        PhaseItemExtraIndex, SortedPhaseItem, SortedRenderPhase, ViewBinnedRenderPhases,
    },
    render_resource::{CachedRenderPipelineId, GpuArrayBufferable},
 };
 use crate::{
    render_phase::{InputUniformIndex, PhaseItemExtraIndex},
    sync_world::MainEntity,
 };
@ -179,7 +176,7 @@ pub trait GetFullBatchData: GetBatchData {
        indexed: bool,
        base_output_index: u32,
        batch_set_index: Option<NonMaxU32>,
-        indirect_parameters_buffers: &mut IndirectParametersBuffers,
+        indirect_parameters_buffers: &mut UntypedPhaseIndirectParametersBuffers,
        indirect_parameters_offset: u32,
    );
 }
--- a/crates/bevy_render/src/lib.rs
+++ b/crates/bevy_render/src/lib.rs
@ -102,6 +102,7 @@ use alloc::sync::Arc;
 use bevy_app::{App, AppLabel, Plugin, SubApp};
 use bevy_asset::{load_internal_asset, weak_handle, AssetApp, AssetServer, Handle};
 use bevy_ecs::{prelude::*, schedule::ScheduleLabel};
 use bitflags::bitflags;
 use core::ops::{Deref, DerefMut};
 use std::sync::Mutex;
 use tracing::debug;
@ -120,12 +121,21 @@ pub struct RenderPlugin {
    /// If `true`, disables asynchronous pipeline compilation.
    /// This has no effect on macOS, Wasm, iOS, or without the `multi_threaded` feature.
    pub synchronous_pipeline_compilation: bool,
-    /// If true, this sets the `COPY_SRC` flag on indirect draw parameters so
+    /// Debugging flags that can optionally be set when constructing the renderer.
-    /// that they can be read back to CPU.
+    pub debug_flags: RenderDebugFlags,
-    ///
+}
-    /// This is a debugging feature that may reduce performance. It primarily
+
-    /// exists for the `occlusion_culling` example.
+bitflags! {
-    pub allow_copies_from_indirect_parameters: bool,
+    /// Debugging flags that can optionally be set when constructing the renderer.
    #[derive(Clone, Copy, PartialEq, Default, Debug)]
    pub struct RenderDebugFlags: u8 {
        /// If true, this sets the `COPY_SRC` flag on indirect draw parameters
        /// so that they can be read back to CPU.
        ///
        /// This is a debugging feature that may reduce performance. It
        /// primarily exists for the `occlusion_culling` example.
        const ALLOW_COPIES_FROM_INDIRECT_PARAMETERS = 1;
    }
 }
 /// The system sets of the default [`App`] rendering schedule.
@ -159,6 +169,9 @@ pub enum RenderSet {
    Prepare,
    /// A sub-set within [`Prepare`](RenderSet::Prepare) for initializing buffers, textures and uniforms for use in bind groups.
    PrepareResources,
    /// Collect phase buffers after
    /// [`PrepareResources`](RenderSet::PrepareResources) has run.
    PrepareResourcesCollectPhaseBuffers,
    /// Flush buffers after [`PrepareResources`](RenderSet::PrepareResources), but before [`PrepareBindGroups`](RenderSet::PrepareBindGroups).
    PrepareResourcesFlush,
    /// A sub-set within [`Prepare`](RenderSet::Prepare) for constructing bind groups, or other data that relies on render resources prepared in [`PrepareResources`](RenderSet::PrepareResources).
@ -210,7 +223,12 @@ impl Render {
                .after(prepare_assets::<RenderMesh>),
        );
        schedule.configure_sets(
-            (PrepareResources, PrepareResourcesFlush, PrepareBindGroups)
+            (
                PrepareResources,
                PrepareResourcesCollectPhaseBuffers,
                PrepareResourcesFlush,
                PrepareBindGroups,
            )
                .chain()
                .in_set(Prepare),
        );
@ -380,7 +398,7 @@ impl Plugin for RenderPlugin {
            GlobalsPlugin,
            MorphPlugin,
            BatchingPlugin {
-                allow_copies_from_indirect_parameters: self.allow_copies_from_indirect_parameters,
+                debug_flags: self.debug_flags,
            },
            SyncWorldPlugin,
            StoragePlugin,
--- a/crates/bevy_render/src/render_phase/mod.rs
+++ b/crates/bevy_render/src/render_phase/mod.rs
@ -43,10 +43,14 @@ use nonmax::NonMaxU32;
 pub use rangefinder::*;
 use wgpu::Features;
-use crate::batching::gpu_preprocessing::{GpuPreprocessingMode, GpuPreprocessingSupport};
+use crate::batching::gpu_preprocessing::{
    GpuPreprocessingMode, GpuPreprocessingSupport, PhaseBatchedInstanceBuffers,
    PhaseIndirectParametersBuffers,
 };
 use crate::renderer::RenderDevice;
 use crate::sync_world::{MainEntity, MainEntityHashMap};
 use crate::view::RetainedViewEntity;
 use crate::RenderDebugFlags;
 use crate::{
    batching::{
        self,
@ -1011,18 +1015,26 @@ impl UnbatchableBinnedEntityIndexSet {
 ///
 /// This is the version used when the pipeline supports GPU preprocessing: e.g.
 /// 3D PBR meshes.
-pub struct BinnedRenderPhasePlugin<BPI, GFBD>(PhantomData<(BPI, GFBD)>)
+pub struct BinnedRenderPhasePlugin<BPI, GFBD>
 where
    BPI: BinnedPhaseItem,
    GFBD: GetFullBatchData;
 impl<BPI, GFBD> Default for BinnedRenderPhasePlugin<BPI, GFBD>
 where
    BPI: BinnedPhaseItem,
    GFBD: GetFullBatchData,
 {
-    fn default() -> Self {
+    /// Debugging flags that can optionally be set when constructing the renderer.
-        Self(PhantomData)
+    pub debug_flags: RenderDebugFlags,
    phantom: PhantomData<(BPI, GFBD)>,
 }
 impl<BPI: BinnedPhaseItem, GFBD: GetFullBatchData> BinnedRenderPhasePlugin<BPI, GFBD> {
    /// Creates the plugin with the given renderer debugging flags.
    ///
    /// `debug_flags` is stored as-is in the plugin's `debug_flags` field; the
    /// `phantom` field is a zero-sized marker with no runtime data.
    pub fn new(debug_flags: RenderDebugFlags) -> Self {
        let phantom = PhantomData;
        Self {
            debug_flags,
            phantom,
        }
    }
 }
@ -1038,6 +1050,11 @@ where
        render_app
            .init_resource::<ViewBinnedRenderPhases<BPI>>()
            .init_resource::<PhaseBatchedInstanceBuffers<BPI, GFBD::BufferData>>()
            .insert_resource(PhaseIndirectParametersBuffers::<BPI>::new(
                self.debug_flags
                    .contains(RenderDebugFlags::ALLOW_COPIES_FROM_INDIRECT_PARAMETERS),
            ))
            .add_systems(
                Render,
                (
@ -1054,6 +1071,13 @@ where
                    )
                        .in_set(RenderSet::PrepareResources),
                    sweep_old_entities::<BPI>.in_set(RenderSet::QueueSweep),
                    gpu_preprocessing::collect_buffers_for_phase::<BPI, GFBD>
                        .run_if(
                            resource_exists::<
                                BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>,
                            >,
                        )
                        .in_set(RenderSet::PrepareResourcesCollectPhaseBuffers),
                ),
            );
    }
@ -1097,18 +1121,26 @@ where
 ///
 /// This is the version used when the pipeline supports GPU preprocessing: e.g.
 /// 3D PBR meshes.
-pub struct SortedRenderPhasePlugin<SPI, GFBD>(PhantomData<(SPI, GFBD)>)
+pub struct SortedRenderPhasePlugin<SPI, GFBD>
 where
    SPI: SortedPhaseItem,
    GFBD: GetFullBatchData;
 impl<SPI, GFBD> Default for SortedRenderPhasePlugin<SPI, GFBD>
 where
    SPI: SortedPhaseItem,
    GFBD: GetFullBatchData,
 {
-    fn default() -> Self {
+    /// Debugging flags that can optionally be set when constructing the renderer.
-        Self(PhantomData)
+    pub debug_flags: RenderDebugFlags,
    phantom: PhantomData<(SPI, GFBD)>,
 }
 impl<SPI: SortedPhaseItem, GFBD: GetFullBatchData> SortedRenderPhasePlugin<SPI, GFBD> {
    /// Creates the plugin with the given renderer debugging flags.
    ///
    /// `debug_flags` is stored as-is in the plugin's `debug_flags` field; the
    /// `phantom` field is a zero-sized marker with no runtime data.
    pub fn new(debug_flags: RenderDebugFlags) -> Self {
        let phantom = PhantomData;
        Self {
            debug_flags,
            phantom,
        }
    }
 }
@ -1124,18 +1156,33 @@ where
        render_app
            .init_resource::<ViewSortedRenderPhases<SPI>>()
            .init_resource::<PhaseBatchedInstanceBuffers<SPI, GFBD::BufferData>>()
            .insert_resource(PhaseIndirectParametersBuffers::<SPI>::new(
                self.debug_flags
                    .contains(RenderDebugFlags::ALLOW_COPIES_FROM_INDIRECT_PARAMETERS),
            ))
            .add_systems(
                Render,
                (
-                    no_gpu_preprocessing::batch_and_prepare_sorted_render_phase::<SPI, GFBD>
+                    (
-                        .run_if(resource_exists::<BatchedInstanceBuffer<GFBD::BufferData>>),
+                        no_gpu_preprocessing::batch_and_prepare_sorted_render_phase::<SPI, GFBD>
-                    gpu_preprocessing::batch_and_prepare_sorted_render_phase::<SPI, GFBD>.run_if(
+                            .run_if(resource_exists::<BatchedInstanceBuffer<GFBD::BufferData>>),
-                        resource_exists::<
+                        gpu_preprocessing::batch_and_prepare_sorted_render_phase::<SPI, GFBD>
-                            BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>,
+                            .run_if(
-                        >,
+                                resource_exists::<
-                    ),
+                                    BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>,
-                )
+                                >,
-                    .in_set(RenderSet::PrepareResources),
+                            ),
                    )
                        .in_set(RenderSet::PrepareResources),
                    gpu_preprocessing::collect_buffers_for_phase::<SPI, GFBD>
                        .run_if(
                            resource_exists::<
                                BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>,
                            >,
                        )
                        .in_set(RenderSet::PrepareResourcesCollectPhaseBuffers),
                ),
            );
    }
 }
--- a/crates/bevy_sprite/src/mesh2d/mesh.rs
+++ b/crates/bevy_sprite/src/mesh2d/mesh.rs
@ -479,7 +479,7 @@ impl GetFullBatchData for Mesh2dPipeline {
        indexed: bool,
        base_output_index: u32,
        batch_set_index: Option<NonMaxU32>,
-        indirect_parameters_buffer: &mut bevy_render::batching::gpu_preprocessing::IndirectParametersBuffers,
+        indirect_parameters_buffer: &mut bevy_render::batching::gpu_preprocessing::UntypedPhaseIndirectParametersBuffers,
        indirect_parameters_offset: u32,
    ) {
        // Note that `IndirectParameters` covers both of these structures, even
--- a/examples/3d/occlusion_culling.rs
+++ b/examples/3d/occlusion_culling.rs
@ -6,6 +6,7 @@
 //! the effects of occlusion culling can be seen.
 use std::{
    any::TypeId,
    f32::consts::PI,
    fmt::Write as _,
    result::Result,
@ -15,9 +16,13 @@ use std::{
 use bevy::{
    color::palettes::css::{SILVER, WHITE},
    core_pipeline::{
-        core_3d::graph::{Core3d, Node3d},
+        core_3d::{
            graph::{Core3d, Node3d},
            Opaque3d,
        },
        prepass::DepthPrepass,
    },
    pbr::PbrPlugin,
    prelude::*,
    render::{
        batching::gpu_preprocessing::{
@ -29,7 +34,7 @@ use bevy::{
        render_resource::{Buffer, BufferDescriptor, BufferUsages, MapMode},
        renderer::{RenderAdapter, RenderContext, RenderDevice},
        settings::WgpuFeatures,
-        Render, RenderApp, RenderPlugin, RenderSet,
+        Render, RenderApp, RenderDebugFlags, RenderPlugin, RenderSet,
    },
 };
 use bytemuck::Pod;
@ -172,6 +177,8 @@ impl Default for AppStatus {
 }
 fn main() {
    let render_debug_flags = RenderDebugFlags::ALLOW_COPIES_FROM_INDIRECT_PARAMETERS;
    App::new()
        .add_plugins(
            DefaultPlugins
@ -183,7 +190,11 @@ fn main() {
                    ..default()
                })
                .set(RenderPlugin {
-                    allow_copies_from_indirect_parameters: true,
+                    debug_flags: render_debug_flags,
                    ..default()
                })
                .set(PbrPlugin {
                    debug_flags: render_debug_flags,
                    ..default()
                }),
        )
@ -421,6 +432,14 @@ impl render_graph::Node for ReadbackIndirectParametersNode {
            return Ok(());
        };
        // Get the indirect parameters buffers corresponding to the opaque 3D
        // phase, since all our meshes are in that phase.
        let Some(phase_indirect_parameters_buffers) =
            indirect_parameters_buffers.get(&TypeId::of::<Opaque3d>())
        else {
            return Ok(());
        };
        // Grab both the buffers we're copying from and the staging buffers
        // we're copying to. Remember that we can't map the indirect parameters
        // buffers directly, so we have to copy their contents to a staging
@ -431,8 +450,8 @@ impl render_graph::Node for ReadbackIndirectParametersNode {
            Some(indirect_parameters_staging_data_buffer),
            Some(indirect_parameters_staging_batch_sets_buffer),
        ) = (
-            indirect_parameters_buffers.indexed_data_buffer(),
+            phase_indirect_parameters_buffers.indexed_data_buffer(),
-            indirect_parameters_buffers.indexed_batch_sets_buffer(),
+            phase_indirect_parameters_buffers.indexed_batch_sets_buffer(),
            indirect_parameters_mapping_buffers.data.as_ref(),
            indirect_parameters_mapping_buffers.batch_sets.as_ref(),
        )
@ -474,10 +493,16 @@ fn create_indirect_parameters_staging_buffers(
    indirect_parameters_buffers: Res<IndirectParametersBuffers>,
    render_device: Res<RenderDevice>,
 ) {
    let Some(phase_indirect_parameters_buffers) =
        indirect_parameters_buffers.get(&TypeId::of::<Opaque3d>())
    else {
        return;
    };
    // Fetch the indirect parameters buffers that we're going to copy from.
    let (Some(indexed_data_buffer), Some(indexed_batch_set_buffer)) = (
-        indirect_parameters_buffers.indexed_data_buffer(),
+        phase_indirect_parameters_buffers.indexed_data_buffer(),
-        indirect_parameters_buffers.indexed_batch_sets_buffer(),
+        phase_indirect_parameters_buffers.indexed_batch_sets_buffer(),
    ) else {
        return;
    };
--- a/examples/shader/custom_render_phase.rs
+++ b/examples/shader/custom_render_phase.rs
@ -29,6 +29,7 @@ use bevy::{
        batching::{
            gpu_preprocessing::{
                batch_and_prepare_sorted_render_phase, IndirectParametersMetadata,
                UntypedPhaseIndirectParametersBuffers,
            },
            GetBatchData, GetFullBatchData,
        },
@ -435,7 +436,7 @@ impl GetFullBatchData for StencilPipeline {
        indexed: bool,
        base_output_index: u32,
        batch_set_index: Option<NonMaxU32>,
-        indirect_parameters_buffers: &mut bevy_render::batching::gpu_preprocessing::IndirectParametersBuffers,
+        indirect_parameters_buffers: &mut UntypedPhaseIndirectParametersBuffers,
        indirect_parameters_offset: u32,
    ) {
        // Note that `IndirectParameters` covers both of these structures, even
--- a/examples/shader/specialized_mesh_pipeline.rs
+++ b/examples/shader/specialized_mesh_pipeline.rs
@ -16,12 +16,12 @@ use bevy::{
    },
    prelude::*,
    render::{
        batching::GetFullBatchData,
        batching::{
            gpu_preprocessing::{
-                self, BatchedInstanceBuffers, IndirectParametersBuffers, PreprocessWorkItem,
+                self, PhaseBatchedInstanceBuffers, PhaseIndirectParametersBuffers,
                PreprocessWorkItem, UntypedPhaseBatchedInstanceBuffers,
            },
-            GetBatchData,
+            GetBatchData, GetFullBatchData,
        },
        experimental::occlusion_culling::OcclusionCulling,
        extract_component::{ExtractComponent, ExtractComponentPlugin},
@ -291,24 +291,21 @@ fn queue_custom_mesh_pipeline(
        Res<RenderMeshInstances>,
    ),
    param: StaticSystemParam<<MeshPipeline as GetBatchData>::Param>,
-    gpu_array_buffer: ResMut<
+    mut phase_batched_instance_buffers: ResMut<
-        BatchedInstanceBuffers<
+        PhaseBatchedInstanceBuffers<Opaque3d, <MeshPipeline as GetBatchData>::BufferData>,
            <MeshPipeline as GetBatchData>::BufferData,
            <MeshPipeline as GetFullBatchData>::BufferInputData,
        >,
    >,
-    mut indirect_parameters_buffers: ResMut<IndirectParametersBuffers>,
+    mut phase_indirect_parameters_buffers: ResMut<PhaseIndirectParametersBuffers<Opaque3d>>,
    mut change_tick: Local<Tick>,
 ) {
    let system_param_item = param.into_inner();
-    let BatchedInstanceBuffers {
+    let UntypedPhaseBatchedInstanceBuffers {
        ref mut data_buffer,
        ref mut work_item_buffers,
        ref mut late_indexed_indirect_parameters_buffer,
        ref mut late_non_indexed_indirect_parameters_buffer,
        ..
-    } = gpu_array_buffer.into_inner();
+    } = phase_batched_instance_buffers.buffers;
    // Get the id for our custom draw function
    let draw_function_id = opaque_draw_functions
@ -378,7 +375,8 @@ fn queue_custom_mesh_pipeline(
            // batch set.
            if mesh_batch_set_info.is_none() {
                mesh_batch_set_info = Some(MeshBatchSetInfo {
-                    indirect_parameters_index: indirect_parameters_buffers
+                    indirect_parameters_index: phase_indirect_parameters_buffers
                        .buffers
                        .allocate(mesh.indexed(), 1),
                    is_indexed: mesh.indexed(),
                });
@ -450,7 +448,8 @@ fn queue_custom_mesh_pipeline(
        // indirect parameters buffer, so that the renderer will end up
        // enqueuing a command to draw the mesh.
        if let Some(mesh_info) = mesh_batch_set_info {
-            indirect_parameters_buffers
+            phase_indirect_parameters_buffers
                .buffers
                .add_batch_set(mesh_info.is_indexed, mesh_info.indirect_parameters_index);
        }
    }