//! Batching functionality when GPU preprocessing is in use.

use bevy_app::{App, Plugin};
use bevy_derive::{Deref, DerefMut};
use bevy_ecs::{
    entity::Entity,
    query::{Has, With},
    schedule::IntoSystemConfigs as _,
    system::{Query, Res, ResMut, Resource, StaticSystemParam},
    world::{FromWorld, World},
};
use bevy_encase_derive::ShaderType;
use bevy_utils::EntityHashMap;
use bytemuck::{Pod, Zeroable};
use nonmax::NonMaxU32;
use smallvec::smallvec;
use wgpu::{BindingResource, BufferUsages, DownlevelFlags, Features};

use crate::{
    render_phase::{
        BinnedPhaseItem, BinnedRenderPhaseBatch, CachedRenderPipelinePhaseItem,
        PhaseItemExtraIndex, SortedPhaseItem, SortedRenderPhase, UnbatchableBinnedEntityIndices,
        ViewBinnedRenderPhases, ViewSortedRenderPhases,
    },
    render_resource::{BufferVec, GpuArrayBufferable, RawBufferVec, UninitBufferVec},
    renderer::{RenderAdapter, RenderDevice, RenderQueue},
    view::{GpuCulling, ViewTarget},
    Render, RenderApp, RenderSet,
};

use super::{BatchMeta, GetBatchData, GetFullBatchData};

pub struct BatchingPlugin;

impl Plugin for BatchingPlugin {
    fn build(&self, app: &mut App) {
        let Some(render_app) = app.get_sub_app_mut(RenderApp) else {
            return;
        };

        render_app
            .insert_resource(IndirectParametersBuffer::new())
            .add_systems(
                Render,
                write_indirect_parameters_buffer.in_set(RenderSet::PrepareResourcesFlush),
            );
    }

    fn finish(&self, app: &mut App) {
        let Some(render_app) = app.get_sub_app_mut(RenderApp) else {
            return;
        };

        render_app.init_resource::<GpuPreprocessingSupport>();
    }
}

/// Records whether GPU preprocessing and/or GPU culling are supported on the
/// device.
///
/// No GPU preprocessing is supported on WebGL because of the lack of compute
/// shader support. GPU preprocessing is supported on DirectX 12, but due to [a
/// `wgpu` limitation] GPU culling is not.
///
/// [a `wgpu` limitation]: https://github.com/gfx-rs/wgpu/issues/2471
#[derive(Clone, Copy, PartialEq, Resource)]
pub enum GpuPreprocessingSupport {
    /// No GPU preprocessing support is available at all.
    None,
    /// GPU preprocessing is available, but GPU culling isn't.
    PreprocessingOnly,
    /// Both GPU preprocessing and GPU culling are available.
    Culling,
}
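// A minimal sketch (hypothetical helper, not part of this module's API) of a
// run condition built on `GpuPreprocessingSupport`. Because the resource
// derives `PartialEq` and `Copy`, systems that require GPU culling can be
// gated on the detected support level; `bevy_ecs`'s `resource_equals`
// condition would work equally well.
#[allow(dead_code)]
fn gpu_culling_is_supported(support: Res<GpuPreprocessingSupport>) -> bool {
    // `GpuPreprocessingSupport` is `Copy`, so we can compare by value. Usage
    // sketch: `my_culling_system.run_if(gpu_culling_is_supported)`.
    *support == GpuPreprocessingSupport::Culling
}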
/// The GPU buffers holding the data needed to render batches.
///
/// For example, in the 3D PBR pipeline this holds `MeshUniform`s, which are
/// the `BD` type parameter in that mode.
///
/// We have a separate *buffer data input* type (`BDI`) here, which a compute
/// shader is expected to expand to the full buffer data (`BD`) type. GPU
/// uniform building is generally faster and uses less bus bandwidth between
/// system RAM and VRAM, but it's only implemented for some pipelines (for
/// example, not yet for the 2D pipeline) and only when compute shaders are
/// available.
#[derive(Resource)]
pub struct BatchedInstanceBuffers<BD, BDI>
where
    BD: GpuArrayBufferable + Sync + Send + 'static,
    BDI: Pod,
{
    /// A storage area for the buffer data that the GPU compute shader is
    /// expected to write to.
    ///
    /// There will be one entry for each index.
    pub data_buffer: UninitBufferVec<BD>,

    /// The index of the buffer data in the current input buffer that
    /// corresponds to each instance.
    ///
    /// This is keyed off each view. Each view has a separate buffer.
    pub work_item_buffers: EntityHashMap<Entity, PreprocessWorkItemBuffer>,

    /// The uniform data inputs for the current frame.
    ///
    /// These are uploaded during the extraction phase.
    pub current_input_buffer: RawBufferVec<BDI>,

    /// The uniform data inputs for the previous frame.
    ///
    /// The indices don't generally line up between `current_input_buffer`
    /// and `previous_input_buffer`, because, among other reasons, entities
    /// can spawn or despawn between frames. Instead, each current buffer
    /// data input uniform is expected to contain the index of the
    /// corresponding buffer data input uniform in this list.
    pub previous_input_buffer: RawBufferVec<BDI>,
}
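// A minimal sketch (illustrative only; Bevy's real `BDI` types, such as the
// mesh pipeline's input uniform, carry more fields) of a buffer data input
// type following the `previous_input_buffer` convention above: each
// current-frame input records the index of its previous-frame counterpart, so
// the preprocessing shader can look up last frame's data even though the two
// buffers aren't parallel arrays.
#[allow(dead_code)]
#[derive(Clone, Copy, Pod, Zeroable)]
#[repr(C)]
struct ExampleBufferDataInput {
    /// Per-instance data that the compute shader expands into the full `BD`
    /// type. A real pipeline would store transforms, flags, and so on here.
    payload: [f32; 4],
    /// The index of this entity's input uniform in `previous_input_buffer`
    /// on the previous frame.
    previous_input_index: u32,
    /// Explicit padding so the struct stays 16-byte aligned for GPU use.
    pad_a: u32,
    pad_b: u32,
    pad_c: u32,
}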
/// The buffer of GPU preprocessing work items for a single view.
pub struct PreprocessWorkItemBuffer {
    /// The buffer of work items.
    pub buffer: BufferVec<PreprocessWorkItem>,
    /// True if we're using GPU culling.
    pub gpu_culling: bool,
}

/// One invocation of the preprocessing shader: i.e. one mesh instance in a
/// view.
#[derive(Clone, Copy, Pod, Zeroable, ShaderType)]
#[repr(C)]
pub struct PreprocessWorkItem {
    /// The index of the batch input data in the input buffer that the shader
    /// reads from.
    pub input_index: u32,
    /// In direct mode, this is the index of the `MeshUniform` in the output
    /// buffer that we write to. In indirect mode, this is the index of the
    /// [`IndirectParameters`].
    pub output_index: u32,
}

/// The `wgpu` indirect parameters structure.
///
/// This is actually a union of the two following structures:
///
/// ```
/// #[repr(C)]
/// struct ArrayIndirectParameters {
///     vertex_count: u32,
///     instance_count: u32,
///     first_vertex: u32,
///     first_instance: u32,
/// }
///
/// #[repr(C)]
/// struct ElementIndirectParameters {
///     index_count: u32,
///     instance_count: u32,
///     first_vertex: u32,
///     base_vertex: u32,
///     first_instance: u32,
/// }
/// ```
///
/// We actually generally treat these two variants identically in code. To do
/// that, we make the following two observations:
///
/// 1. `instance_count` is in the same place in both structures. So we can
///    access it regardless of the structure we're looking at.
///
/// 2. The second structure is one word larger than the first. Thus we need to
///    pad out the first structure by one word in order to place both
///    structures in an array. If we pad out `ArrayIndirectParameters` by
///    copying the `first_instance` field into the padding, then the resulting
///    union structure will always have a read-only copy of `first_instance`
///    in the final word. We take advantage of this in the shader to reduce
///    branching.
#[derive(Clone, Copy, Pod, Zeroable, ShaderType)]
#[repr(C)]
pub struct IndirectParameters {
    /// For `ArrayIndirectParameters`, `vertex_count`; for
    /// `ElementIndirectParameters`, `index_count`.
    pub vertex_or_index_count: u32,

    /// The number of instances we're going to draw.
    ///
    /// This field is in the same place in both structures.
    pub instance_count: u32,

    /// The index of the first vertex we're to draw.
    pub first_vertex: u32,

    /// For `ArrayIndirectParameters`, `first_instance`; for
    /// `ElementIndirectParameters`, `base_vertex`.
    pub base_vertex_or_first_instance: u32,

    /// For `ArrayIndirectParameters`, this is padding; for
    /// `ElementIndirectParameters`, this is `first_instance`.
    ///
    /// Conventionally, we copy `first_instance` into this field when padding
    /// out `ArrayIndirectParameters`. That way, shader code can read this
    /// value at the same place, regardless of the specific structure this
    /// represents.
    pub first_instance: u32,
}
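// A test-only sketch illustrating the two observations above: the union type
// is the size of the larger (indexed) variant, and padding out
// `ArrayIndirectParameters` means duplicating `first_instance` into the final
// word. The numeric values are arbitrary.
#[cfg(test)]
mod indirect_parameters_layout_tests {
    use super::IndirectParameters;

    #[test]
    fn array_variant_pads_with_first_instance() {
        // Five 32-bit words, matching `ElementIndirectParameters`.
        assert_eq!(std::mem::size_of::<IndirectParameters>(), 20);

        // A non-indexed draw: 6 vertices, 10 instances, first instance 7.
        // `base_vertex_or_first_instance` holds `first_instance`, and per the
        // convention above the trailing padding word holds a copy, so shaders
        // can always read `first_instance` from the final word without
        // branching on the variant.
        let array_draw = IndirectParameters {
            vertex_or_index_count: 6,
            instance_count: 10,
            first_vertex: 0,
            base_vertex_or_first_instance: 7,
            first_instance: 7,
        };
        assert_eq!(
            array_draw.base_vertex_or_first_instance,
            array_draw.first_instance
        );
    }
}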
/// The buffer containing the list of [`IndirectParameters`], for draw
/// commands.
#[derive(Resource, Deref, DerefMut)]
pub struct IndirectParametersBuffer(pub BufferVec<IndirectParameters>);

impl IndirectParametersBuffer {
    /// Creates the indirect parameters buffer.
    pub fn new() -> IndirectParametersBuffer {
        IndirectParametersBuffer(BufferVec::new(
            BufferUsages::STORAGE | BufferUsages::INDIRECT,
        ))
    }
}

impl Default for IndirectParametersBuffer {
    fn default() -> Self {
        Self::new()
    }
}

impl FromWorld for GpuPreprocessingSupport {
    fn from_world(world: &mut World) -> Self {
        let adapter = world.resource::<RenderAdapter>();
        let device = world.resource::<RenderDevice>();

        if device.limits().max_compute_workgroup_size_x == 0 ||
            // Filter out lower-end/older devices on Android, as they crash
            // when GPU preprocessing is used.
            (cfg!(target_os = "android") && adapter.get_info().name.starts_with("Adreno (TM) 6"))
        {
            GpuPreprocessingSupport::None
        } else if !device
            .features()
            .contains(Features::INDIRECT_FIRST_INSTANCE) ||
            !adapter.get_downlevel_capabilities().flags.contains(
                DownlevelFlags::VERTEX_AND_INSTANCE_INDEX_RESPECTS_RESPECTIVE_FIRST_VALUE_IN_INDIRECT_DRAW,
            )
        {
            GpuPreprocessingSupport::PreprocessingOnly
        } else {
            GpuPreprocessingSupport::Culling
        }
    }
}

impl<BD, BDI> BatchedInstanceBuffers<BD, BDI>
where
    BD: GpuArrayBufferable + Sync + Send + 'static,
    BDI: Pod,
{
    /// Creates new buffers.
    pub fn new() -> Self {
        BatchedInstanceBuffers {
            data_buffer: UninitBufferVec::new(BufferUsages::STORAGE),
            work_item_buffers: EntityHashMap::default(),
            current_input_buffer: RawBufferVec::new(BufferUsages::STORAGE),
            previous_input_buffer: RawBufferVec::new(BufferUsages::STORAGE),
        }
    }

    /// Returns the binding of the buffer that contains the per-instance data.
    ///
    /// This buffer needs to be filled in via a compute shader.
    pub fn instance_data_binding(&self) -> Option<BindingResource> {
        self.data_buffer
            .buffer()
            .map(|buffer| buffer.as_entire_binding())
    }

    /// Clears out the buffers in preparation for a new frame.
    pub fn clear(&mut self) {
        self.data_buffer.clear();
        self.current_input_buffer.clear();
        self.previous_input_buffer.clear();

        for work_item_buffer in self.work_item_buffers.values_mut() {
            work_item_buffer.buffer.clear();
        }
    }
}

impl<BD, BDI> Default for BatchedInstanceBuffers<BD, BDI>
where
    BD: GpuArrayBufferable + Sync + Send + 'static,
    BDI: Pod,
{
    fn default() -> Self {
        Self::new()
    }
}

/// Information about a render batch that we're building up during a sorted
/// render phase.
struct SortedRenderBatch<F>
where
    F: GetBatchData,
{
    /// The index of the first phase item in this batch in the list of phase
    /// items.
    phase_item_start_index: u32,

    /// The index of the first instance in this batch in the instance buffer.
    instance_start_index: u32,

    /// The index of the indirect parameters for this batch in the
    /// [`IndirectParametersBuffer`].
    ///
    /// If CPU culling is being used, then this will be `None`.
    indirect_parameters_index: Option<NonMaxU32>,

    /// Metadata that can be used to determine whether an instance can be
    /// placed into this batch.
    ///
    /// If `None`, the item inside is unbatchable.
    meta: Option<BatchMeta<F::CompareData>>,
}

impl<F> SortedRenderBatch<F>
where
    F: GetBatchData,
{
    /// Finalizes this batch and updates the [`SortedRenderPhase`] with the
    /// appropriate indices.
    ///
    /// `instance_end_index` is the index of the last instance in this batch
    /// plus one.
    fn flush<I>(self, instance_end_index: u32, phase: &mut SortedRenderPhase<I>)
    where
        I: CachedRenderPipelinePhaseItem + SortedPhaseItem,
    {
        let (batch_range, batch_extra_index) =
            phase.items[self.phase_item_start_index as usize].batch_range_and_extra_index_mut();
        *batch_range = self.instance_start_index..instance_end_index;
        *batch_extra_index =
            PhaseItemExtraIndex::maybe_indirect_parameters_index(self.indirect_parameters_index);
    }
}
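// Worked example of `flush` (illustrative numbers): suppose a batch began at
// phase item 4 with `instance_start_index` 10, and phase items 5 and 6 were
// merged into it, each adding one instance, so the batch occupies instance
// slots 10, 11, and 12. Calling `flush(13, phase)` writes the range `10..13`
// (and the indirect parameters index, if any) into phase item 4 alone, which
// then issues one draw covering all three instances; the merged items keep
// empty batch ranges and are skipped at draw time. The `13` is exactly what
// the batching loop below passes: the output index of the first instance that
// didn't fit, or the data buffer length at the end of the phase.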
/// A system that runs early in extraction and clears out all the
/// [`BatchedInstanceBuffers`] for the frame.
///
/// We have to run this during extraction because, if GPU preprocessing is in
/// use, the extraction phase will write to the mesh input uniform buffers
/// directly, so the buffers need to be cleared before then.
pub fn clear_batched_gpu_instance_buffers<GFBD>(
    gpu_batched_instance_buffers: Option<
        ResMut<BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>>,
    >,
) where
    GFBD: GetFullBatchData,
{
    if let Some(mut gpu_batched_instance_buffers) = gpu_batched_instance_buffers {
        gpu_batched_instance_buffers.clear();
    }
}

/// A system that removes GPU preprocessing work item buffers that correspond
/// to deleted [`ViewTarget`]s.
///
/// This is a separate system from [`clear_batched_gpu_instance_buffers`]
/// because [`ViewTarget`]s aren't created until after the extraction phase is
/// completed.
pub fn delete_old_work_item_buffers<GFBD>(
    mut gpu_batched_instance_buffers: ResMut<
        BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>,
    >,
    view_targets: Query<Entity, With<ViewTarget>>,
) where
    GFBD: GetFullBatchData,
{
    gpu_batched_instance_buffers
        .work_item_buffers
        .retain(|entity, _| view_targets.contains(*entity));
}

/// Batch the items in a sorted render phase, when GPU instance buffer building
/// is in use. This means comparing the metadata needed to draw each phase item
/// and trying to combine the draws into a batch.
pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
    gpu_array_buffer: ResMut<BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>>,
    mut indirect_parameters_buffer: ResMut<IndirectParametersBuffer>,
    mut sorted_render_phases: ResMut<ViewSortedRenderPhases<I>>,
    mut views: Query<(Entity, Has<GpuCulling>)>,
    system_param_item: StaticSystemParam<GFBD::Param>,
) where
    I: CachedRenderPipelinePhaseItem + SortedPhaseItem,
    GFBD: GetFullBatchData,
{
    // We only process GPU-built batch data in this function.
    let BatchedInstanceBuffers {
        ref mut data_buffer,
        ref mut work_item_buffers,
        ..
    } = gpu_array_buffer.into_inner();

    for (view, gpu_culling) in &mut views {
        let Some(phase) = sorted_render_phases.get_mut(&view) else {
            continue;
        };

        // Create the work item buffer if necessary.
        let work_item_buffer =
            work_item_buffers
                .entry(view)
                .or_insert_with(|| PreprocessWorkItemBuffer {
                    buffer: BufferVec::new(BufferUsages::STORAGE),
                    gpu_culling,
                });

        // Walk through the list of phase items, building up batches as we go.
        let mut batch: Option<SortedRenderBatch<GFBD>> = None;
        for current_index in 0..phase.items.len() {
            // Get the index of the input data, and comparison metadata, for
            // this entity.
            let current_batch_input_index = GFBD::get_index_and_compare_data(
                &system_param_item,
                phase.items[current_index].entity(),
            );

            // Unpack that index and metadata. Note that it's possible for the
            // index and/or the metadata to not be present, which signifies
            // that this entity is unbatchable. In that case, we break the
            // batch here.
            let (mut current_input_index, mut current_meta) = (None, None);
            if let Some((input_index, maybe_meta)) = current_batch_input_index {
                current_input_index = Some(input_index);
                current_meta =
                    maybe_meta.map(|meta| BatchMeta::new(&phase.items[current_index], meta));
            }

            // Determine if this entity can be included in the batch we're
            // building up.
            let can_batch = batch.as_ref().is_some_and(|batch| {
                // `None` for the metadata indicates that the items are
                // unbatchable.
                match (&current_meta, &batch.meta) {
                    (Some(current_meta), Some(batch_meta)) => current_meta == batch_meta,
                    (_, _) => false,
                }
            });

            // Make space in the data buffer for this instance.
            let current_entity = phase.items[current_index].entity();
            let output_index = data_buffer.add() as u32;

            // If we can't batch, break the existing batch and make a new one.
            if !can_batch {
                // Break a batch if we need to.
                if let Some(batch) = batch.take() {
                    batch.flush(output_index, phase);
                }

                // Start a new batch.
                let indirect_parameters_index = if gpu_culling {
                    GFBD::get_batch_indirect_parameters_index(
                        &system_param_item,
                        &mut indirect_parameters_buffer,
                        current_entity,
                        output_index,
                    )
                } else {
                    None
                };

                batch = Some(SortedRenderBatch {
                    phase_item_start_index: current_index as u32,
                    instance_start_index: output_index,
                    indirect_parameters_index,
                    meta: current_meta,
                });
            }

            // Add a new preprocessing work item so that the preprocessing
            // shader will copy the per-instance data over.
            if let (Some(batch), Some(input_index)) =
                (batch.as_ref(), current_input_index.as_ref())
            {
                work_item_buffer.buffer.push(PreprocessWorkItem {
                    input_index: (*input_index).into(),
                    output_index: match batch.indirect_parameters_index {
                        Some(indirect_parameters_index) => indirect_parameters_index.into(),
                        None => output_index,
                    },
                });
            }
        }

        // Flush the final batch if necessary.
        if let Some(batch) = batch.take() {
            batch.flush(data_buffer.len() as u32, phase);
        }
    }
}
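// A minimal sketch (hypothetical helper, not used by the code above) naming
// the output-index rule that both batching loops in this module implement: in
// indirect mode, a work item's `output_index` addresses the batch's
// `IndirectParameters` slot; in direct mode, it addresses the instance's own
// slot in the data buffer.
#[allow(dead_code)]
fn work_item_output_index(
    indirect_parameters_index: Option<NonMaxU32>,
    instance_output_index: u32,
) -> u32 {
    match indirect_parameters_index {
        // Indirect mode: the work item points at the batch's indirect
        // parameters, so the preprocessing shader can account for each
        // surviving instance there.
        Some(indirect_parameters_index) => indirect_parameters_index.into(),
        // Direct mode: write straight to the instance's output slot.
        None => instance_output_index,
    }
}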
/// Creates batches for a render phase that uses bins.
pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
    gpu_array_buffer: ResMut<BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>>,
    mut indirect_parameters_buffer: ResMut<IndirectParametersBuffer>,
    mut binned_render_phases: ResMut<ViewBinnedRenderPhases<BPI>>,
    mut views: Query<(Entity, Has<GpuCulling>)>,
    param: StaticSystemParam<GFBD::Param>,
) where
    BPI: BinnedPhaseItem,
    GFBD: GetFullBatchData,
{
    let system_param_item = param.into_inner();

    let BatchedInstanceBuffers {
        ref mut data_buffer,
        ref mut work_item_buffers,
        ..
    } = gpu_array_buffer.into_inner();

    for (view, gpu_culling) in &mut views {
        let Some(phase) = binned_render_phases.get_mut(&view) else {
            continue;
        };

        // Create the work item buffer if necessary.
        let work_item_buffer =
            work_item_buffers
                .entry(view)
                .or_insert_with(|| PreprocessWorkItemBuffer {
                    buffer: BufferVec::new(BufferUsages::STORAGE),
                    gpu_culling,
                });

        // Prepare batchables.
        for key in &phase.batchable_keys {
            let mut batch: Option<BinnedRenderPhaseBatch> = None;
            for &entity in &phase.batchable_values[key] {
                let Some(input_index) = GFBD::get_binned_index(&system_param_item, entity) else {
                    continue;
                };
                let output_index = data_buffer.add() as u32;

                match batch {
                    Some(ref mut batch) => {
                        // Append to the current batch.
                        batch.instance_range.end = output_index + 1;
                        work_item_buffer.buffer.push(PreprocessWorkItem {
                            input_index: input_index.into(),
                            output_index: batch
                                .extra_index
                                .as_indirect_parameters_index()
                                .unwrap_or(output_index),
                        });
                    }

                    None if gpu_culling => {
                        // Start a new batch, in indirect mode.
                        let indirect_parameters_index =
                            GFBD::get_batch_indirect_parameters_index(
                                &system_param_item,
                                &mut indirect_parameters_buffer,
                                entity,
                                output_index,
                            );
                        work_item_buffer.buffer.push(PreprocessWorkItem {
                            input_index: input_index.into(),
                            output_index: indirect_parameters_index.unwrap_or_default().into(),
                        });
                        batch = Some(BinnedRenderPhaseBatch {
                            representative_entity: entity,
                            instance_range: output_index..output_index + 1,
                            extra_index: PhaseItemExtraIndex::maybe_indirect_parameters_index(
                                indirect_parameters_index,
                            ),
                        });
                    }

                    None => {
                        // Start a new batch, in direct mode.
                        work_item_buffer.buffer.push(PreprocessWorkItem {
                            input_index: input_index.into(),
                            output_index,
                        });
                        batch = Some(BinnedRenderPhaseBatch {
                            representative_entity: entity,
                            instance_range: output_index..output_index + 1,
                            extra_index: PhaseItemExtraIndex::NONE,
                        });
                    }
                }
            }

            if let Some(batch) = batch {
                phase.batch_sets.push(smallvec![batch]);
            }
        }

        // Prepare unbatchables.
        for key in &phase.unbatchable_keys {
            let unbatchables = phase.unbatchable_values.get_mut(key).unwrap();
            for &entity in &unbatchables.entities {
                let Some(input_index) = GFBD::get_binned_index(&system_param_item, entity) else {
                    continue;
                };
                let output_index = data_buffer.add() as u32;

                if gpu_culling {
                    let indirect_parameters_index = GFBD::get_batch_indirect_parameters_index(
                        &system_param_item,
                        &mut indirect_parameters_buffer,
                        entity,
                        output_index,
                    )
                    .unwrap_or_default();
                    work_item_buffer.buffer.push(PreprocessWorkItem {
                        input_index: input_index.into(),
                        output_index: indirect_parameters_index.into(),
                    });
                    unbatchables
                        .buffer_indices
                        .add(UnbatchableBinnedEntityIndices {
                            instance_index: indirect_parameters_index.into(),
                            extra_index: PhaseItemExtraIndex::indirect_parameters_index(
                                indirect_parameters_index.into(),
                            ),
                        });
                } else {
                    work_item_buffer.buffer.push(PreprocessWorkItem {
                        input_index: input_index.into(),
                        output_index,
                    });
                    unbatchables
                        .buffer_indices
                        .add(UnbatchableBinnedEntityIndices {
                            instance_index: output_index,
                            extra_index: PhaseItemExtraIndex::NONE,
                        });
                }
            }
        }
    }
}

/// A system that writes all instance buffers to the GPU.
pub fn write_batched_instance_buffers<GFBD>(
    render_device: Res<RenderDevice>,
    render_queue: Res<RenderQueue>,
    gpu_array_buffer: ResMut<BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>>,
) where
    GFBD: GetFullBatchData,
{
    let BatchedInstanceBuffers {
        ref mut data_buffer,
        work_item_buffers: ref mut index_buffers,
        ref mut current_input_buffer,
        previous_input_buffer: _,
    } = gpu_array_buffer.into_inner();

    data_buffer.write_buffer(&render_device);
    current_input_buffer.write_buffer(&render_device, &render_queue);
    // There's no need to write `previous_input_buffer`, as we wrote that on
    // the previous frame, and it hasn't changed.

    for index_buffer in index_buffers.values_mut() {
        index_buffer
            .buffer
            .write_buffer(&render_device, &render_queue);
    }
}

/// A system that writes the indirect parameters buffer to the GPU and clears
/// its CPU-side staging area in preparation for the next frame.
pub fn write_indirect_parameters_buffer(
    render_device: Res<RenderDevice>,
    render_queue: Res<RenderQueue>,
    mut indirect_parameters_buffer: ResMut<IndirectParametersBuffer>,
) {
    indirect_parameters_buffer.write_buffer(&render_device, &render_queue);
    indirect_parameters_buffer.clear();
}
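// A test-only sketch of the CPU-side staging lifecycle that
// `write_indirect_parameters_buffer` completes each frame, assuming (as the
// batching code relies on) that `BufferVec::push` returns the new element's
// index: entries pushed during batching get the indices that indirect work
// items reference, the system uploads them, and `clear` resets the staging
// area for the next frame. No GPU device is needed until `write_buffer` runs.
#[cfg(test)]
mod indirect_parameters_buffer_tests {
    use super::{IndirectParameters, IndirectParametersBuffer};

    #[test]
    fn push_returns_sequential_indices_until_cleared() {
        let mut buffer = IndirectParametersBuffer::new();
        let params = IndirectParameters {
            vertex_or_index_count: 0,
            instance_count: 0,
            first_vertex: 0,
            base_vertex_or_first_instance: 0,
            first_instance: 0,
        };
        // Each push appends to the CPU-side staging area and yields the new
        // element's index.
        assert_eq!(buffer.push(params), 0);
        assert_eq!(buffer.push(params), 1);
        // After the per-frame clear, indices start over.
        buffer.clear();
        assert_eq!(buffer.push(params), 0);
    }
}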