bevy/crates/bevy_render/src/batching/gpu_preprocessing.rs
//! Batching functionality when GPU preprocessing is in use.
use bevy_app::{App, Plugin};
use bevy_derive::{Deref, DerefMut};
use bevy_ecs::{
entity::Entity,
query::{Has, With},
schedule::IntoSystemConfigs as _,
system::{Query, Res, ResMut, Resource, StaticSystemParam},
world::{FromWorld, World},
};
use bevy_encase_derive::ShaderType;
use bevy_utils::EntityHashMap;
use bytemuck::{Pod, Zeroable};
use nonmax::NonMaxU32;
use smallvec::smallvec;
use wgpu::{BindingResource, BufferUsages, DownlevelFlags, Features};
use crate::{
render_phase::{
BinnedPhaseItem, BinnedRenderPhaseBatch, CachedRenderPipelinePhaseItem,
PhaseItemExtraIndex, SortedPhaseItem, SortedRenderPhase, UnbatchableBinnedEntityIndices,
ViewBinnedRenderPhases, ViewSortedRenderPhases,
},
render_resource::{BufferVec, GpuArrayBufferable, RawBufferVec, UninitBufferVec},
renderer::{RenderAdapter, RenderDevice, RenderQueue},
view::{GpuCulling, ViewTarget},
Render, RenderApp, RenderSet,
};
use super::{BatchMeta, GetBatchData, GetFullBatchData};
pub struct BatchingPlugin;
impl Plugin for BatchingPlugin {
fn build(&self, app: &mut App) {
let Some(render_app) = app.get_sub_app_mut(RenderApp) else {
return;
};
render_app
.insert_resource(IndirectParametersBuffer::new())
.add_systems(
Render,
write_indirect_parameters_buffer.in_set(RenderSet::PrepareResourcesFlush),
);
}
fn finish(&self, app: &mut App) {
let Some(render_app) = app.get_sub_app_mut(RenderApp) else {
return;
};
render_app.init_resource::<GpuPreprocessingSupport>();
}
}
/// Records whether GPU preprocessing and/or GPU culling are supported on the
/// device.
///
/// No GPU preprocessing is supported on WebGL because of the lack of compute
/// shader support. GPU preprocessing is supported on DirectX 12, but due to [a
/// `wgpu` limitation] GPU culling is not.
///
/// [a `wgpu` limitation]: https://github.com/gfx-rs/wgpu/issues/2471
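///
/// A minimal usage sketch, with a hypothetical `describe` helper (not part
/// of this module) branching on the detected support level:
///
/// ```ignore
/// fn describe(support: GpuPreprocessingSupport) -> &'static str {
///     match support {
///         GpuPreprocessingSupport::None => "CPU batching only",
///         GpuPreprocessingSupport::PreprocessingOnly => "GPU preprocessing, CPU culling",
///         GpuPreprocessingSupport::Culling => "GPU preprocessing and GPU culling",
///     }
/// }
/// ```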
#[derive(Clone, Copy, PartialEq, Resource)]
pub enum GpuPreprocessingSupport {
/// No GPU preprocessing support is available at all.
None,
/// GPU preprocessing is available, but GPU culling isn't.
PreprocessingOnly,
/// Both GPU preprocessing and GPU culling are available.
Culling,
}
/// The GPU buffers holding the data needed to render batches.
///
/// For example, in the 3D PBR pipeline this holds `MeshUniform`s, which are the
/// `BD` type parameter in that mode.
///
/// We have a separate *buffer data input* type (`BDI`) here, which a compute
/// shader is expected to expand to the full buffer data (`BD`) type. GPU
/// uniform building is generally faster and consumes less system-RAM-to-VRAM
/// bus bandwidth, but it's only implemented for some pipelines (for example,
/// not for the 2D pipeline at present) and only when compute shaders are
/// available.
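///
/// A rough sketch of the `BDI` to `BD` relationship, with hypothetical
/// `MyInput` (`BDI`) and `MyUniform` (`BD`) types standing in for a real
/// pipeline's data:
///
/// ```ignore
/// // The CPU uploads only the small input records.
/// #[derive(Clone, Copy, bytemuck::Pod, bytemuck::Zeroable)]
/// #[repr(C)]
/// struct MyInput {
///     transform: [f32; 16],
///     // Index of the matching record in `previous_input_buffer`.
///     previous_input_index: u32,
/// }
///
/// // The preprocessing compute shader then expands each record into the
/// // full per-instance uniform (`MyUniform`) in `data_buffer`.
/// let buffers = BatchedInstanceBuffers::<MyUniform, MyInput>::new();
/// ```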
#[derive(Resource)]
pub struct BatchedInstanceBuffers<BD, BDI>
where
BD: GpuArrayBufferable + Sync + Send + 'static,
BDI: Pod,
{
/// A storage area for the buffer data that the GPU compute shader is
/// expected to write to.
///
/// There will be one entry for each mesh instance to be drawn.
pub data_buffer: UninitBufferVec<BD>,
/// The buffers of work items for each view, specifying for each instance
/// the index of its buffer data in the current input buffer.
///
/// This is keyed off each view. Each view has a separate buffer.
pub work_item_buffers: EntityHashMap<Entity, PreprocessWorkItemBuffer>,
/// The uniform data inputs for the current frame.
///
/// These are uploaded during the extraction phase.
pub current_input_buffer: RawBufferVec<BDI>,
/// The uniform data inputs for the previous frame.
///
/// The indices don't generally line up between `current_input_buffer`
/// and `previous_input_buffer`, because, among other reasons, entities
/// can spawn or despawn between frames. Instead, each current buffer
/// data input uniform is expected to contain the index of the
/// corresponding buffer data input uniform in this list.
pub previous_input_buffer: RawBufferVec<BDI>,
}
/// The buffer of GPU preprocessing work items for a single view.
pub struct PreprocessWorkItemBuffer {
/// The buffer of work items.
pub buffer: BufferVec<PreprocessWorkItem>,
/// True if we're using GPU culling.
pub gpu_culling: bool,
}
/// One invocation of the preprocessing shader: i.e. one mesh instance in a
/// view.
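///
/// A small illustrative sketch with hypothetical indices:
///
/// ```ignore
/// // Direct mode: expand input uniform 12 into output slot 3.
/// let item = PreprocessWorkItem { input_index: 12, output_index: 3 };
/// // Indirect mode: `output_index` would instead address the
/// // `IndirectParameters` entry for the batch.
/// ```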
#[derive(Clone, Copy, Pod, Zeroable, ShaderType)]
#[repr(C)]
pub struct PreprocessWorkItem {
/// The index of the batch input data in the input buffer that the shader
/// reads from.
pub input_index: u32,
/// In direct mode, this is the index of the `MeshUniform` in the output
/// buffer that we write to. In indirect mode, this is the index of the
/// [`IndirectParameters`].
pub output_index: u32,
}
/// The `wgpu` indirect parameters structure.
///
/// This is actually a union of the two following structures:
///
/// ```
/// #[repr(C)]
/// struct ArrayIndirectParameters {
/// vertex_count: u32,
/// instance_count: u32,
/// first_vertex: u32,
/// first_instance: u32,
/// }
///
/// #[repr(C)]
/// struct ElementIndirectParameters {
/// index_count: u32,
/// instance_count: u32,
/// first_vertex: u32,
/// base_vertex: u32,
/// first_instance: u32,
/// }
/// ```
///
/// In code, we generally treat these two variants identically. To do that,
/// we make the following two observations:
///
/// 1. `instance_count` is in the same place in both structures. So we can
/// access it regardless of the structure we're looking at.
///
/// 2. The second structure is one word larger than the first. Thus we need to
/// pad out the first structure by one word in order to place both structures in
/// an array. If we pad out `ArrayIndirectParameters` by copying the
/// `first_instance` field into the padding, then the resulting union structure
/// will always have a read-only copy of `first_instance` in the final word. We
/// take advantage of this in the shader to reduce branching.
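///
/// As an illustration of that convention, a non-indexed (array) draw with a
/// hypothetical `first_instance` of 7 would be encoded as:
///
/// ```ignore
/// let params = IndirectParameters {
///     vertex_or_index_count: 36,        // `vertex_count` in array mode
///     instance_count: 0,                // filled in later (e.g. by GPU culling)
///     first_vertex: 0,
///     base_vertex_or_first_instance: 7, // `first_instance` in array mode
///     first_instance: 7,                // copied into the padding word
/// };
/// ```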
#[derive(Clone, Copy, Pod, Zeroable, ShaderType)]
#[repr(C)]
pub struct IndirectParameters {
/// For `ArrayIndirectParameters`, `vertex_count`; for
/// `ElementIndirectParameters`, `index_count`.
pub vertex_or_index_count: u32,
/// The number of instances we're going to draw.
///
/// This field is in the same place in both structures.
pub instance_count: u32,
/// The index of the first vertex we're to draw.
pub first_vertex: u32,
/// For `ArrayIndirectParameters`, `first_instance`; for
/// `ElementIndirectParameters`, `base_vertex`.
pub base_vertex_or_first_instance: u32,
/// For `ArrayIndirectParameters`, this is padding; for
/// `ElementIndirectParameters`, this is `first_instance`.
///
/// Conventionally, we copy `first_instance` into this field when padding
/// out `ArrayIndirectParameters`. That way, shader code can read this value
/// at the same place, regardless of the specific structure this represents.
pub first_instance: u32,
}
/// The buffer containing the list of [`IndirectParameters`], for draw commands.
#[derive(Resource, Deref, DerefMut)]
pub struct IndirectParametersBuffer(pub BufferVec<IndirectParameters>);
impl IndirectParametersBuffer {
/// Creates the indirect parameters buffer.
pub fn new() -> IndirectParametersBuffer {
IndirectParametersBuffer(BufferVec::new(
BufferUsages::STORAGE | BufferUsages::INDIRECT,
))
}
}
impl Default for IndirectParametersBuffer {
fn default() -> Self {
Self::new()
}
}
impl FromWorld for GpuPreprocessingSupport {
fn from_world(world: &mut World) -> Self {
let adapter = world.resource::<RenderAdapter>();
let device = world.resource::<RenderDevice>();
if device.limits().max_compute_workgroup_size_x == 0 ||
// filter lower end / older devices on Android as they crash when using GPU preprocessing
(cfg!(target_os = "android") && adapter.get_info().name.starts_with("Adreno (TM) 6"))
{
GpuPreprocessingSupport::None
} else if !device
.features()
.contains(Features::INDIRECT_FIRST_INSTANCE) ||
!adapter.get_downlevel_capabilities().flags.contains(
DownlevelFlags::VERTEX_AND_INSTANCE_INDEX_RESPECTS_RESPECTIVE_FIRST_VALUE_IN_INDIRECT_DRAW)
{
GpuPreprocessingSupport::PreprocessingOnly
} else {
GpuPreprocessingSupport::Culling
}
}
}
impl<BD, BDI> BatchedInstanceBuffers<BD, BDI>
where
BD: GpuArrayBufferable + Sync + Send + 'static,
BDI: Pod,
{
/// Creates new buffers.
pub fn new() -> Self {
BatchedInstanceBuffers {
data_buffer: UninitBufferVec::new(BufferUsages::STORAGE),
work_item_buffers: EntityHashMap::default(),
current_input_buffer: RawBufferVec::new(BufferUsages::STORAGE),
previous_input_buffer: RawBufferVec::new(BufferUsages::STORAGE),
}
}
/// Returns the binding of the buffer that contains the per-instance data.
///
/// This buffer needs to be filled in via a compute shader.
pub fn instance_data_binding(&self) -> Option<BindingResource> {
self.data_buffer
.buffer()
.map(|buffer| buffer.as_entire_binding())
}
/// Clears out the buffers in preparation for a new frame.
pub fn clear(&mut self) {
self.data_buffer.clear();
self.current_input_buffer.clear();
self.previous_input_buffer.clear();
for work_item_buffer in self.work_item_buffers.values_mut() {
work_item_buffer.buffer.clear();
}
}
}
impl<BD, BDI> Default for BatchedInstanceBuffers<BD, BDI>
where
BD: GpuArrayBufferable + Sync + Send + 'static,
BDI: Pod,
{
fn default() -> Self {
Self::new()
}
}
/// Information about a render batch that we're building up during a sorted
/// render phase.
struct SortedRenderBatch<F>
where
F: GetBatchData,
{
/// The index of the first phase item in this batch in the list of phase
/// items.
phase_item_start_index: u32,
/// The index of the first instance in this batch in the instance buffer.
instance_start_index: u32,
/// The index of the indirect parameters for this batch in the
/// [`IndirectParametersBuffer`].
///
/// If CPU culling is being used, then this will be `None`.
indirect_parameters_index: Option<NonMaxU32>,
/// Metadata that can be used to determine whether an instance can be placed
/// into this batch.
///
/// If `None`, the item inside is unbatchable.
meta: Option<BatchMeta<F::CompareData>>,
}
impl<F> SortedRenderBatch<F>
where
F: GetBatchData,
{
/// Finalizes this batch and updates the [`SortedRenderPhase`] with the
/// appropriate indices.
///
/// `instance_end_index` is the index of the last instance in this batch
/// plus one.
fn flush<I>(self, instance_end_index: u32, phase: &mut SortedRenderPhase<I>)
where
I: CachedRenderPipelinePhaseItem + SortedPhaseItem,
{
let (batch_range, batch_extra_index) =
phase.items[self.phase_item_start_index as usize].batch_range_and_extra_index_mut();
*batch_range = self.instance_start_index..instance_end_index;
*batch_extra_index =
PhaseItemExtraIndex::maybe_indirect_parameters_index(self.indirect_parameters_index);
}
}
/// A system that runs early in extraction and clears out all the
/// [`BatchedInstanceBuffers`] for the frame.
///
/// We have to run this during extraction because, if GPU preprocessing is in
/// use, the extraction phase will write to the mesh input uniform buffers
/// directly, so the buffers need to be cleared before then.
pub fn clear_batched_gpu_instance_buffers<GFBD>(
gpu_batched_instance_buffers: Option<
ResMut<BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>>,
>,
) where
GFBD: GetFullBatchData,
{
if let Some(mut gpu_batched_instance_buffers) = gpu_batched_instance_buffers {
gpu_batched_instance_buffers.clear();
}
}
/// A system that removes GPU preprocessing work item buffers that correspond to
/// deleted [`ViewTarget`]s.
///
/// This is a separate system from [`clear_batched_gpu_instance_buffers`]
/// because [`ViewTarget`]s aren't created until after the extraction phase is
/// completed.
pub fn delete_old_work_item_buffers<GFBD>(
mut gpu_batched_instance_buffers: ResMut<
BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>,
>,
view_targets: Query<Entity, With<ViewTarget>>,
) where
GFBD: GetFullBatchData,
{
gpu_batched_instance_buffers
.work_item_buffers
.retain(|entity, _| view_targets.contains(*entity));
}
/// Batches the items in a sorted render phase, when GPU instance buffer
/// building is in use. This means comparing the metadata needed to draw each
/// phase item and trying to combine consecutive draws into batches.
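///
/// The core batching test, sketched (simplified from the loop body below):
///
/// ```ignore
/// let can_batch = match (&current_meta, &batch_meta) {
///     // Matching metadata: the current item can join the batch.
///     (Some(current), Some(previous)) => current == previous,
///     // `None` marks an unbatchable item: flush and start a new batch.
///     _ => false,
/// };
/// ```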
pub fn batch_and_prepare_sorted_render_phase<I, GFBD>(
gpu_array_buffer: ResMut<BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>>,
mut indirect_parameters_buffer: ResMut<IndirectParametersBuffer>,
mut sorted_render_phases: ResMut<ViewSortedRenderPhases<I>>,
mut views: Query<(Entity, Has<GpuCulling>)>,
system_param_item: StaticSystemParam<GFBD::Param>,
) where
I: CachedRenderPipelinePhaseItem + SortedPhaseItem,
GFBD: GetFullBatchData,
{
// We only process GPU-built batch data in this function.
let BatchedInstanceBuffers {
ref mut data_buffer,
ref mut work_item_buffers,
..
} = gpu_array_buffer.into_inner();
for (view, gpu_culling) in &mut views {
let Some(phase) = sorted_render_phases.get_mut(&view) else {
continue;
};
// Create the work item buffer if necessary.
let work_item_buffer =
work_item_buffers
.entry(view)
.or_insert_with(|| PreprocessWorkItemBuffer {
buffer: BufferVec::new(BufferUsages::STORAGE),
gpu_culling,
});
// Walk through the list of phase items, building up batches as we go.
let mut batch: Option<SortedRenderBatch<GFBD>> = None;
for current_index in 0..phase.items.len() {
// Get the index of the input data, and comparison metadata, for
// this entity.
let current_batch_input_index = GFBD::get_index_and_compare_data(
&system_param_item,
phase.items[current_index].entity(),
);
// Unpack that index and metadata. Note that it's possible for index
// and/or metadata to not be present, which signifies that this
// entity is unbatchable. In that case, we break the batch here.
let (mut current_input_index, mut current_meta) = (None, None);
if let Some((input_index, maybe_meta)) = current_batch_input_index {
current_input_index = Some(input_index);
current_meta =
maybe_meta.map(|meta| BatchMeta::new(&phase.items[current_index], meta));
}
// Determine if this entity can be included in the batch we're
// building up.
let can_batch = batch.as_ref().is_some_and(|batch| {
// `None` for metadata indicates that the items are unbatchable.
match (&current_meta, &batch.meta) {
(Some(current_meta), Some(batch_meta)) => current_meta == batch_meta,
(_, _) => false,
}
});
// Make space in the data buffer for this instance.
let current_entity = phase.items[current_index].entity();
let output_index = data_buffer.add() as u32;
// If we can't batch, break the existing batch and make a new one.
if !can_batch {
// Break a batch if we need to.
if let Some(batch) = batch.take() {
batch.flush(output_index, phase);
}
// Start a new batch.
let indirect_parameters_index = if gpu_culling {
GFBD::get_batch_indirect_parameters_index(
&system_param_item,
&mut indirect_parameters_buffer,
current_entity,
output_index,
)
} else {
None
};
batch = Some(SortedRenderBatch {
phase_item_start_index: current_index as u32,
instance_start_index: output_index,
indirect_parameters_index,
meta: current_meta,
});
}
// Add a new preprocessing work item so that the preprocessing
// shader will copy the per-instance data over.
if let (Some(batch), Some(input_index)) = (batch.as_ref(), current_input_index.as_ref())
{
work_item_buffer.buffer.push(PreprocessWorkItem {
input_index: (*input_index).into(),
output_index: match batch.indirect_parameters_index {
Some(indirect_parameters_index) => indirect_parameters_index.into(),
None => output_index,
},
});
}
}
// Flush the final batch if necessary.
if let Some(batch) = batch.take() {
batch.flush(data_buffer.len() as u32, phase);
}
}
}
/// Creates batches for a render phase that uses bins.
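///
/// Within a batchable bin, instances land at contiguous output indices, so a
/// batch reduces to a range. A simplified sketch of how that range grows
/// (`output_index` and `next_output_index` are hypothetical):
///
/// ```ignore
/// // The first entity in the bin starts the batch.
/// let mut instance_range = output_index..output_index + 1;
/// // Each later entity in the same bin extends it by one.
/// instance_range.end = next_output_index + 1;
/// ```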
pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
gpu_array_buffer: ResMut<BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>>,
mut indirect_parameters_buffer: ResMut<IndirectParametersBuffer>,
mut binned_render_phases: ResMut<ViewBinnedRenderPhases<BPI>>,
mut views: Query<(Entity, Has<GpuCulling>)>,
param: StaticSystemParam<GFBD::Param>,
) where
BPI: BinnedPhaseItem,
GFBD: GetFullBatchData,
{
let system_param_item = param.into_inner();
let BatchedInstanceBuffers {
ref mut data_buffer,
ref mut work_item_buffers,
..
} = gpu_array_buffer.into_inner();
for (view, gpu_culling) in &mut views {
let Some(phase) = binned_render_phases.get_mut(&view) else {
continue;
};
// Create the work item buffer if necessary.
let work_item_buffer =
work_item_buffers
.entry(view)
.or_insert_with(|| PreprocessWorkItemBuffer {
buffer: BufferVec::new(BufferUsages::STORAGE),
gpu_culling,
});
// Prepare batchables.
for key in &phase.batchable_keys {
let mut batch: Option<BinnedRenderPhaseBatch> = None;
for &entity in &phase.batchable_values[key] {
let Some(input_index) = GFBD::get_binned_index(&system_param_item, entity) else {
continue;
};
let output_index = data_buffer.add() as u32;
match batch {
Some(ref mut batch) => {
batch.instance_range.end = output_index + 1;
work_item_buffer.buffer.push(PreprocessWorkItem {
input_index: input_index.into(),
output_index: batch
.extra_index
.as_indirect_parameters_index()
.unwrap_or(output_index),
});
}
None if gpu_culling => {
let indirect_parameters_index = GFBD::get_batch_indirect_parameters_index(
&system_param_item,
&mut indirect_parameters_buffer,
entity,
output_index,
);
work_item_buffer.buffer.push(PreprocessWorkItem {
input_index: input_index.into(),
output_index: indirect_parameters_index.unwrap_or_default().into(),
});
batch = Some(BinnedRenderPhaseBatch {
representative_entity: entity,
instance_range: output_index..output_index + 1,
extra_index: PhaseItemExtraIndex::maybe_indirect_parameters_index(
indirect_parameters_index,
),
});
}
None => {
work_item_buffer.buffer.push(PreprocessWorkItem {
input_index: input_index.into(),
output_index,
});
batch = Some(BinnedRenderPhaseBatch {
representative_entity: entity,
instance_range: output_index..output_index + 1,
extra_index: PhaseItemExtraIndex::NONE,
});
}
}
}
if let Some(batch) = batch {
phase.batch_sets.push(smallvec![batch]);
}
}
// Prepare unbatchables.
for key in &phase.unbatchable_keys {
let unbatchables = phase.unbatchable_values.get_mut(key).unwrap();
for &entity in &unbatchables.entities {
let Some(input_index) = GFBD::get_binned_index(&system_param_item, entity) else {
continue;
};
let output_index = data_buffer.add() as u32;
if gpu_culling {
let indirect_parameters_index = GFBD::get_batch_indirect_parameters_index(
&system_param_item,
&mut indirect_parameters_buffer,
entity,
output_index,
)
.unwrap_or_default();
work_item_buffer.buffer.push(PreprocessWorkItem {
input_index: input_index.into(),
output_index: indirect_parameters_index.into(),
});
unbatchables
.buffer_indices
.add(UnbatchableBinnedEntityIndices {
instance_index: indirect_parameters_index.into(),
extra_index: PhaseItemExtraIndex::indirect_parameters_index(
indirect_parameters_index.into(),
),
});
} else {
work_item_buffer.buffer.push(PreprocessWorkItem {
input_index: input_index.into(),
output_index,
});
unbatchables
.buffer_indices
.add(UnbatchableBinnedEntityIndices {
instance_index: output_index,
extra_index: PhaseItemExtraIndex::NONE,
});
}
}
}
}
}
/// A system that writes all instance buffers to the GPU.
pub fn write_batched_instance_buffers<GFBD>(
render_device: Res<RenderDevice>,
render_queue: Res<RenderQueue>,
gpu_array_buffer: ResMut<BatchedInstanceBuffers<GFBD::BufferData, GFBD::BufferInputData>>,
) where
GFBD: GetFullBatchData,
{
let BatchedInstanceBuffers {
ref mut data_buffer,
work_item_buffers: ref mut index_buffers,
ref mut current_input_buffer,
previous_input_buffer: _,
} = gpu_array_buffer.into_inner();
data_buffer.write_buffer(&render_device);
current_input_buffer.write_buffer(&render_device, &render_queue);
// There's no need to write `previous_input_buffer`, as we wrote
// that on the previous frame, and it hasn't changed.
for index_buffer in index_buffers.values_mut() {
index_buffer
.buffer
.write_buffer(&render_device, &render_queue);
}
}
pub fn write_indirect_parameters_buffer(
render_device: Res<RenderDevice>,
render_queue: Res<RenderQueue>,
mut indirect_parameters_buffer: ResMut<IndirectParametersBuffer>,
) {
indirect_parameters_buffer.write_buffer(&render_device, &render_queue);
indirect_parameters_buffer.clear();
}