From bc342169293187cb1e9b59bb0f6a0872f457b5d1 Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Tue, 16 Jul 2024 13:33:15 -0700 Subject: [PATCH] Pack multiple vertex and index arrays together into growable buffers. (#14257) This commit uses the [`offset-allocator`] crate to combine vertex and index arrays from different meshes into single buffers. Since the primary source of `wgpu` overhead is validation and synchronization when switching buffers, this significantly improves Bevy's rendering performance on many scenes. This patch is a more flexible version of #13218, which also used slabs. Unlike #13218, which used slabs of a fixed size, this commit implements slabs that start small and can grow. In addition to reducing memory usage, supporting slab growth reduces the number of vertex and index buffer switches that need to happen during rendering, leading to improved performance. To prevent pathological fragmentation behavior, slabs are capped to a maximum size, and mesh arrays that are too large get their own dedicated slabs. As an additional improvement over #13218, this commit allows the application to customize all allocator heuristics. The `MeshAllocatorSettings` resource contains values that adjust the minimum and maximum slab sizes, the cutoff point at which meshes get their own dedicated slabs, and the rate at which slabs grow. Hopefully-sensible defaults have been chosen for each value. Unfortunately, WebGL 2 doesn't support the *base vertex* feature, which is necessary to pack vertex arrays from different meshes into the same buffer. `wgpu` represents this restriction as the downlevel flag `BASE_VERTEX`. This patch detects that bit and ensures that all vertex buffers get dedicated slabs on that platform. Even on WebGL 2, though, we can combine all *index* arrays into single buffers to reduce buffer changes, and we do so. 
The following measurements are on Bistro: Overall frame time improves from 8.74 ms to 5.53 ms (1.58x speedup): ![Screenshot 2024-07-09 163521](https://github.com/bevyengine/bevy/assets/157897/5d83c824-c0ee-434c-bbaf-218ff7212c48) Render system time improves from 6.57 ms to 3.54 ms (1.86x speedup): ![Screenshot 2024-07-09 163559](https://github.com/bevyengine/bevy/assets/157897/d94e2273-c3a0-496a-9f88-20d394129610) Opaque pass time improves from 4.64 ms to 2.33 ms (1.99x speedup): ![Screenshot 2024-07-09 163536](https://github.com/bevyengine/bevy/assets/157897/e4ef6e48-d60e-44ae-9a71-b9a731c99d9a) ## Migration Guide ### Changed * Vertex and index buffers for meshes may now be packed alongside other buffers, for performance. * `GpuMesh` has been renamed to `RenderMesh`, to reflect the fact that it no longer directly stores handles to GPU objects. * Because meshes no longer have their own vertex and index buffers, the responsibility for the buffers has moved from `GpuMesh` (now called `RenderMesh`) to the `MeshAllocator` resource. To access the vertex data for a mesh, use `MeshAllocator::mesh_vertex_slice`. To access the index data for a mesh, use `MeshAllocator::mesh_index_slice`. 
[`offset-allocator`]: https://github.com/pcwalton/offset-allocator --- crates/bevy_pbr/src/lightmap/mod.rs | 4 +- crates/bevy_pbr/src/material.rs | 4 +- crates/bevy_pbr/src/prepass/mod.rs | 4 +- crates/bevy_pbr/src/render/light.rs | 4 +- crates/bevy_pbr/src/render/mesh.rs | 85 +- crates/bevy_pbr/src/volumetric_fog/render.rs | 36 +- crates/bevy_render/Cargo.toml | 1 + .../src/batching/gpu_preprocessing.rs | 5 +- crates/bevy_render/src/lib.rs | 10 +- crates/bevy_render/src/mesh/allocator.rs | 1025 +++++++++++++++++ crates/bevy_render/src/mesh/mesh/mod.rs | 77 +- crates/bevy_render/src/mesh/mod.rs | 6 +- crates/bevy_render/src/render_asset.rs | 20 +- crates/bevy_sprite/src/mesh2d/material.rs | 4 +- crates/bevy_sprite/src/mesh2d/mesh.rs | 38 +- examples/2d/mesh2d_manual.rs | 4 +- examples/shader/shader_instancing.rs | 43 +- 17 files changed, 1245 insertions(+), 125 deletions(-) create mode 100644 crates/bevy_render/src/mesh/allocator.rs diff --git a/crates/bevy_pbr/src/lightmap/mod.rs b/crates/bevy_pbr/src/lightmap/mod.rs index fbb5ea2731..913f86a812 100644 --- a/crates/bevy_pbr/src/lightmap/mod.rs +++ b/crates/bevy_pbr/src/lightmap/mod.rs @@ -40,7 +40,7 @@ use bevy_ecs::{ }; use bevy_math::{uvec2, vec4, Rect, UVec2}; use bevy_reflect::{std_traits::ReflectDefault, Reflect}; -use bevy_render::mesh::GpuMesh; +use bevy_render::mesh::RenderMesh; use bevy_render::texture::GpuImage; use bevy_render::{ mesh::Mesh, render_asset::RenderAssets, render_resource::Shader, texture::Image, @@ -145,7 +145,7 @@ fn extract_lightmaps( lightmaps: Extract>, render_mesh_instances: Res, images: Res>, - meshes: Res>, + meshes: Res>, ) { // Clear out the old frame's data. 
render_lightmaps.render_lightmaps.clear(); diff --git a/crates/bevy_pbr/src/material.rs b/crates/bevy_pbr/src/material.rs index a1c07e7413..ba25fa30cd 100644 --- a/crates/bevy_pbr/src/material.rs +++ b/crates/bevy_pbr/src/material.rs @@ -25,7 +25,7 @@ use bevy_render::{ camera::TemporalJitter, extract_instances::{ExtractInstancesPlugin, ExtractedInstances}, extract_resource::ExtractResource, - mesh::{GpuMesh, MeshVertexBufferLayoutRef}, + mesh::{MeshVertexBufferLayoutRef, RenderMesh}, render_asset::{PrepareAssetError, RenderAsset, RenderAssetPlugin, RenderAssets}, render_phase::*, render_resource::*, @@ -537,7 +537,7 @@ pub fn queue_material_meshes( mut pipelines: ResMut>>, pipeline_cache: Res, msaa: Res, - render_meshes: Res>, + render_meshes: Res>, render_materials: Res>>, render_mesh_instances: Res, render_material_instances: Res>, diff --git a/crates/bevy_pbr/src/prepass/mod.rs b/crates/bevy_pbr/src/prepass/mod.rs index 0d5757a687..80c6b31121 100644 --- a/crates/bevy_pbr/src/prepass/mod.rs +++ b/crates/bevy_pbr/src/prepass/mod.rs @@ -1,6 +1,6 @@ mod prepass_bindings; -use bevy_render::mesh::{GpuMesh, MeshVertexBufferLayoutRef}; +use bevy_render::mesh::{MeshVertexBufferLayoutRef, RenderMesh}; use bevy_render::render_resource::binding_types::uniform_buffer; use bevy_render::view::WithMesh; pub use prepass_bindings::*; @@ -680,7 +680,7 @@ pub fn queue_prepass_material_meshes( mut pipelines: ResMut>>, pipeline_cache: Res, msaa: Res, - render_meshes: Res>, + render_meshes: Res>, render_mesh_instances: Res, render_materials: Res>>, render_material_instances: Res>, diff --git a/crates/bevy_pbr/src/render/light.rs b/crates/bevy_pbr/src/render/light.rs index 66ddd2b5e0..436cbb51fa 100644 --- a/crates/bevy_pbr/src/render/light.rs +++ b/crates/bevy_pbr/src/render/light.rs @@ -7,7 +7,7 @@ use bevy_ecs::{entity::EntityHashMap, system::lifetimeless::Read}; use bevy_math::{Mat4, UVec4, Vec2, Vec3, Vec3Swizzles, Vec4, Vec4Swizzles}; use bevy_render::{ 
diagnostic::RecordDiagnostics, - mesh::GpuMesh, + mesh::RenderMesh, primitives::{CascadesFrusta, CubemapFrusta, Frustum, HalfSpace}, render_asset::RenderAssets, render_graph::{Node, NodeRunError, RenderGraphContext}, @@ -1162,7 +1162,7 @@ pub fn prepare_lights( pub fn queue_shadows( shadow_draw_functions: Res>, prepass_pipeline: Res>, - render_meshes: Res>, + render_meshes: Res>, render_mesh_instances: Res, render_materials: Res>>, render_material_instances: Res>, diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index 8432cbd97d..6df3849cab 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -1,5 +1,6 @@ use std::mem; +use allocator::MeshAllocator; use bevy_asset::{load_internal_asset, AssetId}; use bevy_core_pipeline::{ core_3d::{AlphaMask3d, Opaque3d, Transmissive3d, Transparent3d, CORE_3D_DEPTH_FORMAT}, @@ -1209,7 +1210,8 @@ impl GetBatchData for MeshPipeline { type Param = ( SRes, SRes, - SRes>, + SRes>, + SRes, ); // The material bind group ID, the mesh ID, and the lightmap ID, // respectively. @@ -1218,7 +1220,7 @@ impl GetBatchData for MeshPipeline { type BufferData = MeshUniform; fn get_batch_data( - (mesh_instances, lightmaps, _): &SystemParamItem, + (mesh_instances, lightmaps, _, _): &SystemParamItem, entity: Entity, ) -> Option<(Self::BufferData, Option)> { let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else { @@ -1249,7 +1251,7 @@ impl GetFullBatchData for MeshPipeline { type BufferInputData = MeshInputUniform; fn get_index_and_compare_data( - (mesh_instances, lightmaps, _): &SystemParamItem, + (mesh_instances, lightmaps, _, _): &SystemParamItem, entity: Entity, ) -> Option<(NonMaxU32, Option)> { // This should only be called during GPU building. 
@@ -1275,7 +1277,7 @@ impl GetFullBatchData for MeshPipeline { } fn get_binned_batch_data( - (mesh_instances, lightmaps, _): &SystemParamItem, + (mesh_instances, lightmaps, _, _): &SystemParamItem, entity: Entity, ) -> Option { let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else { @@ -1294,7 +1296,7 @@ impl GetFullBatchData for MeshPipeline { } fn get_binned_index( - (mesh_instances, _, _): &SystemParamItem, + (mesh_instances, _, _, _): &SystemParamItem, entity: Entity, ) -> Option { // This should only be called during GPU building. @@ -1312,7 +1314,7 @@ impl GetFullBatchData for MeshPipeline { } fn get_batch_indirect_parameters_index( - (mesh_instances, _, meshes): &SystemParamItem, + (mesh_instances, _, meshes, mesh_allocator): &SystemParamItem, indirect_parameters_buffer: &mut IndirectParametersBuffer, entity: Entity, instance_index: u32, @@ -1320,6 +1322,7 @@ impl GetFullBatchData for MeshPipeline { get_batch_indirect_parameters_index( mesh_instances, meshes, + mesh_allocator, indirect_parameters_buffer, entity, instance_index, @@ -1332,7 +1335,8 @@ impl GetFullBatchData for MeshPipeline { /// parameters. fn get_batch_indirect_parameters_index( mesh_instances: &RenderMeshInstances, - meshes: &RenderAssets, + meshes: &RenderAssets, + mesh_allocator: &MeshAllocator, indirect_parameters_buffer: &mut IndirectParametersBuffer, entity: Entity, instance_index: u32, @@ -1348,24 +1352,29 @@ fn get_batch_indirect_parameters_index( let mesh_instance = mesh_instances.get(&entity)?; let mesh = meshes.get(mesh_instance.mesh_asset_id)?; + let vertex_buffer_slice = mesh_allocator.mesh_vertex_slice(&mesh_instance.mesh_asset_id)?; // Note that `IndirectParameters` covers both of these structures, even // though they actually have distinct layouts. See the comment above that // type for more information. let indirect_parameters = match mesh.buffer_info { - GpuBufferInfo::Indexed { + RenderMeshBufferInfo::Indexed { count: index_count, .. 
- } => IndirectParameters { - vertex_or_index_count: index_count, - instance_count: 0, - first_vertex: 0, - base_vertex_or_first_instance: 0, - first_instance: instance_index, - }, - GpuBufferInfo::NonIndexed => IndirectParameters { + } => { + let index_buffer_slice = + mesh_allocator.mesh_index_slice(&mesh_instance.mesh_asset_id)?; + IndirectParameters { + vertex_or_index_count: index_count, + instance_count: 0, + first_vertex_or_first_index: index_buffer_slice.range.start, + base_vertex_or_first_instance: vertex_buffer_slice.range.start, + first_instance: instance_index, + } + } + RenderMeshBufferInfo::NonIndexed => IndirectParameters { vertex_or_index_count: mesh.vertex_count, instance_count: 0, - first_vertex: 0, + first_vertex_or_first_index: vertex_buffer_slice.range.start, base_vertex_or_first_instance: instance_index, first_instance: instance_index, }, @@ -1945,7 +1954,7 @@ impl MeshBindGroups { self.morph_targets.clear(); self.lightmaps.clear(); } - /// Get the `BindGroup` for `GpuMesh` with given `handle_id` and lightmap + /// Get the `BindGroup` for `RenderMesh` with given `handle_id` and lightmap /// key `lightmap`. pub fn get( &self, @@ -1982,7 +1991,7 @@ impl MeshBindGroupPair { #[allow(clippy::too_many_arguments)] pub fn prepare_mesh_bind_group( - meshes: Res>, + meshes: Res>, images: Res>, mut groups: ResMut, mesh_pipeline: Res, @@ -2238,10 +2247,11 @@ impl RenderCommand

for SetMeshBindGroup { pub struct DrawMesh; impl RenderCommand

for DrawMesh { type Param = ( - SRes>, + SRes>, SRes, SRes, SRes, + SRes, Option>, ); type ViewQuery = Has; @@ -2251,7 +2261,14 @@ impl RenderCommand

for DrawMesh { item: &P, has_preprocess_bind_group: ROQueryItem, _item_query: Option<()>, - (meshes, mesh_instances, indirect_parameters_buffer, pipeline_cache, preprocess_pipelines): SystemParamItem<'w, '_, Self::Param>, + ( + meshes, + mesh_instances, + indirect_parameters_buffer, + pipeline_cache, + mesh_allocator, + preprocess_pipelines, + ): SystemParamItem<'w, '_, Self::Param>, pass: &mut TrackedRenderPass<'w>, ) -> RenderCommandResult { // If we're using GPU preprocessing, then we're dependent on that @@ -2268,6 +2285,7 @@ impl RenderCommand

for DrawMesh { let meshes = meshes.into_inner(); let mesh_instances = mesh_instances.into_inner(); let indirect_parameters_buffer = indirect_parameters_buffer.into_inner(); + let mesh_allocator = mesh_allocator.into_inner(); let Some(mesh_asset_id) = mesh_instances.mesh_asset_id(item.entity()) else { return RenderCommandResult::Failure; @@ -2275,6 +2293,9 @@ impl RenderCommand

for DrawMesh { let Some(gpu_mesh) = meshes.get(mesh_asset_id) else { return RenderCommandResult::Failure; }; + let Some(vertex_buffer_slice) = mesh_allocator.mesh_vertex_slice(&mesh_asset_id) else { + return RenderCommandResult::Failure; + }; // Calculate the indirect offset, and look up the buffer. let indirect_parameters = match item.extra_index().as_indirect_parameters_index() { @@ -2291,21 +2312,31 @@ impl RenderCommand

for DrawMesh { }, }; - pass.set_vertex_buffer(0, gpu_mesh.vertex_buffer.slice(..)); + pass.set_vertex_buffer(0, vertex_buffer_slice.buffer.slice(..)); let batch_range = item.batch_range(); // Draw either directly or indirectly, as appropriate. match &gpu_mesh.buffer_info { - GpuBufferInfo::Indexed { - buffer, + RenderMeshBufferInfo::Indexed { index_format, count, } => { - pass.set_index_buffer(buffer.slice(..), 0, *index_format); + let Some(index_buffer_slice) = mesh_allocator.mesh_index_slice(&mesh_asset_id) + else { + return RenderCommandResult::Failure; + }; + + pass.set_index_buffer(index_buffer_slice.buffer.slice(..), 0, *index_format); + match indirect_parameters { None => { - pass.draw_indexed(0..*count, 0, batch_range.clone()); + pass.draw_indexed( + index_buffer_slice.range.start + ..(index_buffer_slice.range.start + *count), + vertex_buffer_slice.range.start as i32, + batch_range.clone(), + ); } Some((indirect_parameters_offset, indirect_parameters_buffer)) => pass .draw_indexed_indirect( @@ -2314,7 +2345,7 @@ impl RenderCommand

for DrawMesh { ), } } - GpuBufferInfo::NonIndexed => match indirect_parameters { + RenderMeshBufferInfo::NonIndexed => match indirect_parameters { None => { pass.draw(0..gpu_mesh.vertex_count, batch_range.clone()); } diff --git a/crates/bevy_pbr/src/volumetric_fog/render.rs b/crates/bevy_pbr/src/volumetric_fog/render.rs index bb3a56dc03..1c2cfc788c 100644 --- a/crates/bevy_pbr/src/volumetric_fog/render.rs +++ b/crates/bevy_pbr/src/volumetric_fog/render.rs @@ -18,7 +18,9 @@ use bevy_ecs::{ }; use bevy_math::{vec4, Mat3A, Mat4, Vec3, Vec3A, Vec4, Vec4Swizzles as _}; use bevy_render::{ - mesh::{GpuBufferInfo, GpuMesh, Mesh, MeshVertexBufferLayoutRef}, + mesh::{ + allocator::MeshAllocator, Mesh, MeshVertexBufferLayoutRef, RenderMesh, RenderMeshBufferInfo, + }, render_asset::RenderAssets, render_graph::{NodeRunError, RenderGraphContext, ViewNode}, render_resource::{ @@ -329,6 +331,7 @@ impl ViewNode for VolumetricFogNode { let volumetric_lighting_uniform_buffers = world.resource::(); let image_assets = world.resource::>(); let msaa = world.resource::(); + let mesh_allocator = world.resource::(); // Fetch the uniform buffer and binding. let ( @@ -344,7 +347,7 @@ impl ViewNode for VolumetricFogNode { return Ok(()); }; - let gpu_meshes = world.resource::>(); + let render_meshes = world.resource::>(); for view_fog_volume in view_fog_volumes.iter() { // If the camera is outside the fog volume, pick the cube mesh; @@ -356,6 +359,11 @@ impl ViewNode for VolumetricFogNode { PLANE_MESH.clone() }; + let Some(vertex_buffer_slice) = mesh_allocator.mesh_vertex_slice(&mesh_handle.id()) + else { + continue; + }; + let density_image = view_fog_volume .density_texture .and_then(|density_texture| image_assets.get(density_texture)); @@ -370,7 +378,7 @@ impl ViewNode for VolumetricFogNode { // This should always succeed, but if the asset was unloaded don't // panic. 
- let Some(gpu_mesh) = gpu_meshes.get(&mesh_handle) else { + let Some(render_mesh) = render_meshes.get(&mesh_handle) else { return Ok(()); }; @@ -426,7 +434,7 @@ impl ViewNode for VolumetricFogNode { .command_encoder() .begin_render_pass(&render_pass_descriptor); - render_pass.set_vertex_buffer(0, *gpu_mesh.vertex_buffer.slice(..)); + render_pass.set_vertex_buffer(0, *vertex_buffer_slice.buffer.slice(..)); render_pass.set_pipeline(pipeline); render_pass.set_bind_group( 0, @@ -446,17 +454,23 @@ impl ViewNode for VolumetricFogNode { ); // Draw elements or arrays, as appropriate. - match &gpu_mesh.buffer_info { - GpuBufferInfo::Indexed { - buffer, + match &render_mesh.buffer_info { + RenderMeshBufferInfo::Indexed { index_format, count, } => { - render_pass.set_index_buffer(*buffer.slice(..), *index_format); + let Some(index_buffer_slice) = + mesh_allocator.mesh_index_slice(&mesh_handle.id()) + else { + continue; + }; + + render_pass + .set_index_buffer(*index_buffer_slice.buffer.slice(..), *index_format); render_pass.draw_indexed(0..*count, 0, 0..1); } - GpuBufferInfo::NonIndexed => { - render_pass.draw(0..gpu_mesh.vertex_count, 0..1); + RenderMeshBufferInfo::NonIndexed => { + render_pass.draw(0..render_mesh.vertex_count, 0..1); } } } @@ -584,7 +598,7 @@ pub fn prepare_volumetric_fog_pipelines( With, >, msaa: Res, - meshes: Res>, + meshes: Res>, ) { let plane_mesh = meshes.get(&PLANE_MESH).expect("Plane mesh not found!"); diff --git a/crates/bevy_render/Cargo.toml b/crates/bevy_render/Cargo.toml index 4da6baaf9d..a4554f0bc2 100644 --- a/crates/bevy_render/Cargo.toml +++ b/crates/bevy_render/Cargo.toml @@ -101,6 +101,7 @@ profiling = { version = "1", features = [ async-channel = "2.2.0" nonmax = "0.5" smallvec = { version = "1.11", features = ["const_new"] } +offset-allocator = "0.2" [target.'cfg(not(target_arch = "wasm32"))'.dependencies] # Omit the `glsl` feature in non-WebAssembly by default. 
diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs index 60794636b4..35ce9464df 100644 --- a/crates/bevy_render/src/batching/gpu_preprocessing.rs +++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs @@ -185,8 +185,9 @@ pub struct IndirectParameters { /// This field is in the same place in both structures. pub instance_count: u32, - /// The index of the first vertex we're to draw. - pub first_vertex: u32, + /// For `ArrayIndirectParameters`, `first_vertex`; for + /// `ElementIndirectParameters`, `first_index`. + pub first_vertex_or_first_index: u32, /// For `ArrayIndirectParameters`, `first_instance`; for /// `ElementIndirectParameters`, `base_vertex`. diff --git a/crates/bevy_render/src/lib.rs b/crates/bevy_render/src/lib.rs index 49f25530b1..d03a7a6e01 100644 --- a/crates/bevy_render/src/lib.rs +++ b/crates/bevy_render/src/lib.rs @@ -64,7 +64,7 @@ use globals::GlobalsPlugin; use render_asset::RenderAssetBytesPerFrame; use renderer::{RenderAdapter, RenderAdapterInfo, RenderDevice, RenderQueue}; -use crate::mesh::GpuMesh; +use crate::mesh::RenderMesh; use crate::renderer::WgpuWrapper; use crate::{ camera::CameraPlugin, @@ -115,7 +115,7 @@ pub enum RenderSet { /// Queue drawable entities as phase items in render phases ready for /// sorting (if necessary) Queue, - /// A sub-set within [`Queue`](RenderSet::Queue) where mesh entity queue systems are executed. Ensures `prepare_assets::` is completed. + /// A sub-set within [`Queue`](RenderSet::Queue) where mesh entity queue systems are executed. Ensures `prepare_assets::` is completed. 
QueueMeshes, // TODO: This could probably be moved in favor of a system ordering // abstraction in `Render` or `Queue` @@ -165,7 +165,11 @@ impl Render { ); schedule.configure_sets((ExtractCommands, PrepareAssets, Prepare).chain()); - schedule.configure_sets(QueueMeshes.in_set(Queue).after(prepare_assets::)); + schedule.configure_sets( + QueueMeshes + .in_set(Queue) + .after(prepare_assets::), + ); schedule.configure_sets( (PrepareResources, PrepareResourcesFlush, PrepareBindGroups) .chain() diff --git a/crates/bevy_render/src/mesh/allocator.rs b/crates/bevy_render/src/mesh/allocator.rs new file mode 100644 index 0000000000..218e19c475 --- /dev/null +++ b/crates/bevy_render/src/mesh/allocator.rs @@ -0,0 +1,1025 @@ +//! Manages mesh vertex and index buffers. + +use std::{ + borrow::Cow, + fmt::{self, Display, Formatter}, + iter, + ops::Range, + vec::Vec, +}; + +use bevy_app::{App, Plugin}; +use bevy_asset::AssetId; +use bevy_derive::{Deref, DerefMut}; +use bevy_ecs::{ + schedule::IntoSystemConfigs as _, + system::{Res, ResMut, Resource}, + world::{FromWorld, World}, +}; +use bevy_utils::{ + hashbrown::{HashMap, HashSet}, + tracing::error, +}; +use offset_allocator::{Allocation, Allocator}; +use wgpu::{ + util::BufferInitDescriptor, BufferDescriptor, BufferUsages, CommandEncoderDescriptor, + DownlevelFlags, COPY_BUFFER_ALIGNMENT, +}; + +use crate::{ + mesh::{Indices, Mesh, MeshVertexBufferLayouts, RenderMesh}, + render_asset::{prepare_assets, ExtractedAssets}, + render_resource::Buffer, + renderer::{RenderAdapter, RenderDevice, RenderQueue}, + Render, RenderApp, RenderSet, +}; + +/// A plugin that manages GPU memory for mesh data. +pub struct MeshAllocatorPlugin; + +/// Manages the assignment of mesh data to GPU buffers. +/// +/// The Bevy renderer tries to pack vertex and index data for multiple meshes +/// together so that multiple meshes can be drawn back-to-back without any +/// rebinding. This resource manages these buffers. 
+/// +/// Within each slab, or hardware buffer, the underlying allocation algorithm is +/// [`offset-allocator`], a Rust port of Sebastian Aaltonen's hard-real-time C++ +/// `OffsetAllocator`. Slabs start small and then grow as their contents fill +/// up, up to a maximum size limit. To reduce fragmentation, vertex and index +/// buffers that are too large bypass this system and receive their own buffers. +/// +/// The [`MeshAllocatorSettings`] allows you to tune the behavior of the +/// allocator for better performance with your application. Most applications +/// won't need to change the settings from their default values. +#[derive(Resource)] +pub struct MeshAllocator { + /// Holds all buffers and allocators. + slabs: HashMap, + + /// Maps a layout to the slabs that hold elements of that layout. + /// + /// This is used when allocating, so that we can find the appropriate slab + /// to place an object in. + slab_layouts: HashMap>, + + /// Maps mesh asset IDs to the ID of the slabs that hold their vertex data. + mesh_id_to_vertex_slab: HashMap, SlabId>, + + /// Maps mesh asset IDs to the ID of the slabs that hold their index data. + mesh_id_to_index_slab: HashMap, SlabId>, + + /// The next slab ID to assign. + next_slab_id: SlabId, + + /// Whether we can pack multiple vertex arrays into a single slab on this + /// platform. + /// + /// This corresponds to [`DownlevelFlags::BASE_VERTEX`], which is unset on + /// WebGL 2. On this platform, we must give each vertex array its own + /// buffer, because we can't adjust the first vertex when we perform a draw. + general_vertex_slabs_supported: bool, +} + +/// Tunable parameters that customize the behavior of the allocator. +/// +/// Generally, these parameters adjust the tradeoff between memory fragmentation +/// and performance. You can adjust them as desired for your application. Most +/// applications can stick with the default values. 
+#[derive(Resource)] +pub struct MeshAllocatorSettings { + /// The minimum size of a slab (hardware buffer), in bytes. + /// + /// The default value is 1 MiB. + pub min_slab_size: u64, + + /// The maximum size of a slab (hardware buffer), in bytes. + /// + /// When a slab reaches this limit, a new slab is created. + /// + /// The default value is 512 MiB. + pub max_slab_size: u64, + + /// The maximum size of vertex or index data that can be placed in a general + /// slab, in bytes. + /// + /// If a mesh has vertex or index data that exceeds this size limit, that + /// data is placed in its own slab. This reduces fragmentation, but incurs + /// more CPU-side binding overhead when drawing the mesh. + /// + /// The default value is 256 MiB. + pub large_threshold: u64, + + /// The factor by which we scale a slab when growing it. + /// + /// This value must be greater than 1. Higher values result in more + /// fragmentation but fewer expensive copy operations when growing the + /// buffer. + /// + /// The default value is 1.5. + pub growth_factor: f64, +} + +impl Default for MeshAllocatorSettings { + fn default() -> Self { + Self { + // 1 MiB + min_slab_size: 1024 * 1024, + // 512 MiB + max_slab_size: 1024 * 1024 * 512, + // 256 MiB + large_threshold: 1024 * 1024 * 256, + // 1.5× growth + growth_factor: 1.5, + } + } +} + +/// The hardware buffer that mesh data lives in, as well as the range within +/// that buffer. +pub struct MeshBufferSlice<'a> { + /// The buffer that the mesh data resides in. + pub buffer: &'a Buffer, + + /// The range of elements within this buffer that the mesh data resides in, + /// measured in elements. + /// + /// This is not a byte range; it's an element range. For vertex data, this + /// is measured in increments of a single vertex. (Thus, if a vertex is 32 + /// bytes long, then this range is in units of 32 bytes each.) For index + /// data, this is measured in increments of a single index value (2 or 4 + /// bytes). 
 Draw commands generally take their ranges in elements, not + /// bytes, so this is the most convenient unit in this case. + pub range: Range, +} + +/// The index of a single slab. +#[derive(Clone, Copy, Default, PartialEq, Eq, Hash, Debug)] +#[repr(transparent)] +struct SlabId(u32); + +/// Data for a single slab. +#[allow(clippy::large_enum_variant)] +enum Slab { + /// A slab that can contain multiple objects. + General(GeneralSlab), + /// A slab that contains a single object. + LargeObject(LargeObjectSlab), +} + +/// A resizable slab that can contain multiple objects. +/// +/// This is the normal type of slab used for objects that are below the +/// [`MeshAllocatorSettings::large_threshold`]. Slabs are divided into *slots*, +/// which are described in detail in the [`ElementLayout`] documentation. +struct GeneralSlab { + /// The [`Allocator`] that manages the objects in this slab. + allocator: Allocator, + + /// The GPU buffer that backs this slab. + /// + /// This may be `None` if the buffer hasn't been created yet. We delay + /// creation of buffers until allocating all the meshes for a single frame, + /// so that we don't needlessly create and resize buffers when many meshes + /// load all at once. + buffer: Option, + + /// Allocations that are on the GPU. + /// + /// The range is in slots. + resident_allocations: HashMap, SlabAllocation>, + + /// Allocations that are waiting to be uploaded to the GPU. + /// + /// The range is in slots. + pending_allocations: HashMap, SlabAllocation>, + + /// The layout of a single element (vertex or index). + element_layout: ElementLayout, + + /// The size of this slab in slots. + slot_capacity: u32, +} + +/// A slab that contains a single object. +/// +/// Typically, this is for objects that exceed the +/// [`MeshAllocatorSettings::large_threshold`]. This is also for objects that +/// would ordinarily share a general slab but can't because of platform +/// limitations, most notably vertex arrays on WebGL 2. 
+struct LargeObjectSlab { + /// The GPU buffer that backs this slab. + /// + /// This may be `None` if the buffer hasn't been created yet. + buffer: Option, + + /// The layout of a single element (vertex or index). + element_layout: ElementLayout, +} + +/// The type of element that a slab can store. +#[derive(Clone, Copy, PartialEq, Eq, Hash)] +enum ElementClass { + /// Data for a vertex. + Vertex, + /// A vertex index. + Index, +} + +/// Information about the size of individual elements (vertices or indices) +/// within a slab. +/// +/// Slab objects are allocated in units of *slots*. Usually, each element takes +/// up one slot, and so elements and slots are equivalent. Occasionally, +/// however, a slot may consist of 2 or even 4 elements. This occurs when the +/// size of an element isn't divisible by [`COPY_BUFFER_ALIGNMENT`]. When we +/// resize buffers, we perform GPU-to-GPU copies to shuffle the existing +/// elements into their new positions, and such copies must be on +/// [`COPY_BUFFER_ALIGNMENT`] boundaries. Slots solve this problem by +/// guaranteeing that the size of an allocation quantum is divisible by both the +/// size of an element and [`COPY_BUFFER_ALIGNMENT`], so we can relocate it +/// freely. +#[derive(Clone, Copy, PartialEq, Eq, Hash)] +struct ElementLayout { + /// Either a vertex or an index. + class: ElementClass, + + /// The size in bytes of a single element (vertex or index). + size: u64, + + /// The number of elements that make up a single slot. + /// + /// Usually, this is 1, but it can be different if [`ElementLayout::size`] + /// isn't divisible by 4. See the comment in [`ElementLayout`] for more + /// details. + elements_per_slot: u32, +} + +/// The location of an allocation and the slab it's contained in. +struct MeshAllocation { + /// The ID of the slab. + slab_id: SlabId, + /// Holds the actual allocation. + slab_allocation: SlabAllocation, +} + +/// An allocation within a slab. 
+#[derive(Clone)]
+struct SlabAllocation {
+    /// The actual [`Allocator`] handle, needed to free the allocation.
+    allocation: Allocation,
+    /// The number of slots that this allocation takes up.
+    slot_count: u32,
+}
+
+/// Holds information about all slabs scheduled to be allocated or reallocated.
+///
+/// Keyed by the ID of the slab to (re)allocate.
+#[derive(Default, Deref, DerefMut)]
+struct SlabsToReallocate(HashMap<SlabId, SlabToReallocate>);
+
+/// Holds information about a slab that's scheduled to be allocated or
+/// reallocated.
+#[derive(Default)]
+struct SlabToReallocate {
+    /// Maps all allocations that need to be relocated to their positions within
+    /// the *new* slab.
+    allocations_to_copy: HashMap<AssetId<Mesh>, SlabAllocation>,
+}
+
+impl Display for SlabId {
+    /// Formats the slab ID as its underlying numeric value.
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        self.0.fmt(f)
+    }
+}
+
+impl Plugin for MeshAllocatorPlugin {
+    /// Registers the allocator settings and schedules
+    /// [`allocate_and_free_meshes`] ahead of mesh asset preparation.
+    fn build(&self, app: &mut App) {
+        let Some(render_app) = app.get_sub_app_mut(RenderApp) else {
+            return;
+        };
+
+        render_app
+            .init_resource::<MeshAllocatorSettings>()
+            .add_systems(
+                Render,
+                allocate_and_free_meshes
+                    .in_set(RenderSet::PrepareAssets)
+                    .before(prepare_assets::<RenderMesh>),
+            );
+    }
+
+    /// Creates the [`MeshAllocator`] resource in the render app.
+    fn finish(&self, app: &mut App) {
+        let Some(render_app) = app.get_sub_app_mut(RenderApp) else {
+            return;
+        };
+
+        // The `RenderAdapter` isn't available until now, so we can't do this in
+        // [`Plugin::build`].
+        render_app.init_resource::<MeshAllocator>();
+    }
+}
+
+impl FromWorld for MeshAllocator {
+    /// Builds an empty allocator, recording whether the adapter supports the
+    /// `BASE_VERTEX` downlevel flag.
+    fn from_world(world: &mut World) -> Self {
+        // Note whether we're on WebGL 2. In this case, we must give every
+        // vertex array its own slab.
+        let render_adapter = world.resource::<RenderAdapter>();
+        let general_vertex_slabs_supported = render_adapter
+            .get_downlevel_capabilities()
+            .flags
+            .contains(DownlevelFlags::BASE_VERTEX);
+
+        Self {
+            slabs: HashMap::new(),
+            slab_layouts: HashMap::new(),
+            mesh_id_to_vertex_slab: HashMap::new(),
+            mesh_id_to_index_slab: HashMap::new(),
+            next_slab_id: SlabId(0),
+            general_vertex_slabs_supported,
+        }
+    }
+}
+
+/// A system that processes newly-extracted or newly-removed meshes and writes
+/// their data into buffers or frees their data as appropriate.
+pub fn allocate_and_free_meshes(
+    mut mesh_allocator: ResMut<MeshAllocator>,
+    mesh_allocator_settings: Res<MeshAllocatorSettings>,
+    extracted_meshes: Res<ExtractedAssets<RenderMesh>>,
+    mut mesh_vertex_buffer_layouts: ResMut<MeshVertexBufferLayouts>,
+    render_device: Res<RenderDevice>,
+    render_queue: Res<RenderQueue>,
+) {
+    // Process newly-added meshes.
+    mesh_allocator.allocate_meshes(
+        &mesh_allocator_settings,
+        &extracted_meshes,
+        &mut mesh_vertex_buffer_layouts,
+        &render_device,
+        &render_queue,
+    );
+
+    // Process removed meshes.
+    mesh_allocator.free_meshes(&extracted_meshes);
+}
+
+impl MeshAllocator {
+    /// Returns the buffer and range within that buffer of the vertex data for
+    /// the mesh with the given ID.
+    ///
+    /// If the mesh wasn't allocated, returns None.
+    pub fn mesh_vertex_slice(&self, mesh_id: &AssetId<Mesh>) -> Option<MeshBufferSlice> {
+        self.mesh_slice_in_slab(mesh_id, *self.mesh_id_to_vertex_slab.get(mesh_id)?)
+    }
+
+    /// Returns the buffer and range within that buffer of the index data for
+    /// the mesh with the given ID.
+    ///
+    /// If the mesh has no index data or wasn't allocated, returns None.
+    pub fn mesh_index_slice(&self, mesh_id: &AssetId<Mesh>) -> Option<MeshBufferSlice> {
+        self.mesh_slice_in_slab(mesh_id, *self.mesh_id_to_index_slab.get(mesh_id)?)
+    }
+
+    /// Given a slab and a mesh with data located within it, returns the buffer
+    /// and range of that mesh data within the slab.
+    ///
+    /// The range is expressed in elements (vertices or indices), not bytes.
+    /// Returns `None` if the slab doesn't exist, has no buffer yet, or (for
+    /// general slabs) holds no resident allocation for the mesh.
+    fn mesh_slice_in_slab(
+        &self,
+        mesh_id: &AssetId<Mesh>,
+        slab_id: SlabId,
+    ) -> Option<MeshBufferSlice> {
+        match self.slabs.get(&slab_id)? {
+            Slab::General(ref general_slab) => {
+                let slab_allocation = general_slab.resident_allocations.get(mesh_id)?;
+                Some(MeshBufferSlice {
+                    buffer: general_slab.buffer.as_ref()?,
+                    // Allocations are tracked in slots; convert to elements.
+                    range: (slab_allocation.allocation.offset
+                        * general_slab.element_layout.elements_per_slot)
+                        ..((slab_allocation.allocation.offset + slab_allocation.slot_count)
+                            * general_slab.element_layout.elements_per_slot),
+                })
+            }
+
+            Slab::LargeObject(ref large_object_slab) => {
+                let buffer = large_object_slab.buffer.as_ref()?;
+                Some(MeshBufferSlice {
+                    buffer,
+                    range: 0..((buffer.size() / large_object_slab.element_layout.size) as u32),
+                })
+            }
+        }
+    }
+
+    /// Processes newly-loaded meshes, allocating room in the slabs for their
+    /// mesh data and performing upload operations as appropriate.
+    ///
+    /// Work happens in phases: stale allocations of re-extracted meshes are
+    /// released, space is reserved for every extracted mesh, slabs that were
+    /// created or grown are (re)allocated once, and finally the mesh data is
+    /// copied in.
+    fn allocate_meshes(
+        &mut self,
+        mesh_allocator_settings: &MeshAllocatorSettings,
+        extracted_meshes: &ExtractedAssets<RenderMesh>,
+        mesh_vertex_buffer_layouts: &mut MeshVertexBufferLayouts,
+        render_device: &RenderDevice,
+        render_queue: &RenderQueue,
+    ) {
+        // A mesh that was modified is extracted again while its previous
+        // allocations are still recorded. Release those first; otherwise the
+        // old slab space is never returned to its allocator and old
+        // large-object slabs stay alive forever.
+        let mut emptied_slabs = HashSet::new();
+        for (mesh_id, _) in &extracted_meshes.extracted {
+            if let Some(slab_id) = self.mesh_id_to_vertex_slab.remove(mesh_id) {
+                self.free_allocation_in_slab(mesh_id, slab_id, &mut emptied_slabs);
+            }
+            if let Some(slab_id) = self.mesh_id_to_index_slab.remove(mesh_id) {
+                self.free_allocation_in_slab(mesh_id, slab_id, &mut emptied_slabs);
+            }
+        }
+        for emptied_slab in emptied_slabs {
+            // Also forget the slab as an allocation candidate so that nothing
+            // looks it up after it has been dropped.
+            for slab_ids in self.slab_layouts.values_mut() {
+                slab_ids.retain(|&slab_id| slab_id != emptied_slab);
+            }
+            self.slabs.remove(&emptied_slab);
+        }
+
+        let mut slabs_to_grow = SlabsToReallocate::default();
+
+        // Allocate.
+        for (mesh_id, mesh) in &extracted_meshes.extracted {
+            // Allocate vertex data. Note that we can only pack mesh vertex data
+            // together if the platform supports it.
+            let vertex_element_layout = ElementLayout::vertex(mesh_vertex_buffer_layouts, mesh);
+            if self.general_vertex_slabs_supported {
+                self.allocate(
+                    mesh_id,
+                    mesh.get_vertex_buffer_data().len() as u64,
+                    vertex_element_layout,
+                    &mut slabs_to_grow,
+                    mesh_allocator_settings,
+                );
+            } else {
+                self.allocate_large(mesh_id, vertex_element_layout);
+            }
+
+            // Allocate index data.
+            if let (Some(index_buffer_data), Some(index_element_layout)) =
+                (mesh.get_index_buffer_bytes(), ElementLayout::index(mesh))
+            {
+                self.allocate(
+                    mesh_id,
+                    index_buffer_data.len() as u64,
+                    index_element_layout,
+                    &mut slabs_to_grow,
+                    mesh_allocator_settings,
+                );
+            }
+        }
+
+        // Perform growth.
+        for (slab_id, slab_to_grow) in slabs_to_grow.0 {
+            self.reallocate_slab(render_device, render_queue, slab_id, slab_to_grow);
+        }
+
+        // Copy new mesh data in.
+        for (mesh_id, mesh) in &extracted_meshes.extracted {
+            self.copy_mesh_vertex_data(mesh_id, mesh, render_device, render_queue);
+            self.copy_mesh_index_data(mesh_id, mesh, render_device, render_queue);
+        }
+    }
+
+    /// Copies vertex array data from a mesh into the appropriate spot in the
+    /// slab.
+    ///
+    /// Does nothing if no vertex slab is recorded for the mesh.
+    fn copy_mesh_vertex_data(
+        &mut self,
+        mesh_id: &AssetId<Mesh>,
+        mesh: &Mesh,
+        render_device: &RenderDevice,
+        render_queue: &RenderQueue,
+    ) {
+        let Some(&slab_id) = self.mesh_id_to_vertex_slab.get(mesh_id) else {
+            return;
+        };
+        let vertex_data = mesh.get_vertex_buffer_data();
+
+        // Call the generic function.
+        self.copy_element_data(
+            mesh_id,
+            mesh,
+            &vertex_data,
+            BufferUsages::VERTEX,
+            slab_id,
+            render_device,
+            render_queue,
+        );
+    }
+
+    /// Copies index array data from a mesh into the appropriate spot in the
+    /// slab.
+    ///
+    /// Does nothing if no index slab is recorded for the mesh or the mesh has
+    /// no index data.
+    fn copy_mesh_index_data(
+        &mut self,
+        mesh_id: &AssetId<Mesh>,
+        mesh: &Mesh,
+        render_device: &RenderDevice,
+        render_queue: &RenderQueue,
+    ) {
+        let Some(&slab_id) = self.mesh_id_to_index_slab.get(mesh_id) else {
+            return;
+        };
+        let Some(index_data) = mesh.get_index_buffer_bytes() else {
+            return;
+        };
+
+        // Call the generic function.
+        self.copy_element_data(
+            mesh_id,
+            mesh,
+            index_data,
+            BufferUsages::INDEX,
+            slab_id,
+            render_device,
+            render_queue,
+        );
+    }
+
+    /// A generic function that copies either vertex or index data into a slab.
+    ///
+    /// For a general slab, `data` is written at the mesh's pending allocation
+    /// and that allocation is then marked resident. For a large-object slab,
+    /// the buffer is created directly from `data`.
+    // The argument list mirrors the per-class wrappers that call this.
+    #[allow(clippy::too_many_arguments)]
+    fn copy_element_data(
+        &mut self,
+        mesh_id: &AssetId<Mesh>,
+        mesh: &Mesh,
+        data: &[u8],
+        buffer_usages: BufferUsages,
+        slab_id: SlabId,
+        render_device: &RenderDevice,
+        render_queue: &RenderQueue,
+    ) {
+        // `data` already holds the bytes to upload; the mesh itself is only
+        // accepted to keep the signature stable for existing callers.
+        let _ = mesh;
+
+        let Some(slab) = self.slabs.get_mut(&slab_id) else {
+            return;
+        };
+
+        match *slab {
+            Slab::General(ref mut general_slab) => {
+                let (Some(ref buffer), Some(allocated_range)) = (
+                    &general_slab.buffer,
+                    general_slab.pending_allocations.remove(mesh_id),
+                ) else {
+                    return;
+                };
+
+                let slot_size = general_slab.element_layout.slot_size();
+
+                // Write the data in. Allocations are whole slots, so pad the
+                // data up to a multiple of the slot size.
+                render_queue.write_buffer(
+                    buffer,
+                    allocated_range.allocation.offset as u64 * slot_size,
+                    &pad_to_alignment(data, slot_size as usize),
+                );
+
+                // Mark the allocation as resident.
+                general_slab
+                    .resident_allocations
+                    .insert(*mesh_id, allocated_range);
+            }
+
+            Slab::LargeObject(ref mut large_object_slab) => {
+                debug_assert!(large_object_slab.buffer.is_none());
+
+                // Create the buffer and its data in one go. This must upload
+                // `data` (vertex *or* index bytes, depending on the caller),
+                // not unconditionally the mesh's vertex data.
+                large_object_slab.buffer = Some(render_device.create_buffer_with_data(
+                    &BufferInitDescriptor {
+                        label: Some(&format!(
+                            "large mesh slab {} ({}buffer)",
+                            slab_id,
+                            buffer_usages_to_str(buffer_usages)
+                        )),
+                        contents: data,
+                        usage: buffer_usages | BufferUsages::COPY_DST,
+                    },
+                ));
+            }
+        }
+    }
+
+    /// Frees the vertex and index allocations of every mesh removed this
+    /// frame, then drops all slabs that became empty as a result.
+    fn free_meshes(&mut self, extracted_meshes: &ExtractedAssets<RenderMesh>) {
+        let mut empty_slabs = HashSet::new();
+        for mesh_id in &extracted_meshes.removed {
+            if let Some(slab_id) = self.mesh_id_to_vertex_slab.remove(mesh_id) {
+                self.free_allocation_in_slab(mesh_id, slab_id, &mut empty_slabs);
+            }
+            if let Some(slab_id) = self.mesh_id_to_index_slab.remove(mesh_id) {
+                self.free_allocation_in_slab(mesh_id, slab_id, &mut empty_slabs);
+            }
+        }
+
+        for empty_slab in empty_slabs {
+            // Remove the slab from the per-layout candidate lists too.
+            // `allocate_general` expects every listed slab to exist and would
+            // otherwise panic the next time a mesh with this layout arrives.
+            for slab_ids in self.slab_layouts.values_mut() {
+                slab_ids.retain(|&slab_id| slab_id != empty_slab);
+            }
+            self.slabs.remove(&empty_slab);
+        }
+    }
+
+    /// Given a slab and the ID of a mesh containing data in it, marks the
+    /// allocation as free.
+    ///
+    /// If this results in the slab becoming empty, this function adds the slab
+    /// to the `empty_slabs` set.
+    fn free_allocation_in_slab(
+        &mut self,
+        mesh_id: &AssetId<Mesh>,
+        slab_id: SlabId,
+        empty_slabs: &mut HashSet<SlabId>,
+    ) {
+        let Some(slab) = self.slabs.get_mut(&slab_id) else {
+            return;
+        };
+
+        match *slab {
+            Slab::General(ref mut general_slab) => {
+                // The allocation may be resident, or still pending if the
+                // mesh data was never copied in.
+                let Some(slab_allocation) = general_slab
+                    .resident_allocations
+                    .remove(mesh_id)
+                    .or_else(|| general_slab.pending_allocations.remove(mesh_id))
+                else {
+                    return;
+                };
+
+                general_slab.allocator.free(slab_allocation.allocation);
+
+                if general_slab.is_empty() {
+                    empty_slabs.insert(slab_id);
+                }
+            }
+            // A large-object slab holds exactly one object, so freeing that
+            // object always empties it.
+            Slab::LargeObject(_) => {
+                empty_slabs.insert(slab_id);
+            }
+        }
+    }
+
+    /// Allocates space for mesh data with the given byte size and layout in the
+    /// appropriate slab, creating that slab if necessary.
+    ///
+    /// Data whose slot-rounded size reaches the smaller of
+    /// `settings.large_threshold` and `settings.max_slab_size` gets a
+    /// dedicated large-object slab; everything else goes into a general slab.
+    fn allocate(
+        &mut self,
+        mesh_id: &AssetId<Mesh>,
+        data_byte_len: u64,
+        layout: ElementLayout,
+        slabs_to_grow: &mut SlabsToReallocate,
+        settings: &MeshAllocatorSettings,
+    ) {
+        let data_element_count = data_byte_len.div_ceil(layout.size) as u32;
+        let data_slot_count = data_element_count.div_ceil(layout.elements_per_slot);
+
+        // If the mesh data is too large for a slab, give it a slab of its own.
+        if data_slot_count as u64 * layout.slot_size()
+            >= settings.large_threshold.min(settings.max_slab_size)
+        {
+            self.allocate_large(mesh_id, layout);
+        } else {
+            self.allocate_general(mesh_id, data_slot_count, layout, slabs_to_grow, settings);
+        }
+    }
+
+    /// Allocates space for mesh data with the given slot size and layout in the
+    /// appropriate general slab.
+    ///
+    /// Existing slabs with a matching layout are tried first, growing them
+    /// when possible. If none can hold the data, a new slab is created. The
+    /// resulting allocation is recorded as pending.
+    fn allocate_general(
+        &mut self,
+        mesh_id: &AssetId<Mesh>,
+        data_slot_count: u32,
+        layout: ElementLayout,
+        slabs_to_grow: &mut SlabsToReallocate,
+        settings: &MeshAllocatorSettings,
+    ) {
+        let candidate_slabs = self.slab_layouts.entry(layout).or_default();
+
+        // Loop through the slabs that accept elements of the appropriate type
+        // and try to allocate the mesh inside them. We go with the first one
+        // that succeeds.
+        let mut mesh_allocation = None;
+        'slab: for &slab_id in &*candidate_slabs {
+            loop {
+                let Some(Slab::General(ref mut slab)) = self.slabs.get_mut(&slab_id) else {
+                    // The candidate list can still name a slab that was
+                    // dropped after becoming empty. Skip it instead of
+                    // panicking.
+                    continue 'slab;
+                };
+
+                if let Some(allocation) = slab.allocator.allocate(data_slot_count) {
+                    mesh_allocation = Some(MeshAllocation {
+                        slab_id,
+                        slab_allocation: SlabAllocation {
+                            allocation,
+                            slot_count: data_slot_count,
+                        },
+                    });
+                    break 'slab;
+                }
+
+                // Try to grow the slab. If this fails, the slab is full; go on
+                // to the next slab.
+                match slab.try_grow(settings) {
+                    Ok(new_mesh_allocation_records) => {
+                        slabs_to_grow.insert(slab_id, new_mesh_allocation_records);
+                    }
+                    Err(()) => continue 'slab,
+                }
+            }
+        }
+
+        // If we still have no allocation, make a new slab.
+        if mesh_allocation.is_none() {
+            let new_slab_id = self.next_slab_id;
+            self.next_slab_id.0 += 1;
+
+            let new_slab = GeneralSlab::new(
+                new_slab_id,
+                &mut mesh_allocation,
+                settings,
+                layout,
+                data_slot_count,
+            );
+
+            self.slabs.insert(new_slab_id, Slab::General(new_slab));
+            candidate_slabs.push(new_slab_id);
+            // Schedule the new slab so that its buffer gets created.
+            slabs_to_grow.insert(new_slab_id, SlabToReallocate::default());
+        }
+
+        let mesh_allocation = mesh_allocation.expect("Should have been able to allocate");
+
+        // Mark the allocation as pending. Don't copy it in just yet; further
+        // meshes loaded this frame may result in its final allocation location
+        // changing.
+        if let Some(Slab::General(ref mut general_slab)) =
+            self.slabs.get_mut(&mesh_allocation.slab_id)
+        {
+            general_slab
+                .pending_allocations
+                .insert(*mesh_id, mesh_allocation.slab_allocation);
+        };
+
+        self.record_allocation(mesh_id, mesh_allocation.slab_id, layout.class);
+    }
+
+    /// Allocates an object into its own dedicated slab.
+    ///
+    /// The slab starts without a buffer; the buffer is created when the data
+    /// is copied in.
+    fn allocate_large(&mut self, mesh_id: &AssetId<Mesh>, layout: ElementLayout) {
+        let new_slab_id = self.next_slab_id;
+        self.next_slab_id.0 += 1;
+
+        self.record_allocation(mesh_id, new_slab_id, layout.class);
+
+        self.slabs.insert(
+            new_slab_id,
+            Slab::LargeObject(LargeObjectSlab {
+                buffer: None,
+                element_layout: layout,
+            }),
+        );
+    }
+
+    /// Reallocates a slab that needs to be resized, or allocates a new slab.
+    ///
+    /// This performs the actual growth operation that [`GeneralSlab::try_grow`]
+    /// scheduled. We do the growth in two phases so that, if a slab grows
+    /// multiple times in the same frame, only one new buffer is reallocated,
+    /// rather than reallocating the buffer multiple times.
+    fn reallocate_slab(
+        &mut self,
+        render_device: &RenderDevice,
+        render_queue: &RenderQueue,
+        slab_id: SlabId,
+        slab_to_grow: SlabToReallocate,
+    ) {
+        let Some(Slab::General(slab)) = self.slabs.get_mut(&slab_id) else {
+            error!("Couldn't find slab {:?} to grow", slab_id);
+            return;
+        };
+
+        let old_buffer = slab.buffer.take();
+        let slot_size = slab.element_layout.slot_size();
+
+        // `COPY_SRC` lets a later growth copy out of this buffer again.
+        let mut buffer_usages = BufferUsages::COPY_SRC | BufferUsages::COPY_DST;
+        match slab.element_layout.class {
+            ElementClass::Vertex => buffer_usages |= BufferUsages::VERTEX,
+            ElementClass::Index => buffer_usages |= BufferUsages::INDEX,
+        };
+
+        // Create the buffer.
+        let new_buffer = render_device.create_buffer(&BufferDescriptor {
+            label: Some(&format!(
+                "general mesh slab {} ({}buffer)",
+                slab_id,
+                buffer_usages_to_str(buffer_usages)
+            )),
+            size: slab.slot_capacity as u64 * slot_size,
+            usage: buffer_usages,
+            mapped_at_creation: false,
+        });
+
+        slab.buffer = Some(new_buffer.clone());
+
+        // If we have no objects to copy over, we're done. Check this before
+        // creating a command encoder that would go unused.
+        let Some(old_buffer) = old_buffer else {
+            return;
+        };
+
+        // In order to do buffer copies, we need a command encoder.
+        let mut encoder = render_device.create_command_encoder(&CommandEncoderDescriptor {
+            label: Some("slab resize encoder"),
+        });
+
+        for (mesh_id, src_slab_allocation) in &mut slab.resident_allocations {
+            let Some(dest_slab_allocation) = slab_to_grow.allocations_to_copy.get(mesh_id) else {
+                continue;
+            };
+
+            encoder.copy_buffer_to_buffer(
+                &old_buffer,
+                src_slab_allocation.allocation.offset as u64 * slot_size,
+                &new_buffer,
+                dest_slab_allocation.allocation.offset as u64 * slot_size,
+                dest_slab_allocation.slot_count as u64 * slot_size,
+            );
+            // Now that we've done the copy, we can update the allocation record.
+            *src_slab_allocation = dest_slab_allocation.clone();
+        }
+
+        let command_buffer = encoder.finish();
+        render_queue.submit([command_buffer]);
+    }
+
+    /// Records the location of the given newly-allocated mesh data in the
+    /// [`Self::mesh_id_to_vertex_slab`] or [`Self::mesh_id_to_index_slab`]
+    /// tables as appropriate.
+    fn record_allocation(
+        &mut self,
+        mesh_id: &AssetId<Mesh>,
+        slab_id: SlabId,
+        element_class: ElementClass,
+    ) {
+        match element_class {
+            ElementClass::Vertex => {
+                self.mesh_id_to_vertex_slab.insert(*mesh_id, slab_id);
+            }
+            ElementClass::Index => {
+                self.mesh_id_to_index_slab.insert(*mesh_id, slab_id);
+            }
+        }
+    }
+}
+
+impl GeneralSlab {
+    /// Creates a new growable slab big enough to hold a single element of
+    /// `data_slot_count` size with the given `layout`.
+    ///
+    /// The element is allocated immediately and the resulting allocation is
+    /// written to `mesh_allocation`. The slab's buffer is not created here.
+    fn new(
+        new_slab_id: SlabId,
+        mesh_allocation: &mut Option<MeshAllocation>,
+        settings: &MeshAllocatorSettings,
+        layout: ElementLayout,
+        data_slot_count: u32,
+    ) -> GeneralSlab {
+        // Start at the configured minimum size, unless the first element
+        // needs a larger allocator.
+        let slab_slot_capacity = (settings.min_slab_size.div_ceil(layout.slot_size()) as u32)
+            .max(offset_allocator::ext::min_allocator_size(data_slot_count));
+
+        let mut new_slab = GeneralSlab {
+            allocator: Allocator::new(slab_slot_capacity),
+            buffer: None,
+            resident_allocations: HashMap::new(),
+            pending_allocations: HashMap::new(),
+            element_layout: layout,
+            slot_capacity: slab_slot_capacity,
+        };
+
+        // This should never fail.
+        if let Some(allocation) = new_slab.allocator.allocate(data_slot_count) {
+            *mesh_allocation = Some(MeshAllocation {
+                slab_id: new_slab_id,
+                slab_allocation: SlabAllocation {
+                    slot_count: data_slot_count,
+                    allocation,
+                },
+            });
+        }
+
+        new_slab
+    }
+
+    /// Attempts to grow a slab that's just run out of space.
+    ///
+    /// Returns a structure describing the allocations that need to be
+    /// relocated if the growth succeeded. If the slab is full, returns `Err`.
+    ///
+    /// Growing replaces the slab's allocator, so every resident and pending
+    /// allocation is placed again. Pending allocations are updated in place,
+    /// while the new positions of resident allocations are only reported to
+    /// the caller.
+    fn try_grow(&mut self, settings: &MeshAllocatorSettings) -> Result<SlabToReallocate, ()> {
+        // In extremely rare cases due to allocator fragmentation, it may happen
+        // that we fail to re-insert every object that was in the slab after
+        // growing it. Even though this will likely never happen, we use this
+        // loop to handle this unlikely event properly if it does.
+        'grow: loop {
+            let new_slab_slot_capacity = ((self.slot_capacity as f64 * settings.growth_factor)
+                .ceil() as u32)
+                .min((settings.max_slab_size / self.element_layout.slot_size()) as u32);
+            // Use `<=` rather than `==`: if the clamped capacity is below the
+            // current one (for instance because the slab started out larger
+            // than `max_slab_size` allows), "growing" would shrink the slab.
+            if new_slab_slot_capacity <= self.slot_capacity {
+                // The slab is full.
+                return Err(());
+            }
+
+            // Grow the slab.
+            self.allocator = Allocator::new(new_slab_slot_capacity);
+            self.slot_capacity = new_slab_slot_capacity;
+
+            let mut slab_to_grow = SlabToReallocate::default();
+
+            // Place every resident allocation that was in the old slab in the
+            // new slab.
+            for (allocated_mesh_id, old_allocation_range) in &self.resident_allocations {
+                let allocation_size = old_allocation_range.slot_count;
+                match self.allocator.allocate(allocation_size) {
+                    Some(allocation) => {
+                        slab_to_grow.allocations_to_copy.insert(
+                            *allocated_mesh_id,
+                            SlabAllocation {
+                                allocation,
+                                slot_count: allocation_size,
+                            },
+                        );
+                    }
+                    None => {
+                        // We failed to insert one of the allocations that we
+                        // had before.
+                        continue 'grow;
+                    }
+                }
+            }
+
+            // Move every allocation that was pending in the old slab to the new
+            // slab.
+            for slab_allocation in self.pending_allocations.values_mut() {
+                let allocation_size = slab_allocation.slot_count;
+                match self.allocator.allocate(allocation_size) {
+                    Some(allocation) => slab_allocation.allocation = allocation,
+                    None => {
+                        // We failed to insert one of the allocations that we
+                        // had before.
+                        continue 'grow;
+                    }
+                }
+            }
+
+            return Ok(slab_to_grow);
+        }
+    }
+}
+
+impl ElementLayout {
+    /// Creates an [`ElementLayout`] for mesh data of the given class (vertex or
+    /// index) with the given byte size.
+    fn new(class: ElementClass, size: u64) -> ElementLayout {
+        ElementLayout {
+            class,
+            size,
+            // Make sure that slot boundaries begin and end on
+            // `COPY_BUFFER_ALIGNMENT`-byte (4-byte) boundaries.
+            elements_per_slot: (COPY_BUFFER_ALIGNMENT / gcd(size, COPY_BUFFER_ALIGNMENT)) as u32,
+        }
+    }
+
+    /// Returns the size in bytes of one slot: the element size multiplied by
+    /// the number of elements per slot.
+    fn slot_size(&self) -> u64 {
+        self.size * self.elements_per_slot as u64
+    }
+
+    /// Creates the appropriate [`ElementLayout`] for the given mesh's vertex
+    /// data.
+    ///
+    /// The element size is the array stride of the mesh's vertex buffer
+    /// layout.
+    fn vertex(
+        mesh_vertex_buffer_layouts: &mut MeshVertexBufferLayouts,
+        mesh: &Mesh,
+    ) -> ElementLayout {
+        let mesh_vertex_buffer_layout =
+            mesh.get_mesh_vertex_buffer_layout(mesh_vertex_buffer_layouts);
+        ElementLayout::new(
+            ElementClass::Vertex,
+            mesh_vertex_buffer_layout.0.layout().array_stride,
+        )
+    }
+
+    /// Creates the appropriate [`ElementLayout`] for the given mesh's index
+    /// data.
+    ///
+    /// Returns `None` if the mesh has no indices.
+    fn index(mesh: &Mesh) -> Option<ElementLayout> {
+        let size = match mesh.indices()? {
+            Indices::U16(_) => 2,
+            Indices::U32(_) => 4,
+        };
+        Some(ElementLayout::new(ElementClass::Index, size))
+    }
+}
+
+impl GeneralSlab {
+    /// Returns true if this slab is empty, i.e. it holds neither resident nor
+    /// pending allocations.
+    fn is_empty(&self) -> bool {
+        self.resident_allocations.is_empty() && self.pending_allocations.is_empty()
+    }
+}
+
+/// Returns the greatest common divisor of the two numbers.
+///
+/// Uses the iterative Euclidean algorithm. `gcd(a, 0)` is `a`.
+fn gcd(mut a: u64, mut b: u64) -> u64 {
+    while b != 0 {
+        let t = b;
+        b = a % b;
+        a = t;
+    }
+    a
+}
+
+/// Ensures that the size of a buffer is a multiple of the given alignment by
+/// padding it with zeroes if necessary.
+///
+/// If the buffer already has the required size, then this function doesn't
+/// allocate. Otherwise, it copies the buffer into a new one and writes the
+/// appropriate number of zeroes to the end.
+fn pad_to_alignment(buffer: &[u8], align: usize) -> Cow<[u8]> {
+    if buffer.len() % align == 0 {
+        return Cow::Borrowed(buffer);
+    }
+    let mut buffer = buffer.to_vec();
+    buffer.extend(iter::repeat(0).take(align - buffer.len() % align));
+    Cow::Owned(buffer)
+}
+
+/// Returns a string describing the given buffer usages.
+fn buffer_usages_to_str(buffer_usages: BufferUsages) -> &'static str { + if buffer_usages.contains(BufferUsages::VERTEX) { + "vertex " + } else if buffer_usages.contains(BufferUsages::INDEX) { + "index " + } else { + "" + } +} diff --git a/crates/bevy_render/src/mesh/mesh/mod.rs b/crates/bevy_render/src/mesh/mesh/mod.rs index 02dd99e15a..1238f761c2 100644 --- a/crates/bevy_render/src/mesh/mesh/mod.rs +++ b/crates/bevy_render/src/mesh/mesh/mod.rs @@ -8,8 +8,7 @@ use crate::{ prelude::Image, primitives::Aabb, render_asset::{PrepareAssetError, RenderAsset, RenderAssetUsages, RenderAssets}, - render_resource::{Buffer, TextureView, VertexBufferLayout}, - renderer::RenderDevice, + render_resource::{TextureView, VertexBufferLayout}, texture::GpuImage, }; use bevy_asset::{Asset, Handle}; @@ -24,10 +23,7 @@ use bevy_utils::tracing::{error, warn}; use bytemuck::cast_slice; use std::{collections::BTreeMap, hash::Hash, iter::FusedIterator}; use thiserror::Error; -use wgpu::{ - util::BufferInitDescriptor, BufferUsages, IndexFormat, VertexAttribute, VertexFormat, - VertexStepMode, -}; +use wgpu::{IndexFormat, VertexAttribute, VertexFormat, VertexStepMode}; use super::{MeshVertexBufferLayoutRef, MeshVertexBufferLayouts}; @@ -1660,42 +1656,51 @@ impl BaseMeshPipelineKey { } } -/// The GPU-representation of a [`Mesh`]. -/// Consists of a vertex data buffer and an optional index data buffer. +/// The render world representation of a [`Mesh`]. #[derive(Debug, Clone)] -pub struct GpuMesh { - /// Contains all attribute data for each vertex. - pub vertex_buffer: Buffer, +pub struct RenderMesh { + /// The number of vertices in the mesh. pub vertex_count: u32, + + /// Morph targets for the mesh, if present. pub morph_targets: Option, - pub buffer_info: GpuBufferInfo, + + /// Information about the mesh data buffers, including whether the mesh uses + /// indices or not. + pub buffer_info: RenderMeshBufferInfo, + + /// Precomputed pipeline key bits for this mesh. 
pub key_bits: BaseMeshPipelineKey, + + /// A reference to the vertex buffer layout. + /// + /// Combined with [`RenderMesh::buffer_info`], this specifies the complete + /// layout of the buffers associated with this mesh. pub layout: MeshVertexBufferLayoutRef, } -impl GpuMesh { +impl RenderMesh { + /// Returns the primitive topology of this mesh (triangles, triangle strips, + /// etc.) #[inline] pub fn primitive_topology(&self) -> PrimitiveTopology { self.key_bits.primitive_topology() } } -/// The index/vertex buffer info of a [`GpuMesh`]. +/// The index/vertex buffer info of a [`RenderMesh`]. #[derive(Debug, Clone)] -pub enum GpuBufferInfo { +pub enum RenderMeshBufferInfo { Indexed { - /// Contains all index data of a mesh. - buffer: Buffer, count: u32, index_format: IndexFormat, }, NonIndexed, } -impl RenderAsset for GpuMesh { +impl RenderAsset for RenderMesh { type SourceAsset = Mesh; type Param = ( - SRes, SRes>, SResMut, ); @@ -1717,12 +1722,10 @@ impl RenderAsset for GpuMesh { Some(vertex_size * vertex_count + index_bytes) } - /// Converts the extracted mesh a into [`GpuMesh`]. + /// Converts the extracted mesh into a [`RenderMesh`]. 
fn prepare_asset( mesh: Self::SourceAsset, - (render_device, images, ref mut mesh_vertex_buffer_layouts): &mut SystemParamItem< - Self::Param, - >, + (images, ref mut mesh_vertex_buffer_layouts): &mut SystemParamItem, ) -> Result> { let morph_targets = match mesh.morph_targets.as_ref() { Some(mt) => { @@ -1734,25 +1737,12 @@ impl RenderAsset for GpuMesh { None => None, }; - let vertex_buffer_data = mesh.get_vertex_buffer_data(); - let vertex_buffer = render_device.create_buffer_with_data(&BufferInitDescriptor { - usage: BufferUsages::VERTEX, - label: Some("Mesh Vertex Buffer"), - contents: &vertex_buffer_data, - }); - - let buffer_info = if let Some(data) = mesh.get_index_buffer_bytes() { - GpuBufferInfo::Indexed { - buffer: render_device.create_buffer_with_data(&BufferInitDescriptor { - usage: BufferUsages::INDEX, - contents: data, - label: Some("Mesh Index Buffer"), - }), - count: mesh.indices().unwrap().len() as u32, - index_format: mesh.indices().unwrap().into(), - } - } else { - GpuBufferInfo::NonIndexed + let buffer_info = match mesh.indices() { + Some(indices) => RenderMeshBufferInfo::Indexed { + count: indices.len() as u32, + index_format: indices.into(), + }, + None => RenderMeshBufferInfo::NonIndexed, }; let mesh_vertex_buffer_layout = @@ -1764,8 +1754,7 @@ impl RenderAsset for GpuMesh { mesh.morph_targets.is_some(), ); - Ok(GpuMesh { - vertex_buffer, + Ok(RenderMesh { vertex_count: mesh.count_vertices() as u32, buffer_info, key_bits, diff --git a/crates/bevy_render/src/mesh/mod.rs b/crates/bevy_render/src/mesh/mod.rs index df33640716..84accac658 100644 --- a/crates/bevy_render/src/mesh/mod.rs +++ b/crates/bevy_render/src/mesh/mod.rs @@ -1,8 +1,11 @@ #[allow(clippy::module_inception)] mod mesh; + +pub mod allocator; pub mod morph; pub mod primitives; +use allocator::MeshAllocatorPlugin; use bevy_utils::HashSet; pub use mesh::*; pub use primitives::*; @@ -27,7 +30,8 @@ impl Plugin for MeshPlugin { .register_type::() .register_type::>() // 'Mesh' must be 
prepared after 'Image' as meshes rely on the morph target image being ready - .add_plugins(RenderAssetPlugin::::default()); + .add_plugins(RenderAssetPlugin::::default()) + .add_plugins(MeshAllocatorPlugin); let Some(render_app) = app.get_sub_app_mut(RenderApp) else { return; diff --git a/crates/bevy_render/src/render_asset.rs b/crates/bevy_render/src/render_asset.rs index e3a6aab5fb..5c9be59de3 100644 --- a/crates/bevy_render/src/render_asset.rs +++ b/crates/bevy_render/src/render_asset.rs @@ -114,7 +114,7 @@ impl Default for RenderAssetUsages { /// The `AFTER` generic parameter can be used to specify that `A::prepare_asset` should not be run until /// `prepare_assets::` has completed. This allows the `prepare_asset` function to depend on another /// prepared [`RenderAsset`], for example `Mesh::prepare_asset` relies on `RenderAssets::` for morph -/// targets, so the plugin is created as `RenderAssetPlugin::::default()`. +/// targets, so the plugin is created as `RenderAssetPlugin::::default()`. pub struct RenderAssetPlugin { phantom: PhantomData (A, AFTER)>, } @@ -168,9 +168,16 @@ impl RenderAssetDependency for A { /// Temporarily stores the extracted and removed assets of the current frame. #[derive(Resource)] pub struct ExtractedAssets { - extracted: Vec<(AssetId, A::SourceAsset)>, - removed: HashSet>, - added: HashSet>, + /// The assets extracted this frame. + pub extracted: Vec<(AssetId, A::SourceAsset)>, + + /// IDs of the assets removed this frame. + /// + /// These assets will not be present in [`ExtractedAssets::extracted`]. + pub removed: HashSet>, + + /// IDs of the assets added this frame. + pub added: HashSet>, } impl Default for ExtractedAssets { @@ -238,7 +245,10 @@ impl FromWorld for CachedExtractRenderAssetSystemState { /// This system extracts all created or modified assets of the corresponding [`RenderAsset::SourceAsset`] type /// into the "render world". 
-fn extract_render_asset(mut commands: Commands, mut main_world: ResMut) { +pub(crate) fn extract_render_asset( + mut commands: Commands, + mut main_world: ResMut, +) { main_world.resource_scope( |world, mut cached_state: Mut>| { let (mut events, mut assets) = cached_state.state.get_mut(world); diff --git a/crates/bevy_sprite/src/mesh2d/material.rs b/crates/bevy_sprite/src/mesh2d/material.rs index d61ca002bf..1ef8ca1eaa 100644 --- a/crates/bevy_sprite/src/mesh2d/material.rs +++ b/crates/bevy_sprite/src/mesh2d/material.rs @@ -12,7 +12,7 @@ use bevy_ecs::{ }; use bevy_math::FloatOrd; use bevy_render::{ - mesh::{GpuMesh, MeshVertexBufferLayoutRef}, + mesh::{MeshVertexBufferLayoutRef, RenderMesh}, render_asset::{ prepare_assets, PrepareAssetError, RenderAsset, RenderAssetPlugin, RenderAssets, }, @@ -370,7 +370,7 @@ pub fn queue_material2d_meshes( mut pipelines: ResMut>>, pipeline_cache: Res, msaa: Res, - render_meshes: Res>, + render_meshes: Res>, render_materials: Res>>, mut render_mesh_instances: ResMut, render_material_instances: Res>, diff --git a/crates/bevy_sprite/src/mesh2d/mesh.rs b/crates/bevy_sprite/src/mesh2d/mesh.rs index 16aee3f529..b2b69c081e 100644 --- a/crates/bevy_sprite/src/mesh2d/mesh.rs +++ b/crates/bevy_sprite/src/mesh2d/mesh.rs @@ -18,12 +18,13 @@ use bevy_render::batching::no_gpu_preprocessing::{ self, batch_and_prepare_sorted_render_phase, write_batched_instance_buffer, BatchedInstanceBuffer, }; -use bevy_render::mesh::{GpuMesh, MeshVertexBufferLayoutRef}; +use bevy_render::mesh::allocator::MeshAllocator; +use bevy_render::mesh::{MeshVertexBufferLayoutRef, RenderMesh}; use bevy_render::texture::FallbackImage; use bevy_render::{ batching::{GetBatchData, NoAutomaticBatching}, globals::{GlobalsBuffer, GlobalsUniform}, - mesh::{GpuBufferInfo, Mesh}, + mesh::{Mesh, RenderMeshBufferInfo}, render_asset::RenderAssets, render_phase::{PhaseItem, RenderCommand, RenderCommandResult, TrackedRenderPass}, render_resource::{binding_types::uniform_buffer, *}, @@ 
-694,7 +695,11 @@ impl RenderCommand

for SetMesh2dBindGroup { pub struct DrawMesh2d; impl RenderCommand

for DrawMesh2d { - type Param = (SRes>, SRes); + type Param = ( + SRes>, + SRes, + SRes, + ); type ViewQuery = (); type ItemQuery = (); @@ -703,11 +708,12 @@ impl RenderCommand

for DrawMesh2d { item: &P, _view: (), _item_query: Option<()>, - (meshes, render_mesh2d_instances): SystemParamItem<'w, '_, Self::Param>, + (meshes, render_mesh2d_instances, mesh_allocator): SystemParamItem<'w, '_, Self::Param>, pass: &mut TrackedRenderPass<'w>, ) -> RenderCommandResult { let meshes = meshes.into_inner(); let render_mesh2d_instances = render_mesh2d_instances.into_inner(); + let mesh_allocator = mesh_allocator.into_inner(); let Some(RenderMesh2dInstance { mesh_asset_id, .. }) = render_mesh2d_instances.get(&item.entity()) @@ -717,20 +723,32 @@ impl RenderCommand

for DrawMesh2d { let Some(gpu_mesh) = meshes.get(*mesh_asset_id) else { return RenderCommandResult::Failure; }; + let Some(vertex_buffer_slice) = mesh_allocator.mesh_vertex_slice(mesh_asset_id) else { + return RenderCommandResult::Failure; + }; - pass.set_vertex_buffer(0, gpu_mesh.vertex_buffer.slice(..)); + pass.set_vertex_buffer(0, vertex_buffer_slice.buffer.slice(..)); let batch_range = item.batch_range(); match &gpu_mesh.buffer_info { - GpuBufferInfo::Indexed { - buffer, + RenderMeshBufferInfo::Indexed { index_format, count, } => { - pass.set_index_buffer(buffer.slice(..), 0, *index_format); - pass.draw_indexed(0..*count, 0, batch_range.clone()); + let Some(index_buffer_slice) = mesh_allocator.mesh_index_slice(mesh_asset_id) + else { + return RenderCommandResult::Failure; + }; + + pass.set_index_buffer(index_buffer_slice.buffer.slice(..), 0, *index_format); + + pass.draw_indexed( + index_buffer_slice.range.start..(index_buffer_slice.range.start + count), + vertex_buffer_slice.range.start as i32, + batch_range.clone(), + ); } - GpuBufferInfo::NonIndexed => { + RenderMeshBufferInfo::NonIndexed => { pass.draw(0..gpu_mesh.vertex_count, batch_range.clone()); } } diff --git a/examples/2d/mesh2d_manual.rs b/examples/2d/mesh2d_manual.rs index 75b018c368..01195bc39e 100644 --- a/examples/2d/mesh2d_manual.rs +++ b/examples/2d/mesh2d_manual.rs @@ -11,7 +11,7 @@ use bevy::{ math::FloatOrd, prelude::*, render::{ - mesh::{GpuMesh, Indices, MeshVertexAttribute}, + mesh::{Indices, MeshVertexAttribute, RenderMesh}, render_asset::{RenderAssetUsages, RenderAssets}, render_phase::{ AddRenderCommand, DrawFunctions, PhaseItemExtraIndex, SetItemPipeline, @@ -352,7 +352,7 @@ pub fn queue_colored_mesh2d( mut pipelines: ResMut>, pipeline_cache: Res, msaa: Res, - render_meshes: Res>, + render_meshes: Res>, render_mesh_instances: Res, mut transparent_render_phases: ResMut>, mut views: Query<(Entity, &VisibleEntities, &ExtractedView)>, diff --git a/examples/shader/shader_instancing.rs 
b/examples/shader/shader_instancing.rs index 63b0c17908..9ae798b2a7 100644 --- a/examples/shader/shader_instancing.rs +++ b/examples/shader/shader_instancing.rs @@ -12,7 +12,9 @@ use bevy::{ prelude::*, render::{ extract_component::{ExtractComponent, ExtractComponentPlugin}, - mesh::{GpuBufferInfo, GpuMesh, MeshVertexBufferLayoutRef}, + mesh::{ + allocator::MeshAllocator, MeshVertexBufferLayoutRef, RenderMesh, RenderMeshBufferInfo, + }, render_asset::RenderAssets, render_phase::{ AddRenderCommand, DrawFunctions, PhaseItem, PhaseItemExtraIndex, RenderCommand, @@ -117,7 +119,7 @@ fn queue_custom( msaa: Res, mut pipelines: ResMut>, pipeline_cache: Res, - meshes: Res>, + meshes: Res>, render_mesh_instances: Res, material_meshes: Query>, mut transparent_render_phases: ResMut>, @@ -241,7 +243,11 @@ type DrawCustom = ( struct DrawMeshInstanced; impl RenderCommand

for DrawMeshInstanced { - type Param = (SRes>, SRes); + type Param = ( + SRes>, + SRes, + SRes, + ); type ViewQuery = (); type ItemQuery = Read; @@ -250,9 +256,12 @@ impl RenderCommand

for DrawMeshInstanced { item: &P, _view: (), instance_buffer: Option<&'w InstanceBuffer>, - (meshes, render_mesh_instances): SystemParamItem<'w, '_, Self::Param>, + (meshes, render_mesh_instances, mesh_allocator): SystemParamItem<'w, '_, Self::Param>, pass: &mut TrackedRenderPass<'w>, ) -> RenderCommandResult { + // A borrow check workaround. + let mesh_allocator = mesh_allocator.into_inner(); + let Some(mesh_instance) = render_mesh_instances.render_mesh_queue_data(item.entity()) else { return RenderCommandResult::Failure; @@ -263,20 +272,34 @@ impl RenderCommand

for DrawMeshInstanced { let Some(instance_buffer) = instance_buffer else { return RenderCommandResult::Failure; }; + let Some(vertex_buffer_slice) = + mesh_allocator.mesh_vertex_slice(&mesh_instance.mesh_asset_id) + else { + return RenderCommandResult::Failure; + }; - pass.set_vertex_buffer(0, gpu_mesh.vertex_buffer.slice(..)); + pass.set_vertex_buffer(0, vertex_buffer_slice.buffer.slice(..)); pass.set_vertex_buffer(1, instance_buffer.buffer.slice(..)); match &gpu_mesh.buffer_info { - GpuBufferInfo::Indexed { - buffer, + RenderMeshBufferInfo::Indexed { index_format, count, } => { - pass.set_index_buffer(buffer.slice(..), 0, *index_format); - pass.draw_indexed(0..*count, 0, 0..instance_buffer.length as u32); + let Some(index_buffer_slice) = + mesh_allocator.mesh_index_slice(&mesh_instance.mesh_asset_id) + else { + return RenderCommandResult::Failure; + }; + + pass.set_index_buffer(index_buffer_slice.buffer.slice(..), 0, *index_format); + pass.draw_indexed( + index_buffer_slice.range.start..(index_buffer_slice.range.start + count), + vertex_buffer_slice.range.start as i32, + 0..instance_buffer.length as u32, + ); } - GpuBufferInfo::NonIndexed => { + RenderMeshBufferInfo::NonIndexed => { pass.draw(0..gpu_mesh.vertex_count, 0..instance_buffer.length as u32); } }