diff --git a/Cargo.toml b/Cargo.toml index 502f73a17e..7383747e7f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4132,3 +4132,14 @@ name = "Clustered Decals" description = "Demonstrates clustered decals" category = "3D Rendering" wasm = false + +[[example]] +name = "occlusion_culling" +path = "examples/3d/occlusion_culling.rs" +doc-scrape-examples = true + +[package.metadata.example.occlusion_culling] +name = "Occlusion Culling" +description = "Demonstration of Occlusion Culling" +category = "3D Rendering" +wasm = false diff --git a/crates/bevy_core_pipeline/Cargo.toml b/crates/bevy_core_pipeline/Cargo.toml index 643ae1bf18..3994040369 100644 --- a/crates/bevy_core_pipeline/Cargo.toml +++ b/crates/bevy_core_pipeline/Cargo.toml @@ -47,6 +47,7 @@ nonmax = "0.5" smallvec = "1" thiserror = { version = "2", default-features = false } tracing = { version = "0.1", default-features = false, features = ["std"] } +bytemuck = { version = "1" } [lints] workspace = true diff --git a/crates/bevy_core_pipeline/src/core_2d/mod.rs b/crates/bevy_core_pipeline/src/core_2d/mod.rs index 60b245c417..f820a245ce 100644 --- a/crates/bevy_core_pipeline/src/core_2d/mod.rs +++ b/crates/bevy_core_pipeline/src/core_2d/mod.rs @@ -312,6 +312,8 @@ impl PhaseItem for AlphaMask2d { } impl BinnedPhaseItem for AlphaMask2d { + // Since 2D meshes presently can't be multidrawn, the batch set key is + // irrelevant. 
type BatchSetKey = BatchSetKey2d; type BinKey = AlphaMask2dBinKey; diff --git a/crates/bevy_core_pipeline/src/core_3d/mod.rs b/crates/bevy_core_pipeline/src/core_3d/mod.rs index 19b691429e..0d6df5c887 100644 --- a/crates/bevy_core_pipeline/src/core_3d/mod.rs +++ b/crates/bevy_core_pipeline/src/core_3d/mod.rs @@ -16,7 +16,9 @@ pub mod graph { #[derive(Debug, Hash, PartialEq, Eq, Clone, RenderLabel)] pub enum Node3d { MsaaWriteback, - Prepass, + EarlyPrepass, + EarlyDownsampleDepth, + LatePrepass, DeferredPrepass, CopyDeferredLightingId, EndPrepasses, @@ -25,6 +27,7 @@ pub mod graph { MainTransmissivePass, MainTransparentPass, EndMainPass, + LateDownsampleDepth, Taa, MotionBlur, Bloom, @@ -67,9 +70,10 @@ use core::ops::Range; use bevy_render::{ batching::gpu_preprocessing::{GpuPreprocessingMode, GpuPreprocessingSupport}, + experimental::occlusion_culling::OcclusionCulling, mesh::allocator::SlabId, render_phase::PhaseItemBatchSetKey, - view::{NoIndirectDrawing, RetainedViewEntity}, + view::{prepare_view_targets, NoIndirectDrawing, RetainedViewEntity}, }; pub use camera_3d::*; pub use main_opaque_pass_3d_node::*; @@ -114,8 +118,9 @@ use crate::{ }, dof::DepthOfFieldNode, prepass::{ - node::PrepassNode, AlphaMask3dPrepass, DeferredPrepass, DepthPrepass, MotionVectorPrepass, - NormalPrepass, Opaque3dPrepass, OpaqueNoLightmap3dBatchSetKey, OpaqueNoLightmap3dBinKey, + node::{EarlyPrepassNode, LatePrepassNode}, + AlphaMask3dPrepass, DeferredPrepass, DepthPrepass, MotionVectorPrepass, NormalPrepass, + Opaque3dPrepass, OpaqueNoLightmap3dBatchSetKey, OpaqueNoLightmap3dBinKey, ViewPrepassTextures, MOTION_VECTOR_PREPASS_FORMAT, NORMAL_PREPASS_FORMAT, }, skybox::SkyboxPlugin, @@ -161,6 +166,9 @@ impl Plugin for Core3dPlugin { ( sort_phase_system::.in_set(RenderSet::PhaseSort), sort_phase_system::.in_set(RenderSet::PhaseSort), + configure_occlusion_culling_view_targets + .after(prepare_view_targets) + .in_set(RenderSet::ManageViews), 
prepare_core_3d_depth_textures.in_set(RenderSet::PrepareResources), prepare_core_3d_transmission_textures.in_set(RenderSet::PrepareResources), prepare_prepass_textures.in_set(RenderSet::PrepareResources), @@ -169,7 +177,8 @@ impl Plugin for Core3dPlugin { render_app .add_render_sub_graph(Core3d) - .add_render_graph_node::>(Core3d, Node3d::Prepass) + .add_render_graph_node::>(Core3d, Node3d::EarlyPrepass) + .add_render_graph_node::>(Core3d, Node3d::LatePrepass) .add_render_graph_node::>( Core3d, Node3d::DeferredPrepass, @@ -200,7 +209,8 @@ impl Plugin for Core3dPlugin { .add_render_graph_edges( Core3d, ( - Node3d::Prepass, + Node3d::EarlyPrepass, + Node3d::LatePrepass, Node3d::DeferredPrepass, Node3d::CopyDeferredLightingId, Node3d::EndPrepasses, @@ -898,6 +908,28 @@ pub fn prepare_core_3d_transmission_textures( } } +/// Sets the `TEXTURE_BINDING` flag on the depth texture if necessary for +/// occlusion culling. +/// +/// We need that flag to be set in order to read from the texture. +fn configure_occlusion_culling_view_targets( + mut view_targets: Query< + &mut Camera3d, + ( + With, + Without, + With, + Without, + ), + >, +) { + for mut camera_3d in &mut view_targets { + let mut depth_texture_usages = TextureUsages::from(camera_3d.depth_texture_usages); + depth_texture_usages |= TextureUsages::TEXTURE_BINDING; + camera_3d.depth_texture_usages = depth_texture_usages.into(); + } +} + // Disable MSAA and warn if using deferred rendering pub fn check_msaa(mut deferred_views: Query<&mut Msaa, (With, With)>) { for mut msaa in deferred_views.iter_mut() { diff --git a/crates/bevy_pbr/src/meshlet/downsample_depth.wgsl b/crates/bevy_core_pipeline/src/experimental/mip_generation/downsample_depth.wgsl similarity index 92% rename from crates/bevy_pbr/src/meshlet/downsample_depth.wgsl rename to crates/bevy_core_pipeline/src/experimental/mip_generation/downsample_depth.wgsl index 80dd7d4baa..d24afa390d 100644 --- a/crates/bevy_pbr/src/meshlet/downsample_depth.wgsl +++ 
b/crates/bevy_core_pipeline/src/experimental/mip_generation/downsample_depth.wgsl @@ -1,8 +1,16 @@ #ifdef MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT @group(0) @binding(0) var mip_0: array; // Per pixel #else +#ifdef MESHLET @group(0) @binding(0) var mip_0: array; // Per pixel -#endif +#else // MESHLET +#ifdef MULTISAMPLE +@group(0) @binding(0) var mip_0: texture_depth_multisampled_2d; +#else // MULTISAMPLE +@group(0) @binding(0) var mip_0: texture_depth_2d; +#endif // MULTISAMPLE +#endif // MESHLET +#endif // MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT @group(0) @binding(1) var mip_1: texture_storage_2d; @group(0) @binding(2) var mip_2: texture_storage_2d; @group(0) @binding(3) var mip_3: texture_storage_2d; @@ -304,9 +312,25 @@ fn load_mip_0(x: u32, y: u32) -> f32 { let i = y * constants.view_width + x; #ifdef MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT return bitcast(u32(mip_0[i] >> 32u)); -#else +#else // MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT +#ifdef MESHLET return bitcast(mip_0[i]); -#endif +#else // MESHLET + // Downsample the top level. +#ifdef MULTISAMPLE + // The top level is multisampled, so we need to loop over all the samples + // and reduce them to 1. + var result = textureLoad(mip_0, vec2(x, y), 0); + let sample_count = i32(textureNumSamples(mip_0)); + for (var sample = 1; sample < sample_count; sample += 1) { + result = min(result, textureLoad(mip_0, vec2(x, y), sample)); + } + return result; +#else // MULTISAMPLE + return textureLoad(mip_0, vec2(x, y), 0); +#endif // MULTISAMPLE +#endif // MESHLET +#endif // MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT } fn reduce_4(v: vec4f) -> f32 { diff --git a/crates/bevy_core_pipeline/src/experimental/mip_generation/mod.rs b/crates/bevy_core_pipeline/src/experimental/mip_generation/mod.rs new file mode 100644 index 0000000000..3140408886 --- /dev/null +++ b/crates/bevy_core_pipeline/src/experimental/mip_generation/mod.rs @@ -0,0 +1,708 @@ +//! Downsampling of textures to produce mipmap levels. +//! 
+//! Currently, this module only supports generation of hierarchical Z buffers +//! for occlusion culling. It's marked experimental because the shader is +//! designed only for power-of-two texture sizes and is slightly incorrect for +//! non-power-of-two depth buffer sizes. + +use core::array; + +use bevy_app::{App, Plugin}; +use bevy_asset::{load_internal_asset, Handle}; +use bevy_derive::{Deref, DerefMut}; +use bevy_ecs::{ + component::Component, + entity::Entity, + prelude::{resource_exists, Without}, + query::{QueryItem, With}, + resource::Resource, + schedule::IntoSystemConfigs as _, + system::{lifetimeless::Read, Commands, Local, Query, Res, ResMut}, + world::{FromWorld, World}, +}; +use bevy_math::{uvec2, UVec2, Vec4Swizzles as _}; +use bevy_render::{ + experimental::occlusion_culling::OcclusionCulling, + render_graph::{NodeRunError, RenderGraphApp, RenderGraphContext, ViewNode, ViewNodeRunner}, + render_resource::{ + binding_types::{sampler, texture_2d, texture_2d_multisampled, texture_storage_2d}, + BindGroup, BindGroupEntries, BindGroupLayout, BindGroupLayoutEntries, + CachedComputePipelineId, ComputePassDescriptor, ComputePipeline, ComputePipelineDescriptor, + DownlevelFlags, Extent3d, IntoBinding, PipelineCache, PushConstantRange, Sampler, + SamplerBindingType, SamplerDescriptor, Shader, ShaderStages, SpecializedComputePipeline, + SpecializedComputePipelines, StorageTextureAccess, TextureAspect, TextureDescriptor, + TextureDimension, TextureFormat, TextureSampleType, TextureUsages, TextureView, + TextureViewDescriptor, TextureViewDimension, + }, + renderer::{RenderAdapter, RenderContext, RenderDevice}, + texture::TextureCache, + view::{ExtractedView, NoIndirectDrawing, ViewDepthTexture}, + Render, RenderApp, RenderSet, +}; +use bitflags::bitflags; + +use crate::{ + core_3d::{ + graph::{Core3d, Node3d}, + prepare_core_3d_depth_textures, + }, + prepass::{DeferredPrepass, DepthPrepass}, +}; + +/// Identifies the `downsample_depth.wgsl` shader. 
+pub const DOWNSAMPLE_DEPTH_SHADER_HANDLE: Handle = + Handle::weak_from_u128(3876351454330663524); + +/// The maximum number of mip levels that we can produce. +/// +/// 2^12 is 4096, so that's the maximum size of the depth buffer that we +/// support. +pub const DEPTH_PYRAMID_MIP_COUNT: usize = 12; + +/// A plugin that allows Bevy to repeatedly downsample textures to create +/// mipmaps. +/// +/// Currently, this is only used for hierarchical Z buffer generation for the +/// purposes of occlusion culling. +pub struct MipGenerationPlugin; + +impl Plugin for MipGenerationPlugin { + fn build(&self, app: &mut App) { + load_internal_asset!( + app, + DOWNSAMPLE_DEPTH_SHADER_HANDLE, + "downsample_depth.wgsl", + Shader::from_wgsl + ); + + let Some(render_app) = app.get_sub_app_mut(RenderApp) else { + return; + }; + + render_app + .init_resource::>() + .add_render_graph_node::>( + Core3d, + Node3d::EarlyDownsampleDepth, + ) + .add_render_graph_node::>( + Core3d, + Node3d::LateDownsampleDepth, + ) + .add_render_graph_edges( + Core3d, + ( + Node3d::EarlyPrepass, + Node3d::EarlyDownsampleDepth, + Node3d::LatePrepass, + Node3d::DeferredPrepass, + ), + ) + .add_render_graph_edges( + Core3d, + ( + Node3d::EndMainPass, + Node3d::LateDownsampleDepth, + Node3d::EndMainPassPostProcessing, + ), + ) + .add_systems( + Render, + create_downsample_depth_pipelines.in_set(RenderSet::Prepare), + ) + .add_systems( + Render, + ( + prepare_view_depth_pyramids, + prepare_downsample_depth_view_bind_groups, + ) + .chain() + .in_set(RenderSet::PrepareResources) + .run_if(resource_exists::) + .after(prepare_core_3d_depth_textures), + ); + } + + fn finish(&self, app: &mut App) { + let Some(render_app) = app.get_sub_app_mut(RenderApp) else { + return; + }; + render_app.init_resource::(); + } +} + +/// The nodes that produce a hierarchical Z-buffer, also known as a depth +/// pyramid. 
+/// +/// This runs the single-pass downsampling (SPD) shader with the *min* filter in +/// order to generate a series of mipmaps for the Z buffer. The resulting +/// hierarchical Z buffer can be used for occlusion culling. +/// +/// There are two instances of this node. The *early* downsample depth pass is +/// the first hierarchical Z-buffer stage, which runs after the early prepass +/// and before the late prepass. It prepares the Z-buffer for the bounding box +/// tests that the late mesh preprocessing stage will perform. The *late* +/// downsample depth pass runs at the end of the main phase. It prepares the +/// Z-buffer for the occlusion culling that the early mesh preprocessing phase +/// of the *next* frame will perform. +/// +/// This node won't do anything if occlusion culling isn't on. +#[derive(Default)] +pub struct DownsampleDepthNode; + +impl ViewNode for DownsampleDepthNode { + type ViewQuery = ( + Read, + Read, + Read, + ); + + fn run<'w>( + &self, + render_graph_context: &mut RenderGraphContext, + render_context: &mut RenderContext<'w>, + (view_depth_pyramid, view_downsample_depth_bind_group, view_depth_texture): QueryItem< + 'w, + Self::ViewQuery, + >, + world: &'w World, + ) -> Result<(), NodeRunError> { + // Produce a depth pyramid from the current depth buffer for a single + // view. The resulting depth pyramid can be used for occlusion testing. + + let downsample_depth_pipelines = world.resource::(); + let pipeline_cache = world.resource::(); + + // Despite the name "single-pass downsampling", we actually need two + // passes because of the lack of `coherent` buffers in WGPU/WGSL. + // Between each pass, there's an implicit synchronization barrier. + + // Fetch the appropriate pipeline ID, depending on whether the depth + // buffer is multisampled or not. 
+ let (Some(first_downsample_depth_pipeline_id), Some(second_downsample_depth_pipeline_id)) = + (if view_depth_texture.texture.sample_count() > 1 { + ( + downsample_depth_pipelines.first_multisample.pipeline_id, + downsample_depth_pipelines.second_multisample.pipeline_id, + ) + } else { + ( + downsample_depth_pipelines.first.pipeline_id, + downsample_depth_pipelines.second.pipeline_id, + ) + }) + else { + return Ok(()); + }; + + // Fetch the pipelines for the two passes. + let (Some(first_downsample_depth_pipeline), Some(second_downsample_depth_pipeline)) = ( + pipeline_cache.get_compute_pipeline(first_downsample_depth_pipeline_id), + pipeline_cache.get_compute_pipeline(second_downsample_depth_pipeline_id), + ) else { + return Ok(()); + }; + + // Run the depth downsampling. + let view_size = uvec2( + view_depth_texture.texture.width(), + view_depth_texture.texture.height(), + ); + view_depth_pyramid.downsample_depth( + &format!("{:?}", render_graph_context.label()), + render_context, + view_size, + view_downsample_depth_bind_group, + first_downsample_depth_pipeline, + second_downsample_depth_pipeline, + ); + Ok(()) + } +} + +/// A single depth downsample pipeline. +#[derive(Resource)] +pub struct DownsampleDepthPipeline { + /// The bind group layout for this pipeline. + bind_group_layout: BindGroupLayout, + /// A handle that identifies the compiled shader. + pipeline_id: Option, +} + +impl DownsampleDepthPipeline { + /// Creates a new [`DownsampleDepthPipeline`] from a bind group layout. + /// + /// This doesn't actually specialize the pipeline; that must be done + /// afterward. + fn new(bind_group_layout: BindGroupLayout) -> DownsampleDepthPipeline { + DownsampleDepthPipeline { + bind_group_layout, + pipeline_id: None, + } + } +} + +/// Stores all depth buffer downsampling pipelines. +#[derive(Resource)] +pub struct DownsampleDepthPipelines { + /// The first pass of the pipeline, when the depth buffer is *not* + /// multisampled. 
+ first: DownsampleDepthPipeline, + /// The second pass of the pipeline, when the depth buffer is *not* + /// multisampled. + second: DownsampleDepthPipeline, + /// The first pass of the pipeline, when the depth buffer is multisampled. + first_multisample: DownsampleDepthPipeline, + /// The second pass of the pipeline, when the depth buffer is multisampled. + second_multisample: DownsampleDepthPipeline, + /// The sampler that the depth downsampling shader uses to sample the depth + /// buffer. + sampler: Sampler, +} + +/// Creates the [`DownsampleDepthPipelines`] if downsampling is supported on the +/// current platform. +fn create_downsample_depth_pipelines( + mut commands: Commands, + render_device: Res, + render_adapter: Res, + pipeline_cache: Res, + mut specialized_compute_pipelines: ResMut>, + mut has_run: Local, +) { + // Only run once. + // We can't use a `resource_exists` or similar run condition here because + // this function might fail to create downsample depth pipelines if the + // current platform doesn't support compute shaders. + if *has_run { + return; + } + *has_run = true; + + // If we don't have compute shaders, we can't invoke the downsample depth + // compute shader. + if !render_adapter + .get_downlevel_capabilities() + .flags + .contains(DownlevelFlags::COMPUTE_SHADERS) + { + return; + } + + // Create the bind group layouts. The bind group layouts are identical + // between the first and second passes, so the only thing we need to + // treat specially is the type of the first mip level (non-multisampled + // or multisampled). + let standard_bind_group_layout = + create_downsample_depth_bind_group_layout(&render_device, false); + let multisampled_bind_group_layout = + create_downsample_depth_bind_group_layout(&render_device, true); + + // Create the depth pyramid sampler. This is shared among all shaders. 
+ let sampler = render_device.create_sampler(&SamplerDescriptor { + label: Some("depth pyramid sampler"), + ..SamplerDescriptor::default() + }); + + // Initialize the pipelines. + let mut downsample_depth_pipelines = DownsampleDepthPipelines { + first: DownsampleDepthPipeline::new(standard_bind_group_layout.clone()), + second: DownsampleDepthPipeline::new(standard_bind_group_layout.clone()), + first_multisample: DownsampleDepthPipeline::new(multisampled_bind_group_layout.clone()), + second_multisample: DownsampleDepthPipeline::new(multisampled_bind_group_layout.clone()), + sampler, + }; + + // Specialize each pipeline with the appropriate + // `DownsampleDepthPipelineKey`. + downsample_depth_pipelines.first.pipeline_id = Some(specialized_compute_pipelines.specialize( + &pipeline_cache, + &downsample_depth_pipelines.first, + DownsampleDepthPipelineKey::empty(), + )); + downsample_depth_pipelines.second.pipeline_id = Some(specialized_compute_pipelines.specialize( + &pipeline_cache, + &downsample_depth_pipelines.second, + DownsampleDepthPipelineKey::SECOND_PHASE, + )); + downsample_depth_pipelines.first_multisample.pipeline_id = + Some(specialized_compute_pipelines.specialize( + &pipeline_cache, + &downsample_depth_pipelines.first_multisample, + DownsampleDepthPipelineKey::MULTISAMPLE, + )); + downsample_depth_pipelines.second_multisample.pipeline_id = + Some(specialized_compute_pipelines.specialize( + &pipeline_cache, + &downsample_depth_pipelines.second_multisample, + DownsampleDepthPipelineKey::SECOND_PHASE | DownsampleDepthPipelineKey::MULTISAMPLE, + )); + + commands.insert_resource(downsample_depth_pipelines); +} + +/// Creates a single bind group layout for the downsample depth pass. 
+fn create_downsample_depth_bind_group_layout( + render_device: &RenderDevice, + is_multisampled: bool, +) -> BindGroupLayout { + render_device.create_bind_group_layout( + if is_multisampled { + "downsample multisample depth bind group layout" + } else { + "downsample depth bind group layout" + }, + &BindGroupLayoutEntries::sequential( + ShaderStages::COMPUTE, + ( + // We only care about the multisample status of the depth buffer + // for the first mip level. After the first mip level is + // sampled, we drop to a single sample. + if is_multisampled { + texture_2d_multisampled(TextureSampleType::Depth) + } else { + texture_2d(TextureSampleType::Depth) + }, + // All the mip levels follow: + texture_storage_2d(TextureFormat::R32Float, StorageTextureAccess::WriteOnly), + texture_storage_2d(TextureFormat::R32Float, StorageTextureAccess::WriteOnly), + texture_storage_2d(TextureFormat::R32Float, StorageTextureAccess::WriteOnly), + texture_storage_2d(TextureFormat::R32Float, StorageTextureAccess::WriteOnly), + texture_storage_2d(TextureFormat::R32Float, StorageTextureAccess::WriteOnly), + texture_storage_2d(TextureFormat::R32Float, StorageTextureAccess::ReadWrite), + texture_storage_2d(TextureFormat::R32Float, StorageTextureAccess::WriteOnly), + texture_storage_2d(TextureFormat::R32Float, StorageTextureAccess::WriteOnly), + texture_storage_2d(TextureFormat::R32Float, StorageTextureAccess::WriteOnly), + texture_storage_2d(TextureFormat::R32Float, StorageTextureAccess::WriteOnly), + texture_storage_2d(TextureFormat::R32Float, StorageTextureAccess::WriteOnly), + texture_storage_2d(TextureFormat::R32Float, StorageTextureAccess::WriteOnly), + sampler(SamplerBindingType::NonFiltering), + ), + ), + ) +} + +bitflags! { + /// Uniquely identifies a configuration of the downsample depth shader. 
+ /// + /// Note that meshlets maintain their downsample depth shaders on their own + /// and don't use this infrastructure; thus there's no flag for meshlets in + /// here, even though the shader has defines for it. + #[derive(Clone, Copy, PartialEq, Eq, Hash)] + pub struct DownsampleDepthPipelineKey: u8 { + /// True if the depth buffer is multisampled. + const MULTISAMPLE = 1; + /// True if this shader is the second phase of the downsample depth + /// process; false if this shader is the first phase. + const SECOND_PHASE = 2; + } +} + +impl SpecializedComputePipeline for DownsampleDepthPipeline { + type Key = DownsampleDepthPipelineKey; + + fn specialize(&self, key: Self::Key) -> ComputePipelineDescriptor { + let mut shader_defs = vec![]; + if key.contains(DownsampleDepthPipelineKey::MULTISAMPLE) { + shader_defs.push("MULTISAMPLE".into()); + } + + let label = format!( + "downsample depth{}{} pipeline", + if key.contains(DownsampleDepthPipelineKey::MULTISAMPLE) { + " multisample" + } else { + "" + }, + if key.contains(DownsampleDepthPipelineKey::SECOND_PHASE) { + " second phase" + } else { + " first phase" + } + ) + .into(); + + ComputePipelineDescriptor { + label: Some(label), + layout: vec![self.bind_group_layout.clone()], + push_constant_ranges: vec![PushConstantRange { + stages: ShaderStages::COMPUTE, + range: 0..8, + }], + shader: DOWNSAMPLE_DEPTH_SHADER_HANDLE, + shader_defs, + entry_point: if key.contains(DownsampleDepthPipelineKey::SECOND_PHASE) { + "downsample_depth_second".into() + } else { + "downsample_depth_first".into() + }, + zero_initialize_workgroup_memory: false, + } + } +} + +/// Stores a placeholder texture that can be bound to a depth pyramid binding if +/// no depth pyramid is needed. 
+#[derive(Resource, Deref, DerefMut)] +pub struct DepthPyramidDummyTexture(TextureView); + +impl FromWorld for DepthPyramidDummyTexture { + fn from_world(world: &mut World) -> Self { + let render_device = world.resource::(); + + DepthPyramidDummyTexture(create_depth_pyramid_dummy_texture( + render_device, + "depth pyramid dummy texture", + "depth pyramid dummy texture view", + )) + } +} + +/// Creates a placeholder texture that can be bound to a depth pyramid binding +/// if no depth pyramid is needed. +pub fn create_depth_pyramid_dummy_texture( + render_device: &RenderDevice, + texture_label: &'static str, + texture_view_label: &'static str, +) -> TextureView { + render_device + .create_texture(&TextureDescriptor { + label: Some(texture_label), + size: Extent3d { + width: 1, + height: 1, + depth_or_array_layers: 1, + }, + mip_level_count: 1, + sample_count: 1, + dimension: TextureDimension::D2, + format: TextureFormat::R32Float, + usage: TextureUsages::STORAGE_BINDING, + view_formats: &[], + }) + .create_view(&TextureViewDescriptor { + label: Some(texture_view_label), + format: Some(TextureFormat::R32Float), + dimension: Some(TextureViewDimension::D2), + aspect: TextureAspect::All, + base_mip_level: 0, + mip_level_count: Some(1), + base_array_layer: 0, + array_layer_count: Some(1), + }) +} + +/// Stores a hierarchical Z-buffer for a view, which is a series of mipmaps +/// useful for efficient occlusion culling. +/// +/// This will only be present on a view when occlusion culling is enabled. +#[derive(Component)] +pub struct ViewDepthPyramid { + /// A texture view containing the entire depth texture. + pub all_mips: TextureView, + /// A series of texture views containing one mip level each. + pub mips: [TextureView; DEPTH_PYRAMID_MIP_COUNT], + /// The total number of mipmap levels. + /// + /// This is the base-2 logarithm of the greatest dimension of the depth + /// buffer, rounded up. 
+ pub mip_count: u32, +} + +impl ViewDepthPyramid { + /// Allocates a new depth pyramid for a depth buffer with the given size. + pub fn new( + render_device: &RenderDevice, + texture_cache: &mut TextureCache, + depth_pyramid_dummy_texture: &TextureView, + size: UVec2, + texture_label: &'static str, + texture_view_label: &'static str, + ) -> ViewDepthPyramid { + // Calculate the size of the depth pyramid. + let depth_pyramid_size = Extent3d { + width: size.x.div_ceil(2), + height: size.y.div_ceil(2), + depth_or_array_layers: 1, + }; + + // Calculate the number of mip levels we need. + let depth_pyramid_mip_count = depth_pyramid_size.max_mips(TextureDimension::D2); + + // Create the depth pyramid. + let depth_pyramid = texture_cache.get( + render_device, + TextureDescriptor { + label: Some(texture_label), + size: depth_pyramid_size, + mip_level_count: depth_pyramid_mip_count, + sample_count: 1, + dimension: TextureDimension::D2, + format: TextureFormat::R32Float, + usage: TextureUsages::STORAGE_BINDING | TextureUsages::TEXTURE_BINDING, + view_formats: &[], + }, + ); + + // Create individual views for each level of the depth pyramid. + let depth_pyramid_mips = array::from_fn(|i| { + if (i as u32) < depth_pyramid_mip_count { + depth_pyramid.texture.create_view(&TextureViewDescriptor { + label: Some(texture_view_label), + format: Some(TextureFormat::R32Float), + dimension: Some(TextureViewDimension::D2), + aspect: TextureAspect::All, + base_mip_level: i as u32, + mip_level_count: Some(1), + base_array_layer: 0, + array_layer_count: Some(1), + }) + } else { + (*depth_pyramid_dummy_texture).clone() + } + }); + + // Create the view for the depth pyramid as a whole. + let depth_pyramid_all_mips = depth_pyramid.default_view.clone(); + + Self { + all_mips: depth_pyramid_all_mips, + mips: depth_pyramid_mips, + mip_count: depth_pyramid_mip_count, + } + } + + /// Creates a bind group that allows the depth buffer to be attached to the + /// `downsample_depth.wgsl` shader. 
+ pub fn create_bind_group<'a, R>( + &'a self, + render_device: &RenderDevice, + label: &'static str, + bind_group_layout: &BindGroupLayout, + source_image: R, + sampler: &'a Sampler, + ) -> BindGroup + where + R: IntoBinding<'a>, + { + render_device.create_bind_group( + label, + bind_group_layout, + &BindGroupEntries::sequential(( + source_image, + &self.mips[0], + &self.mips[1], + &self.mips[2], + &self.mips[3], + &self.mips[4], + &self.mips[5], + &self.mips[6], + &self.mips[7], + &self.mips[8], + &self.mips[9], + &self.mips[10], + &self.mips[11], + sampler, + )), + ) + } + + /// Invokes the shaders to generate the hierarchical Z-buffer. + /// + /// This is intended to be invoked as part of a render node. + pub fn downsample_depth( + &self, + label: &str, + render_context: &mut RenderContext, + view_size: UVec2, + downsample_depth_bind_group: &BindGroup, + downsample_depth_first_pipeline: &ComputePipeline, + downsample_depth_second_pipeline: &ComputePipeline, + ) { + let command_encoder = render_context.command_encoder(); + let mut downsample_pass = command_encoder.begin_compute_pass(&ComputePassDescriptor { + label: Some(label), + timestamp_writes: None, + }); + downsample_pass.set_pipeline(downsample_depth_first_pipeline); + // Pass the mip count and the texture width as push constants, for + // simplicity. + downsample_pass.set_push_constants(0, bytemuck::cast_slice(&[self.mip_count, view_size.x])); + downsample_pass.set_bind_group(0, downsample_depth_bind_group, &[]); + downsample_pass.dispatch_workgroups(view_size.x.div_ceil(64), view_size.y.div_ceil(64), 1); + + if self.mip_count >= 7 { + downsample_pass.set_pipeline(downsample_depth_second_pipeline); + downsample_pass.dispatch_workgroups(1, 1, 1); + } + } +} + +/// Creates depth pyramids for views that have occlusion culling enabled. 
+fn prepare_view_depth_pyramids( + mut commands: Commands, + render_device: Res, + mut texture_cache: ResMut, + depth_pyramid_dummy_texture: Res, + views: Query< + (Entity, &ExtractedView), + ( + With, + Without, + With, + Without, + ), + >, +) { + for (view_entity, view) in &views { + commands.entity(view_entity).insert(ViewDepthPyramid::new( + &render_device, + &mut texture_cache, + &depth_pyramid_dummy_texture, + view.viewport.zw(), + "view depth pyramid texture", + "view depth pyramid texture view", + )); + } +} + +/// The bind group that we use to attach the depth buffer and depth pyramid for +/// a view to the `downsample_depth.wgsl` shader. +/// +/// This will only be present for a view if occlusion culling is enabled. +#[derive(Component, Deref, DerefMut)] +pub struct ViewDownsampleDepthBindGroup(BindGroup); + +/// Creates the [`ViewDownsampleDepthBindGroup`]s for all views with occlusion +/// culling enabled. +fn prepare_downsample_depth_view_bind_groups( + mut commands: Commands, + render_device: Res, + downsample_depth_pipelines: Res, + view_depth_textures: Query<(Entity, &ViewDepthPyramid, &ViewDepthTexture)>, +) { + for (view_entity, view_depth_pyramid, view_depth_texture) in &view_depth_textures { + let is_multisampled = view_depth_texture.texture.sample_count() > 1; + commands + .entity(view_entity) + .insert(ViewDownsampleDepthBindGroup( + view_depth_pyramid.create_bind_group( + &render_device, + if is_multisampled { + "downsample multisample depth bind group" + } else { + "downsample depth bind group" + }, + if is_multisampled { + &downsample_depth_pipelines + .first_multisample + .bind_group_layout + } else { + &downsample_depth_pipelines.first.bind_group_layout + }, + view_depth_texture.view(), + &downsample_depth_pipelines.sampler, + ), + )); + } +} diff --git a/crates/bevy_core_pipeline/src/experimental/mod.rs b/crates/bevy_core_pipeline/src/experimental/mod.rs new file mode 100644 index 0000000000..4f957477ea --- /dev/null +++ 
b/crates/bevy_core_pipeline/src/experimental/mod.rs @@ -0,0 +1,11 @@ +//! Experimental rendering features. +//! +//! Experimental features are features with known problems, missing features, +//! compatibility issues, low performance, and/or future breaking changes, but +//! are included nonetheless for testing purposes. + +pub mod mip_generation; + +pub mod taa { + pub use crate::taa::{TemporalAntiAliasNode, TemporalAntiAliasPlugin, TemporalAntiAliasing}; +} diff --git a/crates/bevy_core_pipeline/src/lib.rs b/crates/bevy_core_pipeline/src/lib.rs index 6c2ee5bec4..49b9b7a20b 100644 --- a/crates/bevy_core_pipeline/src/lib.rs +++ b/crates/bevy_core_pipeline/src/lib.rs @@ -14,6 +14,7 @@ pub mod core_2d; pub mod core_3d; pub mod deferred; pub mod dof; +pub mod experimental; pub mod fullscreen_vertex_shader; pub mod fxaa; pub mod motion_blur; @@ -29,17 +30,6 @@ pub mod upscaling; pub use skybox::Skybox; -/// Experimental features that are not yet finished. Please report any issues you encounter! -/// -/// Expect bugs, missing features, compatibility issues, low performance, and/or future breaking changes. -pub mod experimental { - pub mod taa { - pub use crate::taa::{ - TemporalAntiAliasNode, TemporalAntiAliasPlugin, TemporalAntiAliasing, - }; - } -} - /// The core pipeline prelude. /// /// This includes the most common types in this crate, re-exported for your convenience. 
@@ -56,6 +46,7 @@ use crate::{ core_3d::Core3dPlugin, deferred::copy_lighting_id::CopyDeferredLightingIdPlugin, dof::DepthOfFieldPlugin, + experimental::mip_generation::MipGenerationPlugin, fullscreen_vertex_shader::FULLSCREEN_SHADER_HANDLE, fxaa::FxaaPlugin, motion_blur::MotionBlurPlugin, @@ -87,10 +78,8 @@ impl Plugin for CorePipelinePlugin { .register_type::() .register_type::() .register_type::() + .add_plugins((Core2dPlugin, Core3dPlugin, CopyDeferredLightingIdPlugin)) .add_plugins(( - Core2dPlugin, - Core3dPlugin, - CopyDeferredLightingIdPlugin, BlitPlugin, MsaaWritebackPlugin, TonemappingPlugin, @@ -103,6 +92,7 @@ impl Plugin for CorePipelinePlugin { SmaaPlugin, PostProcessingPlugin, OrderIndependentTransparencyPlugin, + MipGenerationPlugin, )); } } diff --git a/crates/bevy_core_pipeline/src/prepass/mod.rs b/crates/bevy_core_pipeline/src/prepass/mod.rs index 7fb2dfcea9..1e663a79a4 100644 --- a/crates/bevy_core_pipeline/src/prepass/mod.rs +++ b/crates/bevy_core_pipeline/src/prepass/mod.rs @@ -78,6 +78,7 @@ pub struct DeferredPrepass; pub struct PreviousViewData { pub view_from_world: Mat4, pub clip_from_world: Mat4, + pub clip_from_view: Mat4, } #[derive(Resource, Default)] diff --git a/crates/bevy_core_pipeline/src/prepass/node.rs b/crates/bevy_core_pipeline/src/prepass/node.rs index 9019890d7e..942983b063 100644 --- a/crates/bevy_core_pipeline/src/prepass/node.rs +++ b/crates/bevy_core_pipeline/src/prepass/node.rs @@ -2,11 +2,12 @@ use bevy_ecs::{prelude::*, query::QueryItem}; use bevy_render::{ camera::ExtractedCamera, diagnostic::RecordDiagnostics, + experimental::occlusion_culling::OcclusionCulling, render_graph::{NodeRunError, RenderGraphContext, ViewNode}, render_phase::{TrackedRenderPass, ViewBinnedRenderPhases}, render_resource::{CommandEncoderDescriptor, PipelineCache, RenderPassDescriptor, StoreOp}, renderer::RenderContext, - view::{ExtractedView, ViewDepthTexture, ViewUniformOffset}, + view::{ExtractedView, NoIndirectDrawing, ViewDepthTexture, 
ViewUniformOffset}, }; use tracing::error; #[cfg(feature = "trace")] @@ -19,13 +20,40 @@ use super::{ ViewPrepassTextures, }; -/// Render node used by the prepass. +/// The phase of the prepass that draws meshes that were visible last frame. /// -/// By default, inserted before the main pass in the render graph. +/// If occlusion culling isn't in use, this prepass simply draws all meshes. +/// +/// Like all prepass nodes, this is inserted before the main pass in the render +/// graph. #[derive(Default)] -pub struct PrepassNode; +pub struct EarlyPrepassNode; -impl ViewNode for PrepassNode { +impl ViewNode for EarlyPrepassNode { + type ViewQuery = ::ViewQuery; + + fn run<'w>( + &self, + graph: &mut RenderGraphContext, + render_context: &mut RenderContext<'w>, + view_query: QueryItem<'w, Self::ViewQuery>, + world: &'w World, + ) -> Result<(), NodeRunError> { + run_prepass(graph, render_context, view_query, world, "early prepass") + } +} + +/// The phase of the prepass that runs after occlusion culling against the +/// meshes that were visible last frame. +/// +/// If occlusion culling isn't in use, this is a no-op. +/// +/// Like all prepass nodes, this is inserted before the main pass in the render +/// graph. +#[derive(Default)] +pub struct LatePrepassNode; + +impl ViewNode for LatePrepassNode { type ViewQuery = ( &'static ExtractedCamera, &'static ExtractedView, @@ -36,156 +64,176 @@ impl ViewNode for PrepassNode { Option<&'static RenderSkyboxPrepassPipeline>, Option<&'static SkyboxPrepassBindGroup>, Option<&'static PreviousViewUniformOffset>, + Has, + Has, ); fn run<'w>( &self, graph: &mut RenderGraphContext, render_context: &mut RenderContext<'w>, - ( - camera, - extracted_view, - view_depth_texture, - view_prepass_textures, - view_uniform_offset, - deferred_prepass, + query: QueryItem<'w, Self::ViewQuery>, + world: &'w World, + ) -> Result<(), NodeRunError> { + // We only need a late prepass if we have occlusion culling and indirect + // drawing. 
+ let (_, _, _, _, _, _, _, _, _, occlusion_culling, no_indirect_drawing) = query; + if !occlusion_culling || no_indirect_drawing { + return Ok(()); + } + + run_prepass(graph, render_context, query, world, "late prepass") + } +} + +/// Runs a prepass that draws all meshes to the depth buffer, and possibly +/// normal and motion vector buffers as well. +/// +/// If occlusion culling isn't in use, and a prepass is enabled, then there's +/// only one prepass. If occlusion culling is in use, then any prepass is split +/// into two: an *early* prepass and a *late* prepass. The early prepass draws +/// what was visible last frame, and the last prepass performs occlusion culling +/// against a conservative hierarchical Z buffer before drawing unoccluded +/// meshes. +fn run_prepass<'w>( + graph: &mut RenderGraphContext, + render_context: &mut RenderContext<'w>, + ( + camera, + extracted_view, + view_depth_texture, + view_prepass_textures, + view_uniform_offset, + deferred_prepass, + skybox_prepass_pipeline, + skybox_prepass_bind_group, + view_prev_uniform_offset, + _, + _, + ): QueryItem<'w, ::ViewQuery>, + world: &'w World, + label: &'static str, +) -> Result<(), NodeRunError> { + let (Some(opaque_prepass_phases), Some(alpha_mask_prepass_phases)) = ( + world.get_resource::>(), + world.get_resource::>(), + ) else { + return Ok(()); + }; + + let (Some(opaque_prepass_phase), Some(alpha_mask_prepass_phase)) = ( + opaque_prepass_phases.get(&extracted_view.retained_view_entity), + alpha_mask_prepass_phases.get(&extracted_view.retained_view_entity), + ) else { + return Ok(()); + }; + + let diagnostics = render_context.diagnostic_recorder(); + + let mut color_attachments = vec![ + view_prepass_textures + .normal + .as_ref() + .map(|normals_texture| normals_texture.get_attachment()), + view_prepass_textures + .motion_vectors + .as_ref() + .map(|motion_vectors_texture| motion_vectors_texture.get_attachment()), + // Use None in place of deferred attachments + None, + None, + ]; + + 
// If all color attachments are none: clear the color attachment list so that no fragment shader is required + if color_attachments.iter().all(Option::is_none) { + color_attachments.clear(); + } + + let depth_stencil_attachment = Some(view_depth_texture.get_attachment(StoreOp::Store)); + + let view_entity = graph.view_entity(); + render_context.add_command_buffer_generation_task(move |render_device| { + #[cfg(feature = "trace")] + let _prepass_span = info_span!("prepass").entered(); + + // Command encoder setup + let mut command_encoder = render_device.create_command_encoder(&CommandEncoderDescriptor { + label: Some("prepass_command_encoder"), + }); + + // Render pass setup + let render_pass = command_encoder.begin_render_pass(&RenderPassDescriptor { + label: Some(label), + color_attachments: &color_attachments, + depth_stencil_attachment, + timestamp_writes: None, + occlusion_query_set: None, + }); + + let mut render_pass = TrackedRenderPass::new(&render_device, render_pass); + let pass_span = diagnostics.pass_span(&mut render_pass, label); + + if let Some(viewport) = camera.viewport.as_ref() { + render_pass.set_camera_viewport(viewport); + } + + // Opaque draws + if !opaque_prepass_phase.is_empty() { + #[cfg(feature = "trace")] + let _opaque_prepass_span = info_span!("opaque_prepass").entered(); + if let Err(err) = opaque_prepass_phase.render(&mut render_pass, world, view_entity) { + error!("Error encountered while rendering the opaque prepass phase {err:?}"); + } + } + + // Alpha masked draws + if !alpha_mask_prepass_phase.is_empty() { + #[cfg(feature = "trace")] + let _alpha_mask_prepass_span = info_span!("alpha_mask_prepass").entered(); + if let Err(err) = alpha_mask_prepass_phase.render(&mut render_pass, world, view_entity) + { + error!("Error encountered while rendering the alpha mask prepass phase {err:?}"); + } + } + + // Skybox draw using a fullscreen triangle + if let ( + Some(skybox_prepass_pipeline), + Some(skybox_prepass_bind_group), + 
Some(view_prev_uniform_offset), + ) = ( skybox_prepass_pipeline, skybox_prepass_bind_group, view_prev_uniform_offset, - ): QueryItem<'w, Self::ViewQuery>, - world: &'w World, - ) -> Result<(), NodeRunError> { - let (Some(opaque_prepass_phases), Some(alpha_mask_prepass_phases)) = ( - world.get_resource::>(), - world.get_resource::>(), - ) else { - return Ok(()); - }; - - let (Some(opaque_prepass_phase), Some(alpha_mask_prepass_phase)) = ( - opaque_prepass_phases.get(&extracted_view.retained_view_entity), - alpha_mask_prepass_phases.get(&extracted_view.retained_view_entity), - ) else { - return Ok(()); - }; - - let diagnostics = render_context.diagnostic_recorder(); - - let mut color_attachments = vec![ - view_prepass_textures - .normal - .as_ref() - .map(|normals_texture| normals_texture.get_attachment()), - view_prepass_textures - .motion_vectors - .as_ref() - .map(|motion_vectors_texture| motion_vectors_texture.get_attachment()), - // Use None in place of deferred attachments - None, - None, - ]; - - // If all color attachments are none: clear the color attachment list so that no fragment shader is required - if color_attachments.iter().all(Option::is_none) { - color_attachments.clear(); + ) { + let pipeline_cache = world.resource::(); + if let Some(pipeline) = pipeline_cache.get_render_pipeline(skybox_prepass_pipeline.0) { + render_pass.set_render_pipeline(pipeline); + render_pass.set_bind_group( + 0, + &skybox_prepass_bind_group.0, + &[view_uniform_offset.offset, view_prev_uniform_offset.offset], + ); + render_pass.draw(0..3, 0..1); + } } - let depth_stencil_attachment = Some(view_depth_texture.get_attachment(StoreOp::Store)); + pass_span.end(&mut render_pass); + drop(render_pass); - let view_entity = graph.view_entity(); - render_context.add_command_buffer_generation_task(move |render_device| { - #[cfg(feature = "trace")] - let _prepass_span = info_span!("prepass").entered(); - - // Command encoder setup - let mut command_encoder = - 
render_device.create_command_encoder(&CommandEncoderDescriptor { - label: Some("prepass_command_encoder"), - }); - - // Render pass setup - let render_pass = command_encoder.begin_render_pass(&RenderPassDescriptor { - label: Some("prepass"), - color_attachments: &color_attachments, - depth_stencil_attachment, - timestamp_writes: None, - occlusion_query_set: None, - }); - - let mut render_pass = TrackedRenderPass::new(&render_device, render_pass); - let pass_span = diagnostics.pass_span(&mut render_pass, "prepass"); - - if let Some(viewport) = camera.viewport.as_ref() { - render_pass.set_camera_viewport(viewport); + // After rendering to the view depth texture, copy it to the prepass depth texture if deferred isn't going to + if deferred_prepass.is_none() { + if let Some(prepass_depth_texture) = &view_prepass_textures.depth { + command_encoder.copy_texture_to_texture( + view_depth_texture.texture.as_image_copy(), + prepass_depth_texture.texture.texture.as_image_copy(), + view_prepass_textures.size, + ); } + } - // Opaque draws - if !opaque_prepass_phase.multidrawable_mesh_keys.is_empty() - || !opaque_prepass_phase.batchable_mesh_keys.is_empty() - || !opaque_prepass_phase.unbatchable_mesh_keys.is_empty() - { - #[cfg(feature = "trace")] - let _opaque_prepass_span = info_span!("opaque_prepass").entered(); - if let Err(err) = opaque_prepass_phase.render(&mut render_pass, world, view_entity) - { - error!("Error encountered while rendering the opaque prepass phase {err:?}"); - } - } + command_encoder.finish() + }); - // Alpha masked draws - if !alpha_mask_prepass_phase.is_empty() { - #[cfg(feature = "trace")] - let _alpha_mask_prepass_span = info_span!("alpha_mask_prepass").entered(); - if let Err(err) = - alpha_mask_prepass_phase.render(&mut render_pass, world, view_entity) - { - error!( - "Error encountered while rendering the alpha mask prepass phase {err:?}" - ); - } - } - - // Skybox draw using a fullscreen triangle - if let ( - Some(skybox_prepass_pipeline), - 
Some(skybox_prepass_bind_group), - Some(view_prev_uniform_offset), - ) = ( - skybox_prepass_pipeline, - skybox_prepass_bind_group, - view_prev_uniform_offset, - ) { - let pipeline_cache = world.resource::(); - if let Some(pipeline) = - pipeline_cache.get_render_pipeline(skybox_prepass_pipeline.0) - { - render_pass.set_render_pipeline(pipeline); - render_pass.set_bind_group( - 0, - &skybox_prepass_bind_group.0, - &[view_uniform_offset.offset, view_prev_uniform_offset.offset], - ); - render_pass.draw(0..3, 0..1); - } - } - - pass_span.end(&mut render_pass); - drop(render_pass); - - // After rendering to the view depth texture, copy it to the prepass depth texture if deferred isn't going to - if deferred_prepass.is_none() { - if let Some(prepass_depth_texture) = &view_prepass_textures.depth { - command_encoder.copy_texture_to_texture( - view_depth_texture.texture.as_image_copy(), - prepass_depth_texture.texture.texture.as_image_copy(), - view_prepass_textures.size, - ); - } - } - - command_encoder.finish() - }); - - Ok(()) - } + Ok(()) } diff --git a/crates/bevy_pbr/src/lib.rs b/crates/bevy_pbr/src/lib.rs index 44537faf9b..089f38d47d 100644 --- a/crates/bevy_pbr/src/lib.rs +++ b/crates/bevy_pbr/src/lib.rs @@ -98,11 +98,13 @@ pub mod graph { /// Label for the volumetric lighting pass. VolumetricFog, /// Label for the compute shader instance data building pass. - GpuPreprocess, + EarlyGpuPreprocess, + LateGpuPreprocess, /// Label for the screen space reflections pass. ScreenSpaceReflections, - /// Label for the indirect parameters building pass. 
- BuildIndirectParameters, + EarlyPrepassBuildIndirectParameters, + LatePrepassBuildIndirectParameters, + MainBuildIndirectParameters, } } diff --git a/crates/bevy_pbr/src/meshlet/mod.rs b/crates/bevy_pbr/src/meshlet/mod.rs index 5db0644f97..862db413f8 100644 --- a/crates/bevy_pbr/src/meshlet/mod.rs +++ b/crates/bevy_pbr/src/meshlet/mod.rs @@ -175,12 +175,6 @@ impl Plugin for MeshletPlugin { "cull_clusters.wgsl", Shader::from_wgsl ); - load_internal_asset!( - app, - MESHLET_DOWNSAMPLE_DEPTH_SHADER_HANDLE, - "downsample_depth.wgsl", - Shader::from_wgsl - ); load_internal_asset!( app, MESHLET_VISIBILITY_BUFFER_SOFTWARE_RASTER_SHADER_HANDLE, @@ -255,7 +249,6 @@ impl Plugin for MeshletPlugin { NodePbr::ShadowPass, // NodeMeshlet::Prepass, - Node3d::Prepass, // NodeMeshlet::DeferredPrepass, Node3d::DeferredPrepass, diff --git a/crates/bevy_pbr/src/meshlet/pipelines.rs b/crates/bevy_pbr/src/meshlet/pipelines.rs index 2d271c678d..e5e8ec44d8 100644 --- a/crates/bevy_pbr/src/meshlet/pipelines.rs +++ b/crates/bevy_pbr/src/meshlet/pipelines.rs @@ -1,7 +1,8 @@ use super::resource_manager::ResourceManager; use bevy_asset::Handle; use bevy_core_pipeline::{ - core_3d::CORE_3D_DEPTH_FORMAT, fullscreen_vertex_shader::fullscreen_shader_vertex_state, + core_3d::CORE_3D_DEPTH_FORMAT, experimental::mip_generation::DOWNSAMPLE_DEPTH_SHADER_HANDLE, + fullscreen_vertex_shader::fullscreen_shader_vertex_state, }; use bevy_ecs::{ resource::Resource, @@ -12,8 +13,6 @@ use bevy_render::render_resource::*; pub const MESHLET_FILL_CLUSTER_BUFFERS_SHADER_HANDLE: Handle = Handle::weak_from_u128(4325134235233421); pub const MESHLET_CULLING_SHADER_HANDLE: Handle = Handle::weak_from_u128(5325134235233421); -pub const MESHLET_DOWNSAMPLE_DEPTH_SHADER_HANDLE: Handle = - Handle::weak_from_u128(6325134235233421); pub const MESHLET_VISIBILITY_BUFFER_SOFTWARE_RASTER_SHADER_HANDLE: Handle = Handle::weak_from_u128(7325134235233421); pub const MESHLET_VISIBILITY_BUFFER_HARDWARE_RASTER_SHADER_HANDLE: Handle = @@ 
-119,8 +118,11 @@ impl FromWorld for MeshletPipelines { stages: ShaderStages::COMPUTE, range: 0..8, }], - shader: MESHLET_DOWNSAMPLE_DEPTH_SHADER_HANDLE, - shader_defs: vec!["MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT".into()], + shader: DOWNSAMPLE_DEPTH_SHADER_HANDLE, + shader_defs: vec![ + "MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT".into(), + "MESHLET".into(), + ], entry_point: "downsample_depth_first".into(), zero_initialize_workgroup_memory: false, }, @@ -134,8 +136,11 @@ impl FromWorld for MeshletPipelines { stages: ShaderStages::COMPUTE, range: 0..8, }], - shader: MESHLET_DOWNSAMPLE_DEPTH_SHADER_HANDLE, - shader_defs: vec!["MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT".into()], + shader: DOWNSAMPLE_DEPTH_SHADER_HANDLE, + shader_defs: vec![ + "MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT".into(), + "MESHLET".into(), + ], entry_point: "downsample_depth_second".into(), zero_initialize_workgroup_memory: false, }, @@ -149,8 +154,8 @@ impl FromWorld for MeshletPipelines { stages: ShaderStages::COMPUTE, range: 0..8, }], - shader: MESHLET_DOWNSAMPLE_DEPTH_SHADER_HANDLE, - shader_defs: vec![], + shader: DOWNSAMPLE_DEPTH_SHADER_HANDLE, + shader_defs: vec!["MESHLET".into()], entry_point: "downsample_depth_first".into(), zero_initialize_workgroup_memory: false, }, @@ -164,8 +169,8 @@ impl FromWorld for MeshletPipelines { stages: ShaderStages::COMPUTE, range: 0..8, }], - shader: MESHLET_DOWNSAMPLE_DEPTH_SHADER_HANDLE, - shader_defs: vec![], + shader: DOWNSAMPLE_DEPTH_SHADER_HANDLE, + shader_defs: vec!["MESHLET".into()], entry_point: "downsample_depth_second".into(), zero_initialize_workgroup_memory: false, }, diff --git a/crates/bevy_pbr/src/meshlet/resource_manager.rs b/crates/bevy_pbr/src/meshlet/resource_manager.rs index 653c90f162..b2d6cff11d 100644 --- a/crates/bevy_pbr/src/meshlet/resource_manager.rs +++ b/crates/bevy_pbr/src/meshlet/resource_manager.rs @@ -3,6 +3,7 @@ use crate::ShadowView; use alloc::sync::Arc; use bevy_core_pipeline::{ core_3d::Camera3d, + 
experimental::mip_generation::{self, ViewDepthPyramid}, prepass::{PreviousViewData, PreviousViewUniforms}, }; use bevy_ecs::{ @@ -20,7 +21,7 @@ use bevy_render::{ view::{ExtractedView, RenderLayers, ViewUniform, ViewUniforms}, }; use binding_types::*; -use core::{array, iter, sync::atomic::AtomicBool}; +use core::{iter, sync::atomic::AtomicBool}; use encase::internal::WriteInto; /// Manages per-view and per-cluster GPU resources for [`super::MeshletPlugin`]. @@ -85,31 +86,11 @@ impl ResourceManager { label: Some("meshlet_depth_pyramid_sampler"), ..SamplerDescriptor::default() }), - depth_pyramid_dummy_texture: render_device - .create_texture(&TextureDescriptor { - label: Some("meshlet_depth_pyramid_dummy_texture"), - size: Extent3d { - width: 1, - height: 1, - depth_or_array_layers: 1, - }, - mip_level_count: 1, - sample_count: 1, - dimension: TextureDimension::D2, - format: TextureFormat::R32Float, - usage: TextureUsages::STORAGE_BINDING, - view_formats: &[], - }) - .create_view(&TextureViewDescriptor { - label: Some("meshlet_depth_pyramid_dummy_texture_view"), - format: Some(TextureFormat::R32Float), - dimension: Some(TextureViewDimension::D2), - aspect: TextureAspect::All, - base_mip_level: 0, - mip_level_count: Some(1), - base_array_layer: 0, - array_layer_count: Some(1), - }), + depth_pyramid_dummy_texture: mip_generation::create_depth_pyramid_dummy_texture( + render_device, + "meshlet_depth_pyramid_dummy_texture", + "meshlet_depth_pyramid_dummy_texture_view", + ), previous_depth_pyramids: EntityHashMap::default(), @@ -258,9 +239,7 @@ pub struct MeshletViewResources { pub visibility_buffer_software_raster_indirect_args_second: Buffer, pub visibility_buffer_hardware_raster_indirect_args_first: Buffer, pub visibility_buffer_hardware_raster_indirect_args_second: Buffer, - depth_pyramid_all_mips: TextureView, - depth_pyramid_mips: [TextureView; 12], - pub depth_pyramid_mip_count: u32, + pub depth_pyramid: ViewDepthPyramid, previous_depth_pyramid: TextureView, pub 
material_depth: Option, pub view_size: UVec2, @@ -491,51 +470,23 @@ pub fn prepare_meshlet_per_frame_resources( usage: BufferUsages::STORAGE | BufferUsages::INDIRECT, }); - let depth_pyramid_size = Extent3d { - width: view.viewport.z.div_ceil(2), - height: view.viewport.w.div_ceil(2), - depth_or_array_layers: 1, - }; - let depth_pyramid_mip_count = depth_pyramid_size.max_mips(TextureDimension::D2); - let depth_pyramid = texture_cache.get( + let depth_pyramid = ViewDepthPyramid::new( &render_device, - TextureDescriptor { - label: Some("meshlet_depth_pyramid"), - size: depth_pyramid_size, - mip_level_count: depth_pyramid_mip_count, - sample_count: 1, - dimension: TextureDimension::D2, - format: TextureFormat::R32Float, - usage: TextureUsages::STORAGE_BINDING | TextureUsages::TEXTURE_BINDING, - view_formats: &[], - }, + &mut texture_cache, + &resource_manager.depth_pyramid_dummy_texture, + view.viewport.zw(), + "meshlet_depth_pyramid", + "meshlet_depth_pyramid_texture_view", ); - let depth_pyramid_mips = array::from_fn(|i| { - if (i as u32) < depth_pyramid_mip_count { - depth_pyramid.texture.create_view(&TextureViewDescriptor { - label: Some("meshlet_depth_pyramid_texture_view"), - format: Some(TextureFormat::R32Float), - dimension: Some(TextureViewDimension::D2), - aspect: TextureAspect::All, - base_mip_level: i as u32, - mip_level_count: Some(1), - base_array_layer: 0, - array_layer_count: Some(1), - }) - } else { - resource_manager.depth_pyramid_dummy_texture.clone() - } - }); - let depth_pyramid_all_mips = depth_pyramid.default_view.clone(); let previous_depth_pyramid = match resource_manager.previous_depth_pyramids.get(&view_entity) { Some(texture_view) => texture_view.clone(), - None => depth_pyramid_all_mips.clone(), + None => depth_pyramid.all_mips.clone(), }; resource_manager .previous_depth_pyramids - .insert(view_entity, depth_pyramid_all_mips.clone()); + .insert(view_entity, depth_pyramid.all_mips.clone()); let material_depth = TextureDescriptor { label: 
Some("meshlet_material_depth"), @@ -563,9 +514,7 @@ pub fn prepare_meshlet_per_frame_resources( visibility_buffer_software_raster_indirect_args_second, visibility_buffer_hardware_raster_indirect_args_first, visibility_buffer_hardware_raster_indirect_args_second, - depth_pyramid_all_mips, - depth_pyramid_mips, - depth_pyramid_mip_count, + depth_pyramid, previous_depth_pyramid, material_depth: not_shadow_view .then(|| texture_cache.get(&render_device, material_depth)), @@ -676,7 +625,7 @@ pub fn prepare_meshlet_view_bind_groups( resource_manager .visibility_buffer_raster_clusters .as_entire_binding(), - &view_resources.depth_pyramid_all_mips, + &view_resources.depth_pyramid.all_mips, view_uniforms.clone(), previous_view_uniforms.clone(), )); @@ -686,25 +635,12 @@ pub fn prepare_meshlet_view_bind_groups( &entries, ); - let downsample_depth = render_device.create_bind_group( + let downsample_depth = view_resources.depth_pyramid.create_bind_group( + &render_device, "meshlet_downsample_depth_bind_group", &resource_manager.downsample_depth_bind_group_layout, - &BindGroupEntries::sequential(( - view_resources.visibility_buffer.as_entire_binding(), - &view_resources.depth_pyramid_mips[0], - &view_resources.depth_pyramid_mips[1], - &view_resources.depth_pyramid_mips[2], - &view_resources.depth_pyramid_mips[3], - &view_resources.depth_pyramid_mips[4], - &view_resources.depth_pyramid_mips[5], - &view_resources.depth_pyramid_mips[6], - &view_resources.depth_pyramid_mips[7], - &view_resources.depth_pyramid_mips[8], - &view_resources.depth_pyramid_mips[9], - &view_resources.depth_pyramid_mips[10], - &view_resources.depth_pyramid_mips[11], - &resource_manager.depth_pyramid_sampler, - )), + view_resources.visibility_buffer.as_entire_binding(), + &resource_manager.depth_pyramid_sampler, ); let entries = BindGroupEntries::sequential(( diff --git a/crates/bevy_pbr/src/meshlet/visibility_buffer_raster_node.rs b/crates/bevy_pbr/src/meshlet/visibility_buffer_raster_node.rs index 
9890deb4db..c4376d622a 100644 --- a/crates/bevy_pbr/src/meshlet/visibility_buffer_raster_node.rs +++ b/crates/bevy_pbr/src/meshlet/visibility_buffer_raster_node.rs @@ -149,10 +149,11 @@ impl Node for MeshletVisibilityBufferRasterPassNode { Some(camera), meshlet_view_resources.raster_cluster_rightmost_slot, ); - downsample_depth( + meshlet_view_resources.depth_pyramid.downsample_depth( + "meshlet early downsample depth", render_context, - meshlet_view_resources, - meshlet_view_bind_groups, + meshlet_view_resources.view_size, + &meshlet_view_bind_groups.downsample_depth, downsample_depth_first_pipeline, downsample_depth_second_pipeline, ); @@ -200,10 +201,11 @@ impl Node for MeshletVisibilityBufferRasterPassNode { resolve_material_depth_pipeline, camera, ); - downsample_depth( + meshlet_view_resources.depth_pyramid.downsample_depth( + "meshlet late downsample depth", render_context, - meshlet_view_resources, - meshlet_view_bind_groups, + meshlet_view_resources.view_size, + &meshlet_view_bind_groups.downsample_depth, downsample_depth_first_pipeline, downsample_depth_second_pipeline, ); @@ -267,10 +269,11 @@ impl Node for MeshletVisibilityBufferRasterPassNode { None, meshlet_view_resources.raster_cluster_rightmost_slot, ); - downsample_depth( + meshlet_view_resources.depth_pyramid.downsample_depth( + "meshlet early shadow downsample depth", render_context, - meshlet_view_resources, - meshlet_view_bind_groups, + meshlet_view_resources.view_size, + &meshlet_view_bind_groups.downsample_depth, downsample_depth_first_shadow_view_pipeline, downsample_depth_second_shadow_view_pipeline, ); @@ -311,10 +314,11 @@ impl Node for MeshletVisibilityBufferRasterPassNode { resolve_depth_shadow_view_pipeline, camera, ); - downsample_depth( + meshlet_view_resources.depth_pyramid.downsample_depth( + "meshlet late shadow downsample depth", render_context, - meshlet_view_resources, - meshlet_view_bind_groups, + meshlet_view_resources.view_size, + &meshlet_view_bind_groups.downsample_depth, 
downsample_depth_first_shadow_view_pipeline, downsample_depth_second_shadow_view_pipeline, ); @@ -471,39 +475,6 @@ fn raster_pass( hardware_pass.draw_indirect(visibility_buffer_hardware_raster_indirect_args, 0); } -fn downsample_depth( - render_context: &mut RenderContext, - meshlet_view_resources: &MeshletViewResources, - meshlet_view_bind_groups: &MeshletViewBindGroups, - downsample_depth_first_pipeline: &ComputePipeline, - downsample_depth_second_pipeline: &ComputePipeline, -) { - let command_encoder = render_context.command_encoder(); - let mut downsample_pass = command_encoder.begin_compute_pass(&ComputePassDescriptor { - label: Some("downsample_depth"), - timestamp_writes: None, - }); - downsample_pass.set_pipeline(downsample_depth_first_pipeline); - downsample_pass.set_push_constants( - 0, - bytemuck::cast_slice(&[ - meshlet_view_resources.depth_pyramid_mip_count, - meshlet_view_resources.view_size.x, - ]), - ); - downsample_pass.set_bind_group(0, &meshlet_view_bind_groups.downsample_depth, &[]); - downsample_pass.dispatch_workgroups( - meshlet_view_resources.view_size.x.div_ceil(64), - meshlet_view_resources.view_size.y.div_ceil(64), - 1, - ); - - if meshlet_view_resources.depth_pyramid_mip_count >= 7 { - downsample_pass.set_pipeline(downsample_depth_second_pipeline); - downsample_pass.dispatch_workgroups(1, 1, 1); - } -} - fn resolve_depth( render_context: &mut RenderContext, depth_stencil_attachment: RenderPassDepthStencilAttachment, diff --git a/crates/bevy_pbr/src/prepass/mod.rs b/crates/bevy_pbr/src/prepass/mod.rs index 9c9b800d06..3bc2dc51e3 100644 --- a/crates/bevy_pbr/src/prepass/mod.rs +++ b/crates/bevy_pbr/src/prepass/mod.rs @@ -6,7 +6,7 @@ use crate::{ setup_morph_and_skinning_defs, skin, DrawMesh, Material, MaterialPipeline, MaterialPipelineKey, MeshLayouts, MeshPipeline, MeshPipelineKey, OpaqueRendererMethod, PreparedMaterial, RenderLightmaps, RenderMaterialInstances, RenderMeshInstanceFlags, RenderMeshInstances, - SetMaterialBindGroup, 
SetMeshBindGroup, StandardMaterial, + SetMaterialBindGroup, SetMeshBindGroup, ShadowView, StandardMaterial, }; use bevy_app::{App, Plugin, PreUpdate}; use bevy_render::{ @@ -52,8 +52,6 @@ use crate::meshlet::{ prepare_material_meshlet_meshes_prepass, queue_material_meshlet_meshes, InstanceManager, MeshletMesh3d, }; -#[cfg(feature = "meshlet")] -use crate::ShadowView; use bevy_render::view::RenderVisibleEntities; use core::{hash::Hash, marker::PhantomData}; @@ -214,20 +212,16 @@ where #[derive(Resource)] struct AnyPrepassPluginLoaded; -#[cfg(not(feature = "meshlet"))] -type PreviousViewFilter = (With, With); -#[cfg(feature = "meshlet")] -type PreviousViewFilter = Or<(With, With)>; - pub fn update_previous_view_data( mut commands: Commands, - query: Query<(Entity, &Camera, &GlobalTransform), PreviousViewFilter>, + query: Query<(Entity, &Camera, &GlobalTransform), Or<(With, With)>>, ) { for (entity, camera, camera_transform) in &query { let view_from_world = camera_transform.compute_matrix().inverse(); commands.entity(entity).try_insert(PreviousViewData { view_from_world, clip_from_world: camera.clip_from_view() * view_from_world, + clip_from_view: camera.clip_from_view(), }); } } @@ -242,7 +236,7 @@ type PreviousMeshFilter = Or<(With, With)>; pub fn update_mesh_previous_global_transforms( mut commands: Commands, - views: Query<&Camera, PreviousViewFilter>, + views: Query<&Camera, Or<(With, With)>>, meshes: Query<(Entity, &GlobalTransform), PreviousMeshFilter>, ) { let should_run = views.iter().any(|camera| camera.is_active); @@ -698,7 +692,10 @@ pub fn prepare_previous_view_uniforms( render_device: Res, render_queue: Res, mut previous_view_uniforms: ResMut, - views: Query<(Entity, &ExtractedView, Option<&PreviousViewData>), PreviousViewFilter>, + views: Query< + (Entity, &ExtractedView, Option<&PreviousViewData>), + Or<(With, With)>, + >, ) { let views_iter = views.iter(); let view_count = views_iter.len(); @@ -718,6 +715,7 @@ pub fn prepare_previous_view_uniforms( 
PreviousViewData { view_from_world, clip_from_world: camera.clip_from_view * view_from_world, + clip_from_view: camera.clip_from_view, } } }; diff --git a/crates/bevy_pbr/src/prepass/prepass_bindings.wgsl b/crates/bevy_pbr/src/prepass/prepass_bindings.wgsl index a8dae83b8e..3bd27b2e03 100644 --- a/crates/bevy_pbr/src/prepass/prepass_bindings.wgsl +++ b/crates/bevy_pbr/src/prepass/prepass_bindings.wgsl @@ -3,10 +3,9 @@ struct PreviousViewUniforms { view_from_world: mat4x4, clip_from_world: mat4x4, + clip_from_view: mat4x4, } -#ifdef MOTION_VECTOR_PREPASS @group(0) @binding(2) var previous_view_uniforms: PreviousViewUniforms; -#endif // MOTION_VECTOR_PREPASS // Material bindings will be in @group(2) diff --git a/crates/bevy_pbr/src/render/build_indirect_params.wgsl b/crates/bevy_pbr/src/render/build_indirect_params.wgsl index 90741e9064..17152caab0 100644 --- a/crates/bevy_pbr/src/render/build_indirect_params.wgsl +++ b/crates/bevy_pbr/src/render/build_indirect_params.wgsl @@ -59,24 +59,47 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { let mesh_index = indirect_parameters_metadata[instance_index].mesh_index; let base_output_index = indirect_parameters_metadata[instance_index].base_output_index; let batch_set_index = indirect_parameters_metadata[instance_index].batch_set_index; - let instance_count = atomicLoad(&indirect_parameters_metadata[instance_index].instance_count); // If we aren't using `multi_draw_indirect_count`, we have a 1:1 fixed // assignment of batches to slots in the indirect parameters buffer, so we // can just use the instance index as the index of our indirect parameters. + let early_instance_count = + atomicLoad(&indirect_parameters_metadata[instance_index].early_instance_count); + let late_instance_count = + atomicLoad(&indirect_parameters_metadata[instance_index].late_instance_count); + + // If in the early phase, we draw only the early meshes. If in the late + // phase, we draw only the late meshes. 
If in the main phase, draw all the + // meshes. +#ifdef EARLY_PHASE + let instance_count = early_instance_count; +#else // EARLY_PHASE +#ifdef LATE_PHASE + let instance_count = late_instance_count; +#else // LATE_PHASE + let instance_count = early_instance_count + late_instance_count; +#endif // LATE_PHASE +#endif // EARLY_PHASE + var indirect_parameters_index = instance_index; // If the current hardware and driver support `multi_draw_indirect_count`, // dynamically reserve an index for the indirect parameters we're to // generate. #ifdef MULTI_DRAW_INDIRECT_COUNT_SUPPORTED - if (instance_count == 0u) { - return; - } - // If this batch belongs to a batch set, then allocate space for the // indirect commands in that batch set. if (batch_set_index != 0xffffffffu) { + // Bail out now if there are no instances. Note that we can only bail if + // we're in a batch set. That's because only batch sets are drawn using + // `multi_draw_indirect_count`. If we aren't using + // `multi_draw_indirect_count`, then we need to continue in order to + // zero out the instance count; otherwise, it'll have garbage data in + // it. + if (instance_count == 0u) { + return; + } + let indirect_parameters_base = indirect_batch_sets[batch_set_index].indirect_parameters_base; let indirect_parameters_offset = @@ -90,7 +113,16 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { // non-indexed meshes are slightly different. indirect_parameters[indirect_parameters_index].instance_count = instance_count; + +#ifdef LATE_PHASE + // The late mesh instances are stored after the early mesh instances, so we + // offset the output index by the number of early mesh instances. 
+ indirect_parameters[indirect_parameters_index].first_instance = + base_output_index + early_instance_count; +#else // LATE_PHASE indirect_parameters[indirect_parameters_index].first_instance = base_output_index; +#endif // LATE_PHASE + indirect_parameters[indirect_parameters_index].base_vertex = current_input[mesh_index].first_vertex_index; diff --git a/crates/bevy_pbr/src/render/gpu_preprocess.rs b/crates/bevy_pbr/src/render/gpu_preprocess.rs index dac3c68068..dfc8739ca4 100644 --- a/crates/bevy_pbr/src/render/gpu_preprocess.rs +++ b/crates/bevy_pbr/src/render/gpu_preprocess.rs @@ -10,30 +10,41 @@ use core::num::{NonZero, NonZeroU64}; use bevy_app::{App, Plugin}; use bevy_asset::{load_internal_asset, Handle}; -use bevy_core_pipeline::core_3d::graph::{Core3d, Node3d}; +use bevy_core_pipeline::{ + core_3d::graph::{Core3d, Node3d}, + experimental::mip_generation::ViewDepthPyramid, + prepass::{ + DeferredPrepass, DepthPrepass, PreviousViewData, PreviousViewUniformOffset, + PreviousViewUniforms, + }, +}; use bevy_derive::{Deref, DerefMut}; use bevy_ecs::{ component::Component, entity::Entity, - query::{Has, QueryState, Without}, + prelude::resource_exists, + query::{Has, QueryState, With, Without}, resource::Resource, - schedule::{common_conditions::resource_exists, IntoSystemConfigs as _}, - system::{lifetimeless::Read, Commands, Res, ResMut}, + schedule::IntoSystemConfigs as _, + system::{lifetimeless::Read, Commands, Query, Res, ResMut}, world::{FromWorld, World}, }; use bevy_render::{ batching::gpu_preprocessing::{ - BatchedInstanceBuffers, GpuPreprocessingSupport, IndirectBatchSet, - IndirectParametersBuffers, IndirectParametersIndexed, IndirectParametersMetadata, - IndirectParametersNonIndexed, PreprocessWorkItem, PreprocessWorkItemBuffers, + BatchedInstanceBuffers, GpuOcclusionCullingWorkItemBuffers, GpuPreprocessingSupport, + IndirectBatchSet, IndirectParametersBuffers, IndirectParametersIndexed, + IndirectParametersMetadata, IndirectParametersNonIndexed, + 
LatePreprocessWorkItemIndirectParameters, PreprocessWorkItem, PreprocessWorkItemBuffers, }, + experimental::occlusion_culling::OcclusionCulling, render_graph::{Node, NodeRunError, RenderGraphApp, RenderGraphContext}, render_resource::{ - binding_types::{storage_buffer, storage_buffer_read_only, uniform_buffer}, + binding_types::{storage_buffer, storage_buffer_read_only, texture_2d, uniform_buffer}, BindGroup, BindGroupEntries, BindGroupLayout, BindingResource, Buffer, BufferBinding, - CachedComputePipelineId, ComputePassDescriptor, ComputePipelineDescriptor, - DynamicBindGroupLayoutEntries, PipelineCache, Shader, ShaderStages, ShaderType, - SpecializedComputePipeline, SpecializedComputePipelines, + BufferVec, CachedComputePipelineId, ComputePassDescriptor, ComputePipelineDescriptor, + DynamicBindGroupLayoutEntries, PipelineCache, PushConstantRange, RawBufferVec, Shader, + ShaderStages, ShaderType, SpecializedComputePipeline, SpecializedComputePipelines, + TextureSampleType, UninitBufferVec, }, renderer::{RenderContext, RenderDevice, RenderQueue}, settings::WgpuFeatures, @@ -57,6 +68,9 @@ pub const MESH_PREPROCESS_SHADER_HANDLE: Handle = /// The handle to the `mesh_preprocess_types.wgsl` compute shader. pub const MESH_PREPROCESS_TYPES_SHADER_HANDLE: Handle = Handle::weak_from_u128(2720440370122465935); +/// The handle to the `reset_indirect_batch_sets.wgsl` compute shader. +pub const RESET_INDIRECT_BATCH_SETS_SHADER_HANDLE: Handle = + Handle::weak_from_u128(2602194133710559644); /// The handle to the `build_indirect_params.wgsl` compute shader. pub const BUILD_INDIRECT_PARAMS_SHADER_HANDLE: Handle = Handle::weak_from_u128(3711077208359699672); @@ -76,32 +90,96 @@ pub struct GpuMeshPreprocessPlugin { pub use_gpu_instance_buffer_builder: bool, } -/// The render node for the mesh preprocessing pass. +/// The render node for the first mesh preprocessing pass. 
/// -/// This pass runs a compute shader to cull invisible meshes (if that wasn't -/// done by the CPU), transforms them, and, if indirect drawing is on, populates -/// indirect draw parameter metadata for the subsequent -/// [`BuildIndirectParametersNode`]. -pub struct GpuPreprocessNode { +/// This pass runs a compute shader to cull meshes outside the view frustum (if +/// that wasn't done by the CPU), cull meshes that weren't visible last frame +/// (if occlusion culling is on), transform them, and, if indirect drawing is +/// on, populate indirect draw parameter metadata for the subsequent +/// [`EarlyPrepassBuildIndirectParametersNode`]. +pub struct EarlyGpuPreprocessNode { view_query: QueryState< ( Entity, - Read, - Read, + Option>, + Option>, Has, + Has, ), Without, >, main_view_query: QueryState>, } -/// The render node for the indirect parameter building pass. +/// The render node for the second mesh preprocessing pass. /// -/// This node runs a compute shader on the output of the [`GpuPreprocessNode`] -/// in order to transform the [`IndirectParametersMetadata`] into -/// properly-formatted [`IndirectParametersIndexed`] and -/// [`IndirectParametersNonIndexed`]. -pub struct BuildIndirectParametersNode { +/// This pass runs a compute shader to cull meshes outside the view frustum (if +/// that wasn't done by the CPU), cull meshes that were neither visible last +/// frame nor visible this frame (if occlusion culling is on), transform them, +/// and, if indirect drawing is on, populate the indirect draw parameter +/// metadata for the subsequent [`LatePrepassBuildIndirectParametersNode`]. +pub struct LateGpuPreprocessNode { + view_query: QueryState< + (Entity, Read, Read), + ( + Without, + Without, + With, + With, + Without, + ), + >, +} + +/// The render node for the part of the indirect parameter building pass that +/// draws the meshes visible from the previous frame. 
+/// +/// This node runs a compute shader on the output of the +/// [`EarlyGpuPreprocessNode`] in order to transform the +/// [`IndirectParametersMetadata`] into properly-formatted +/// [`IndirectParametersIndexed`] and [`IndirectParametersNonIndexed`]. +pub struct EarlyPrepassBuildIndirectParametersNode { + view_query: QueryState< + Read, + ( + Without, + Without, + With, + Without, + ), + >, +} + +/// The render node for the part of the indirect parameter building pass that +/// draws the meshes that are potentially visible on this frame but weren't +/// visible on the previous frame. +/// +/// This node runs a compute shader on the output of the +/// [`LateGpuPreprocessNode`] in order to transform the +/// [`IndirectParametersMetadata`] into properly-formatted +/// [`IndirectParametersIndexed`] and [`IndirectParametersNonIndexed`]. +pub struct LatePrepassBuildIndirectParametersNode { + view_query: QueryState< + Read, + ( + Without, + Without, + With, + With, + Without, + ), + >, +} + +/// The render node for the part of the indirect parameter building pass that +/// draws all meshes, both those that are newly-visible on this frame and those +/// that were visible last frame. +/// +/// This node runs a compute shader on the output of the +/// [`EarlyGpuPreprocessNode`] and [`LateGpuPreprocessNode`] in order to +/// transform the [`IndirectParametersMetadata`] into properly-formatted +/// [`IndirectParametersIndexed`] and [`IndirectParametersNonIndexed`]. +pub struct MainBuildIndirectParametersNode { view_query: QueryState< Read, (Without, Without), @@ -115,19 +193,57 @@ pub struct PreprocessPipelines { /// The pipeline used for CPU culling. This pipeline doesn't populate /// indirect parameter metadata. pub direct_preprocess: PreprocessPipeline, - /// The pipeline used for GPU culling. This pipeline populates indirect - /// parameter metadata. 
- pub gpu_culling_preprocess: PreprocessPipeline, + /// The pipeline used for mesh preprocessing when GPU frustum culling is in + /// use, but occlusion culling isn't. + /// + /// This pipeline populates indirect parameter metadata. + pub gpu_frustum_culling_preprocess: PreprocessPipeline, + /// The pipeline used for the first phase of occlusion culling. + /// + /// This pipeline culls, transforms meshes, and populates indirect parameter + /// metadata. + pub early_gpu_occlusion_culling_preprocess: PreprocessPipeline, + /// The pipeline used for the second phase of occlusion culling. + /// + /// This pipeline culls, transforms meshes, and populates indirect parameter + /// metadata. + pub late_gpu_occlusion_culling_preprocess: PreprocessPipeline, + /// The pipeline that builds indirect draw parameters for indexed meshes, + /// when frustum culling is enabled but occlusion culling *isn't* enabled. + pub gpu_frustum_culling_build_indexed_indirect_params: BuildIndirectParametersPipeline, + /// The pipeline that builds indirect draw parameters for non-indexed + /// meshes, when frustum culling is enabled but occlusion culling *isn't* + /// enabled. + pub gpu_frustum_culling_build_non_indexed_indirect_params: BuildIndirectParametersPipeline, + /// Compute shader pipelines for the early prepass phase that draws meshes + /// visible in the previous frame. + pub early_phase: PreprocessPhasePipelines, + /// Compute shader pipelines for the late prepass phase that draws meshes + /// that weren't visible in the previous frame, but became visible this + /// frame. + pub late_phase: PreprocessPhasePipelines, + /// Compute shader pipelines for the main color phase. + pub main_phase: PreprocessPhasePipelines, +} + +/// Compute shader pipelines for a specific phase: early, late, or main. +/// +/// The distinction between these phases is relevant for occlusion culling. 
+#[derive(Clone)] +pub struct PreprocessPhasePipelines { + /// The pipeline that resets the indirect draw counts used in + /// `multi_draw_indirect_count` to 0 in preparation for a new pass. + pub reset_indirect_batch_sets: ResetIndirectBatchSetsPipeline, /// The pipeline used for indexed indirect parameter building. /// /// This pipeline converts indirect parameter metadata into indexed indirect /// parameters. - pub build_indexed_indirect_params: BuildIndirectParametersPipeline, + pub gpu_occlusion_culling_build_indexed_indirect_params: BuildIndirectParametersPipeline, /// The pipeline used for non-indexed indirect parameter building. /// /// This pipeline converts indirect parameter metadata into non-indexed /// indirect parameters. - pub build_non_indexed_indirect_params: BuildIndirectParametersPipeline, + pub gpu_occlusion_culling_build_non_indexed_indirect_params: BuildIndirectParametersPipeline, } /// The pipeline for the GPU mesh preprocessing shader. @@ -140,7 +256,22 @@ pub struct PreprocessPipeline { pub pipeline_id: Option, } +/// The pipeline for the batch set count reset shader. +/// +/// This shader resets the indirect batch set count to 0 for each view. It runs +/// in between every phase (early, late, and main). +#[derive(Clone)] +pub struct ResetIndirectBatchSetsPipeline { + /// The bind group layout for the compute shader. + pub bind_group_layout: BindGroupLayout, + /// The pipeline ID for the compute shader. + /// + /// This gets filled in `prepare_preprocess_pipelines`. + pub pipeline_id: Option, +} + /// The pipeline for the indirect parameter building shader. +#[derive(Clone)] pub struct BuildIndirectParametersPipeline { /// The bind group layout for the compute shader. pub bind_group_layout: BindGroupLayout, @@ -154,10 +285,18 @@ bitflags! { /// Specifies variants of the mesh preprocessing shader. #[derive(Clone, Copy, PartialEq, Eq, Hash)] pub struct PreprocessPipelineKey: u8 { - /// Whether GPU culling is in use. 
+ /// Whether GPU frustum culling is in use. /// - /// This `#define`'s `GPU_CULLING` in the shader. - const GPU_CULLING = 1; + /// This `#define`'s `FRUSTUM_CULLING` in the shader. + const FRUSTUM_CULLING = 1; + /// Whether GPU two-phase occlusion culling is in use. + /// + /// This `#define`'s `OCCLUSION_CULLING` in the shader. + const OCCLUSION_CULLING = 2; + /// Whether this is the early phase of GPU two-phase occlusion culling. + /// + /// This `#define`'s `EARLY_PHASE` in the shader. + const EARLY_PHASE = 4; } /// Specifies variants of the indirect parameter building shader. @@ -172,6 +311,24 @@ bitflags! { /// /// This defines `MULTI_DRAW_INDIRECT_COUNT_SUPPORTED` in the shader. const MULTI_DRAW_INDIRECT_COUNT_SUPPORTED = 2; + /// Whether GPU two-phase occlusion culling is in use. + /// + /// This `#define`'s `OCCLUSION_CULLING` in the shader. + const OCCLUSION_CULLING = 4; + /// Whether this is the early phase of GPU two-phase occlusion culling. + /// + /// This `#define`'s `EARLY_PHASE` in the shader. + const EARLY_PHASE = 8; + /// Whether this is the late phase of GPU two-phase occlusion culling. + /// + /// This `#define`'s `LATE_PHASE` in the shader. + const LATE_PHASE = 16; + /// Whether this is the phase that runs after the early and late phases, + /// and right before the main drawing logic, when GPU two-phase + /// occlusion culling is in use. + /// + /// This `#define`'s `MAIN_PHASE` in the shader. + const MAIN_PHASE = 32; } } @@ -196,35 +353,56 @@ pub enum PhasePreprocessBindGroups { Direct(BindGroup), /// The bind groups used for the compute shader when indirect drawing is - /// being used. + /// being used, but occlusion culling isn't being used. /// /// Because indirect drawing requires splitting the meshes into indexed and /// non-indexed meshes, there are two bind groups here. - Indirect { - /// The bind group used for indexed meshes. - /// - /// This will be `None` if there are no indexed meshes. 
+ IndirectFrustumCulling {
+ /// The bind group for indexed meshes.
indexed: Option,
- /// The bind group used for non-indexed meshes.
- ///
- /// This will be `None` if there are no non-indexed meshes.
+ /// The bind group for non-indexed meshes.
non_indexed: Option,
},
+
+ /// The bind groups used for the compute shader when indirect drawing is
+ /// being used together with occlusion culling.
+ ///
+ /// Because indirect drawing requires splitting the meshes into indexed and
+ /// non-indexed meshes, and because occlusion culling requires splitting
+ /// this phase into early and late versions, there are four bind groups
+ /// here.
+ IndirectOcclusionCulling {
+ /// The bind group for indexed meshes during the early mesh
+ /// preprocessing phase.
+ early_indexed: Option,
+ /// The bind group for non-indexed meshes during the early mesh
+ /// preprocessing phase.
+ early_non_indexed: Option,
+ /// The bind group for indexed meshes during the late mesh preprocessing
+ /// phase.
+ late_indexed: Option,
+ /// The bind group for non-indexed meshes during the late mesh
+ /// preprocessing phase.
+ late_non_indexed: Option,
+ },
}

-/// The bind groups for the indirect parameters building compute shader.
-///
-/// This is shared among all views and phases.
+/// The bind groups for the compute shaders that reset indirect draw counts and
+/// build indirect parameters.
#[derive(Resource)]
pub struct BuildIndirectParametersBindGroups {
- /// The bind group used for indexed meshes.
- ///
- /// This will be `None` if there are no indexed meshes.
- indexed: Option,
- /// The bind group used for non-indexed meshes.
- ///
- /// This will be `None` if there are no non-indexed meshes.
- non_indexed: Option,
+ /// The bind group for the `reset_indirect_batch_sets.wgsl` shader, for
+ /// indexed meshes.
+ reset_indexed_indirect_batch_sets: Option,
+ /// The bind group for the `reset_indirect_batch_sets.wgsl` shader, for
+ /// non-indexed meshes.
+ reset_non_indexed_indirect_batch_sets: Option,
+ /// The bind group for the `build_indirect_params.wgsl` shader, for indexed
+ /// meshes.
+ build_indexed_indirect: Option,
+ /// The bind group for the `build_indirect_params.wgsl` shader, for
+ /// non-indexed meshes.
+ build_non_indexed_indirect: Option,
}

/// Stops the `GpuPreprocessNode` attempting to generate the buffer for this view
@@ -242,8 +420,20 @@ impl Plugin for GpuMeshPreprocessPlugin {
);
load_internal_asset!(
app,
- MESH_PREPROCESS_TYPES_SHADER_HANDLE,
- "mesh_preprocess_types.wgsl",
+ RESET_INDIRECT_BATCH_SETS_SHADER_HANDLE,
+ "reset_indirect_batch_sets.wgsl",
+ Shader::from_wgsl
+);
+load_internal_asset!(
+ app,
+ BUILD_INDIRECT_PARAMS_SHADER_HANDLE,
+ "build_indirect_params.wgsl",
+ Shader::from_wgsl
+);
+load_internal_asset!(
+ app,
+ MESH_PREPROCESS_TYPES_SHADER_HANDLE,
+ "mesh_preprocess_types.wgsl",
Shader::from_wgsl
);
load_internal_asset!(
@@ -269,6 +459,7 @@ impl Plugin for GpuMeshPreprocessPlugin {
render_app
.init_resource::()
.init_resource::>()
+ .init_resource::>()
.init_resource::>()
.add_systems(
Render,
@@ -280,25 +471,48 @@ impl Plugin for GpuMeshPreprocessPlugin {
)
.in_set(RenderSet::PrepareBindGroups),
write_mesh_culling_data_buffer.in_set(RenderSet::PrepareResourcesFlush),
- )
+ ),
)
- .add_render_graph_node::(Core3d, NodePbr::GpuPreprocess)
- .add_render_graph_node::(
+ .add_render_graph_node::(Core3d, NodePbr::EarlyGpuPreprocess)
+ .add_render_graph_node::(Core3d, NodePbr::LateGpuPreprocess)
+ .add_render_graph_node::(
Core3d,
- NodePbr::BuildIndirectParameters
+ NodePbr::EarlyPrepassBuildIndirectParameters,
+ )
+ .add_render_graph_node::(
+ Core3d,
+ NodePbr::LatePrepassBuildIndirectParameters,
+ )
+ .add_render_graph_node::(
+ Core3d,
+ NodePbr::MainBuildIndirectParameters,
)
.add_render_graph_edges(
Core3d,
- (NodePbr::GpuPreprocess, NodePbr::BuildIndirectParameters, Node3d::Prepass)
+ (
+ NodePbr::EarlyGpuPreprocess,
+ NodePbr::EarlyPrepassBuildIndirectParameters,
+
Node3d::EarlyPrepass, + Node3d::EarlyDownsampleDepth, + NodePbr::LateGpuPreprocess, + NodePbr::LatePrepassBuildIndirectParameters, + Node3d::LatePrepass, + NodePbr::MainBuildIndirectParameters, + // Shadows don't currently support occlusion culling, so we + // treat shadows as effectively the main phase for our + // purposes. + NodePbr::ShadowPass, + ), ) - .add_render_graph_edges( + .add_render_graph_edge( Core3d, - (NodePbr::GpuPreprocess, NodePbr::BuildIndirectParameters, NodePbr::ShadowPass) + NodePbr::MainBuildIndirectParameters, + Node3d::DeferredPrepass ); } } -impl FromWorld for GpuPreprocessNode { +impl FromWorld for EarlyGpuPreprocessNode { fn from_world(world: &mut World) -> Self { Self { view_query: QueryState::new(world), @@ -307,7 +521,7 @@ impl FromWorld for GpuPreprocessNode { } } -impl Node for GpuPreprocessNode { +impl Node for EarlyGpuPreprocessNode { fn update(&mut self, world: &mut World) { self.view_query.update_archetypes(world); self.main_view_query.update_archetypes(world); @@ -332,7 +546,7 @@ impl Node for GpuPreprocessNode { render_context .command_encoder() .begin_compute_pass(&ComputePassDescriptor { - label: Some("mesh preprocessing"), + label: Some("early mesh preprocessing"), timestamp_writes: None, }); @@ -347,24 +561,42 @@ impl Node for GpuPreprocessNode { // Run the compute passes. for view_entity in all_views { - let Ok((view, bind_groups, view_uniform_offset, no_indirect_drawing)) = - self.view_query.get_manual(world, view_entity) + let Ok(( + view, + bind_groups, + view_uniform_offset, + no_indirect_drawing, + occlusion_culling, + )) = self.view_query.get_manual(world, view_entity) else { continue; }; + let Some(bind_groups) = bind_groups else { + continue; + }; + let Some(view_uniform_offset) = view_uniform_offset else { + continue; + }; + // Grab the work item buffers for this view. 
- let Some(view_work_item_buffers) = index_buffers.get(&view) else { + let Some(phase_work_item_buffers) = index_buffers.get(&view) else { warn!("The preprocessing index buffer wasn't present"); continue; }; // Select the right pipeline, depending on whether GPU culling is in // use. - let maybe_pipeline_id = if !no_indirect_drawing { - preprocess_pipelines.gpu_culling_preprocess.pipeline_id - } else { + let maybe_pipeline_id = if no_indirect_drawing { preprocess_pipelines.direct_preprocess.pipeline_id + } else if occlusion_culling { + preprocess_pipelines + .early_gpu_occlusion_culling_preprocess + .pipeline_id + } else { + preprocess_pipelines + .gpu_frustum_culling_preprocess + .pipeline_id }; // Fetch the pipeline. @@ -383,7 +615,7 @@ impl Node for GpuPreprocessNode { compute_pass.set_pipeline(preprocess_pipeline); // Loop over each render phase. - for (phase_type_id, phase_work_item_buffers) in view_work_item_buffers { + for (phase_type_id, work_item_buffers) in phase_work_item_buffers { // Fetch the bind group for the render phase. let Some(phase_bind_groups) = bind_groups.get(phase_type_id) else { continue; @@ -401,21 +633,25 @@ impl Node for GpuPreprocessNode { PhasePreprocessBindGroups::Direct(ref bind_group) => { // Invoke the mesh preprocessing shader to transform // meshes only, but not cull. 
- let PreprocessWorkItemBuffers::Direct(phase_work_item_buffer) = - phase_work_item_buffers + let PreprocessWorkItemBuffers::Direct(work_item_buffer) = work_item_buffers else { continue; }; compute_pass.set_bind_group(0, bind_group, &dynamic_offsets); - let workgroup_count = phase_work_item_buffer.len().div_ceil(WORKGROUP_SIZE); + let workgroup_count = work_item_buffer.len().div_ceil(WORKGROUP_SIZE); if workgroup_count > 0 { compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1); } } - PhasePreprocessBindGroups::Indirect { + PhasePreprocessBindGroups::IndirectFrustumCulling { indexed: ref maybe_indexed_bind_group, non_indexed: ref maybe_non_indexed_bind_group, + } + | PhasePreprocessBindGroups::IndirectOcclusionCulling { + early_indexed: ref maybe_indexed_bind_group, + early_non_indexed: ref maybe_non_indexed_bind_group, + .. } => { // Invoke the mesh preprocessing shader to transform and // cull the meshes. @@ -423,13 +659,28 @@ impl Node for GpuPreprocessNode { indexed: indexed_buffer, non_indexed: non_indexed_buffer, .. - } = phase_work_item_buffers + } = work_item_buffers else { continue; }; // Transform and cull indexed meshes if there are any. if let Some(indexed_bind_group) = maybe_indexed_bind_group { + if let PreprocessWorkItemBuffers::Indirect { + gpu_occlusion_culling: + Some(GpuOcclusionCullingWorkItemBuffers { + late_indirect_parameters_indexed_offset, + .. + }), + .. + } = *work_item_buffers + { + compute_pass.set_push_constants( + 0, + bytemuck::bytes_of(&late_indirect_parameters_indexed_offset), + ); + } + compute_pass.set_bind_group(0, indexed_bind_group, &dynamic_offsets); let workgroup_count = indexed_buffer.len().div_ceil(WORKGROUP_SIZE); if workgroup_count > 0 { @@ -439,6 +690,23 @@ impl Node for GpuPreprocessNode { // Transform and cull non-indexed meshes if there are any. 
if let Some(non_indexed_bind_group) = maybe_non_indexed_bind_group { + if let PreprocessWorkItemBuffers::Indirect { + gpu_occlusion_culling: + Some(GpuOcclusionCullingWorkItemBuffers { + late_indirect_parameters_non_indexed_offset, + .. + }), + .. + } = *work_item_buffers + { + compute_pass.set_push_constants( + 0, + bytemuck::bytes_of( + &late_indirect_parameters_non_indexed_offset, + ), + ); + } + compute_pass.set_bind_group( 0, non_indexed_bind_group, @@ -458,7 +726,7 @@ impl Node for GpuPreprocessNode { } } -impl FromWorld for BuildIndirectParametersNode { +impl FromWorld for EarlyPrepassBuildIndirectParametersNode { fn from_world(world: &mut World) -> Self { Self { view_query: QueryState::new(world), @@ -466,7 +734,31 @@ impl FromWorld for BuildIndirectParametersNode { } } -impl Node for BuildIndirectParametersNode { +impl FromWorld for LatePrepassBuildIndirectParametersNode { + fn from_world(world: &mut World) -> Self { + Self { + view_query: QueryState::new(world), + } + } +} + +impl FromWorld for MainBuildIndirectParametersNode { + fn from_world(world: &mut World) -> Self { + Self { + view_query: QueryState::new(world), + } + } +} + +impl FromWorld for LateGpuPreprocessNode { + fn from_world(world: &mut World) -> Self { + Self { + view_query: QueryState::new(world), + } + } +} + +impl Node for LateGpuPreprocessNode { fn update(&mut self, world: &mut World) { self.view_query.update_archetypes(world); } @@ -477,88 +769,117 @@ impl Node for BuildIndirectParametersNode { render_context: &mut RenderContext<'w>, world: &'w World, ) -> Result<(), NodeRunError> { - // Fetch the bind group. - let Some(build_indirect_params_bind_groups) = - world.get_resource::() - else { - return Ok(()); - }; + // Grab the [`BatchedInstanceBuffers`]. + let BatchedInstanceBuffers { + ref work_item_buffers, + ref late_indexed_indirect_parameters_buffer, + ref late_non_indexed_indirect_parameters_buffer, + .. + } = world.resource::>(); - // Fetch the pipelines and the buffers we need. 
let pipeline_cache = world.resource::(); let preprocess_pipelines = world.resource::(); - let indirect_parameters_buffers = world.resource::(); - // Create the compute pass. let mut compute_pass = render_context .command_encoder() .begin_compute_pass(&ComputePassDescriptor { - label: Some("build indirect parameters"), + label: Some("late mesh preprocessing"), timestamp_writes: None, }); - // Fetch the pipelines. + // Run the compute passes. + for (view, bind_groups, view_uniform_offset) in self.view_query.iter_manual(world) { + // Grab the work item buffers for this view. + let Some(phase_work_item_buffers) = work_item_buffers.get(&view) else { + warn!("The preprocessing index buffer wasn't present"); + continue; + }; - let (maybe_indexed_pipeline_id, maybe_non_indexed_pipeline_id) = ( - preprocess_pipelines - .build_indexed_indirect_params - .pipeline_id, - preprocess_pipelines - .build_non_indexed_indirect_params - .pipeline_id, - ); + let maybe_pipeline_id = preprocess_pipelines + .late_gpu_occlusion_culling_preprocess + .pipeline_id; - let ( - Some(build_indexed_indirect_params_pipeline_id), - Some(build_non_indexed_indirect_params_pipeline_id), - ) = (maybe_indexed_pipeline_id, maybe_non_indexed_pipeline_id) - else { - warn!("The build indirect parameters pipelines weren't ready"); - return Ok(()); - }; + // Fetch the pipeline. + let Some(preprocess_pipeline_id) = maybe_pipeline_id else { + warn!("The build mesh uniforms pipeline wasn't ready"); + return Ok(()); + }; - let ( - Some(build_indexed_indirect_params_pipeline), - Some(build_non_indexed_indirect_params_pipeline), - ) = ( - pipeline_cache.get_compute_pipeline(build_indexed_indirect_params_pipeline_id), - pipeline_cache.get_compute_pipeline(build_non_indexed_indirect_params_pipeline_id), - ) - else { - // This will happen while the pipeline is being compiled and is fine. 
- return Ok(()); - }; + let Some(preprocess_pipeline) = + pipeline_cache.get_compute_pipeline(preprocess_pipeline_id) + else { + // This will happen while the pipeline is being compiled and is fine. + return Ok(()); + }; - // Transform the [`IndirectParametersMetadata`] that the GPU mesh - // preprocessing phase wrote to [`IndirectParametersIndexed`] for - // indexed meshes, if we have any. - if let Some(ref build_indirect_indexed_params_bind_group) = - build_indirect_params_bind_groups.indexed - { - compute_pass.set_pipeline(build_indexed_indirect_params_pipeline); - compute_pass.set_bind_group(0, build_indirect_indexed_params_bind_group, &[]); - let workgroup_count = indirect_parameters_buffers - .indexed_batch_count() - .div_ceil(WORKGROUP_SIZE); - if workgroup_count > 0 { - compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1); - } - } + compute_pass.set_pipeline(preprocess_pipeline); - // Transform the [`IndirectParametersMetadata`] that the GPU mesh - // preprocessing phase wrote to [`IndirectParametersNonIndexed`] for - // non-indexed meshes, if we have any. - if let Some(ref build_indirect_non_indexed_params_bind_group) = - build_indirect_params_bind_groups.non_indexed - { - compute_pass.set_pipeline(build_non_indexed_indirect_params_pipeline); - compute_pass.set_bind_group(0, build_indirect_non_indexed_params_bind_group, &[]); - let workgroup_count = indirect_parameters_buffers - .non_indexed_batch_count() - .div_ceil(WORKGROUP_SIZE); - if workgroup_count > 0 { - compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1); + for (phase_type_id, work_item_buffers) in phase_work_item_buffers { + let ( + PreprocessWorkItemBuffers::Indirect { + gpu_occlusion_culling: + Some(GpuOcclusionCullingWorkItemBuffers { + late_indirect_parameters_indexed_offset, + late_indirect_parameters_non_indexed_offset, + .. + }), + .. 
+ }, + Some(PhasePreprocessBindGroups::IndirectOcclusionCulling { + late_indexed: ref maybe_late_indexed_bind_group, + late_non_indexed: ref maybe_late_non_indexed_bind_group, + .. + }), + Some(late_indexed_indirect_parameters_buffer), + Some(late_non_indexed_indirect_parameters_buffer), + ) = ( + work_item_buffers, + bind_groups.get(phase_type_id), + late_indexed_indirect_parameters_buffer.buffer(), + late_non_indexed_indirect_parameters_buffer.buffer(), + ) + else { + continue; + }; + + let mut dynamic_offsets: SmallVec<[u32; 1]> = smallvec![]; + dynamic_offsets.push(view_uniform_offset.offset); + + // If there's no space reserved for work items, then don't + // bother doing the dispatch, as there can't possibly be any + // meshes of the given class (indexed or non-indexed) in this + // phase. + + // Transform and cull indexed meshes if there are any. + if let Some(late_indexed_bind_group) = maybe_late_indexed_bind_group { + compute_pass.set_push_constants( + 0, + bytemuck::bytes_of(late_indirect_parameters_indexed_offset), + ); + + compute_pass.set_bind_group(0, late_indexed_bind_group, &dynamic_offsets); + compute_pass.dispatch_workgroups_indirect( + late_indexed_indirect_parameters_buffer, + (*late_indirect_parameters_indexed_offset as u64) + * (size_of::() as u64), + ); + } + + // Transform and cull non-indexed meshes if there are any. 
+ if let Some(late_non_indexed_bind_group) = maybe_late_non_indexed_bind_group { + compute_pass.set_push_constants( + 0, + bytemuck::bytes_of(late_indirect_parameters_non_indexed_offset), + ); + + compute_pass.set_bind_group(0, late_non_indexed_bind_group, &dynamic_offsets); + compute_pass.dispatch_workgroups_indirect( + late_non_indexed_indirect_parameters_buffer, + (*late_indirect_parameters_non_indexed_offset as u64) + * (size_of::() as u64), + ); + } } } @@ -566,15 +887,233 @@ impl Node for BuildIndirectParametersNode { } } +impl Node for EarlyPrepassBuildIndirectParametersNode { + fn update(&mut self, world: &mut World) { + self.view_query.update_archetypes(world); + } + + fn run<'w>( + &self, + _: &mut RenderGraphContext, + render_context: &mut RenderContext<'w>, + world: &'w World, + ) -> Result<(), NodeRunError> { + let preprocess_pipelines = world.resource::(); + + // If there are no views with a depth prepass enabled, we don't need to + // run this. + if self.view_query.iter_manual(world).next().is_none() { + return Ok(()); + } + + run_build_indirect_parameters_node( + render_context, + world, + &preprocess_pipelines.early_phase, + "early prepass indirect parameters building", + ) + } +} + +impl Node for LatePrepassBuildIndirectParametersNode { + fn update(&mut self, world: &mut World) { + self.view_query.update_archetypes(world); + } + + fn run<'w>( + &self, + _: &mut RenderGraphContext, + render_context: &mut RenderContext<'w>, + world: &'w World, + ) -> Result<(), NodeRunError> { + let preprocess_pipelines = world.resource::(); + + // If there are no views with occlusion culling enabled, we don't need + // to run this. 
+ if self.view_query.iter_manual(world).next().is_none() { + return Ok(()); + } + + run_build_indirect_parameters_node( + render_context, + world, + &preprocess_pipelines.late_phase, + "late prepass indirect parameters building", + ) + } +} + +impl Node for MainBuildIndirectParametersNode { + fn update(&mut self, world: &mut World) { + self.view_query.update_archetypes(world); + } + + fn run<'w>( + &self, + _: &mut RenderGraphContext, + render_context: &mut RenderContext<'w>, + world: &'w World, + ) -> Result<(), NodeRunError> { + let preprocess_pipelines = world.resource::(); + + run_build_indirect_parameters_node( + render_context, + world, + &preprocess_pipelines.main_phase, + "main indirect parameters building", + ) + } +} + +fn run_build_indirect_parameters_node( + render_context: &mut RenderContext, + world: &World, + preprocess_phase_pipelines: &PreprocessPhasePipelines, + label: &'static str, +) -> Result<(), NodeRunError> { + let Some(build_indirect_params_bind_groups) = + world.get_resource::() + else { + return Ok(()); + }; + + let pipeline_cache = world.resource::(); + let indirect_parameters_buffers = world.resource::(); + + let mut compute_pass = + render_context + .command_encoder() + .begin_compute_pass(&ComputePassDescriptor { + label: Some(label), + timestamp_writes: None, + }); + + // Fetch the pipeline. 
+ let ( + Some(reset_indirect_batch_sets_pipeline_id), + Some(build_indexed_indirect_params_pipeline_id), + Some(build_non_indexed_indirect_params_pipeline_id), + ) = ( + preprocess_phase_pipelines + .reset_indirect_batch_sets + .pipeline_id, + preprocess_phase_pipelines + .gpu_occlusion_culling_build_indexed_indirect_params + .pipeline_id, + preprocess_phase_pipelines + .gpu_occlusion_culling_build_non_indexed_indirect_params + .pipeline_id, + ) + else { + warn!("The build indirect parameters pipelines weren't ready"); + return Ok(()); + }; + + let ( + Some(reset_indirect_batch_sets_pipeline), + Some(build_indexed_indirect_params_pipeline), + Some(build_non_indexed_indirect_params_pipeline), + ) = ( + pipeline_cache.get_compute_pipeline(reset_indirect_batch_sets_pipeline_id), + pipeline_cache.get_compute_pipeline(build_indexed_indirect_params_pipeline_id), + pipeline_cache.get_compute_pipeline(build_non_indexed_indirect_params_pipeline_id), + ) + else { + // This will happen while the pipeline is being compiled and is fine. + return Ok(()); + }; + + // Build indexed indirect parameters. 
+ if let ( + Some(reset_indexed_indirect_batch_sets_bind_group), + Some(build_indirect_indexed_params_bind_group), + ) = ( + &build_indirect_params_bind_groups.reset_indexed_indirect_batch_sets, + &build_indirect_params_bind_groups.build_indexed_indirect, + ) { + compute_pass.set_pipeline(reset_indirect_batch_sets_pipeline); + compute_pass.set_bind_group(0, reset_indexed_indirect_batch_sets_bind_group, &[]); + let workgroup_count = indirect_parameters_buffers + .batch_set_count(true) + .div_ceil(WORKGROUP_SIZE); + if workgroup_count > 0 { + compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1); + } + + compute_pass.set_pipeline(build_indexed_indirect_params_pipeline); + compute_pass.set_bind_group(0, build_indirect_indexed_params_bind_group, &[]); + let workgroup_count = indirect_parameters_buffers + .indexed_batch_count() + .div_ceil(WORKGROUP_SIZE); + if workgroup_count > 0 { + compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1); + } + } + + // Build non-indexed indirect parameters. 
+ if let ( + Some(reset_non_indexed_indirect_batch_sets_bind_group), + Some(build_indirect_non_indexed_params_bind_group), + ) = ( + &build_indirect_params_bind_groups.reset_non_indexed_indirect_batch_sets, + &build_indirect_params_bind_groups.build_non_indexed_indirect, + ) { + compute_pass.set_pipeline(reset_indirect_batch_sets_pipeline); + compute_pass.set_bind_group(0, reset_non_indexed_indirect_batch_sets_bind_group, &[]); + let workgroup_count = indirect_parameters_buffers + .batch_set_count(false) + .div_ceil(WORKGROUP_SIZE); + if workgroup_count > 0 { + compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1); + } + + compute_pass.set_pipeline(build_non_indexed_indirect_params_pipeline); + compute_pass.set_bind_group(0, build_indirect_non_indexed_params_bind_group, &[]); + let workgroup_count = indirect_parameters_buffers + .non_indexed_batch_count() + .div_ceil(WORKGROUP_SIZE); + if workgroup_count > 0 { + compute_pass.dispatch_workgroups(workgroup_count as u32, 1, 1); + } + } + + Ok(()) +} + impl PreprocessPipelines { /// Returns true if the preprocessing and indirect parameters pipelines have /// been loaded or false otherwise. 
pub(crate) fn pipelines_are_loaded(&self, pipeline_cache: &PipelineCache) -> bool { self.direct_preprocess.is_loaded(pipeline_cache) - && self.gpu_culling_preprocess.is_loaded(pipeline_cache) - && self.build_indexed_indirect_params.is_loaded(pipeline_cache) && self - .build_non_indexed_indirect_params + .gpu_frustum_culling_preprocess + .is_loaded(pipeline_cache) + && self + .early_gpu_occlusion_culling_preprocess + .is_loaded(pipeline_cache) + && self + .late_gpu_occlusion_culling_preprocess + .is_loaded(pipeline_cache) + && self + .gpu_frustum_culling_build_indexed_indirect_params + .is_loaded(pipeline_cache) + && self + .gpu_frustum_culling_build_non_indexed_indirect_params + .is_loaded(pipeline_cache) + && self.early_phase.is_loaded(pipeline_cache) + && self.late_phase.is_loaded(pipeline_cache) + && self.main_phase.is_loaded(pipeline_cache) + } +} + +impl PreprocessPhasePipelines { + fn is_loaded(&self, pipeline_cache: &PipelineCache) -> bool { + self.reset_indirect_batch_sets.is_loaded(pipeline_cache) + && self + .gpu_occlusion_culling_build_indexed_indirect_params + .is_loaded(pipeline_cache) + && self + .gpu_occlusion_culling_build_non_indexed_indirect_params .is_loaded(pipeline_cache) } } @@ -586,6 +1125,13 @@ impl PreprocessPipeline { } } +impl ResetIndirectBatchSetsPipeline { + fn is_loaded(&self, pipeline_cache: &PipelineCache) -> bool { + self.pipeline_id + .is_some_and(|pipeline_id| pipeline_cache.get_compute_pipeline(pipeline_id).is_some()) + } +} + impl BuildIndirectParametersPipeline { /// Returns true if this pipeline has been loaded into the pipeline cache or /// false otherwise. 
@@ -600,17 +1146,32 @@ impl SpecializedComputePipeline for PreprocessPipeline { fn specialize(&self, key: Self::Key) -> ComputePipelineDescriptor { let mut shader_defs = vec![]; - if key.contains(PreprocessPipelineKey::GPU_CULLING) { + if key.contains(PreprocessPipelineKey::FRUSTUM_CULLING) { shader_defs.push("INDIRECT".into()); shader_defs.push("FRUSTUM_CULLING".into()); } + if key.contains(PreprocessPipelineKey::OCCLUSION_CULLING) { + shader_defs.push("OCCLUSION_CULLING".into()); + if key.contains(PreprocessPipelineKey::EARLY_PHASE) { + shader_defs.push("EARLY_PHASE".into()); + } else { + shader_defs.push("LATE_PHASE".into()); + } + } ComputePipelineDescriptor { label: Some( format!( "mesh preprocessing ({})", - if key.contains(PreprocessPipelineKey::GPU_CULLING) { - "GPU culling" + if key.contains( + PreprocessPipelineKey::OCCLUSION_CULLING + | PreprocessPipelineKey::EARLY_PHASE + ) { + "early GPU occlusion culling" + } else if key.contains(PreprocessPipelineKey::OCCLUSION_CULLING) { + "late GPU occlusion culling" + } else if key.contains(PreprocessPipelineKey::FRUSTUM_CULLING) { + "GPU frustum culling" } else { "direct" } @@ -618,7 +1179,14 @@ impl SpecializedComputePipeline for PreprocessPipeline { .into(), ), layout: vec![self.bind_group_layout.clone()], - push_constant_ranges: vec![], + push_constant_ranges: if key.contains(PreprocessPipelineKey::OCCLUSION_CULLING) { + vec![PushConstantRange { + stages: ShaderStages::COMPUTE, + range: 0..4, + }] + } else { + vec![] + }, shader: MESH_PREPROCESS_SHADER_HANDLE, shader_defs, entry_point: "main".into(), @@ -634,15 +1202,20 @@ impl FromWorld for PreprocessPipelines { // GPU culling bind group parameters are a superset of those in the CPU // culling (direct) shader. 
let direct_bind_group_layout_entries = preprocess_direct_bind_group_layout_entries(); - let gpu_culling_bind_group_layout_entries = preprocess_direct_bind_group_layout_entries() - .extend_sequential(( - // `indirect_parameters_metadata` - storage_buffer::(/* has_dynamic_offset= */ false), - // `mesh_culling_data` - storage_buffer_read_only::(/* has_dynamic_offset= */ false), - // `view` - uniform_buffer::(/* has_dynamic_offset= */ true), - )); + let gpu_frustum_culling_bind_group_layout_entries = gpu_culling_bind_group_layout_entries(); + let gpu_early_occlusion_culling_bind_group_layout_entries = + gpu_occlusion_culling_bind_group_layout_entries().extend_with_indices((( + 11, + storage_buffer::(/*has_dynamic_offset=*/ false), + ),)); + let gpu_late_occlusion_culling_bind_group_layout_entries = + gpu_occlusion_culling_bind_group_layout_entries(); + + let reset_indirect_batch_sets_bind_group_layout_entries = + DynamicBindGroupLayoutEntries::sequential( + ShaderStages::COMPUTE, + (storage_buffer::(false),), + ); // Indexed and non-indexed bind group parameters share all the bind // group layout entries except the final one. 
@@ -658,9 +1231,21 @@ impl FromWorld for PreprocessPipelines { "build mesh uniforms direct bind group layout", &direct_bind_group_layout_entries, ); - let gpu_culling_bind_group_layout = render_device.create_bind_group_layout( - "build mesh uniforms GPU culling bind group layout", - &gpu_culling_bind_group_layout_entries, + let gpu_frustum_culling_bind_group_layout = render_device.create_bind_group_layout( + "build mesh uniforms GPU frustum culling bind group layout", + &gpu_frustum_culling_bind_group_layout_entries, + ); + let gpu_early_occlusion_culling_bind_group_layout = render_device.create_bind_group_layout( + "build mesh uniforms GPU early occlusion culling bind group layout", + &gpu_early_occlusion_culling_bind_group_layout_entries, + ); + let gpu_late_occlusion_culling_bind_group_layout = render_device.create_bind_group_layout( + "build mesh uniforms GPU late occlusion culling bind group layout", + &gpu_late_occlusion_culling_bind_group_layout_entries, + ); + let reset_indirect_batch_sets_bind_group_layout = render_device.create_bind_group_layout( + "reset indirect batch sets bind group layout", + &reset_indirect_batch_sets_bind_group_layout_entries, ); let build_indexed_indirect_params_bind_group_layout = render_device .create_bind_group_layout( @@ -673,39 +1258,67 @@ impl FromWorld for PreprocessPipelines { &build_non_indexed_indirect_params_bind_group_layout_entries, ); + let preprocess_phase_pipelines = PreprocessPhasePipelines { + reset_indirect_batch_sets: ResetIndirectBatchSetsPipeline { + bind_group_layout: reset_indirect_batch_sets_bind_group_layout.clone(), + pipeline_id: None, + }, + gpu_occlusion_culling_build_indexed_indirect_params: BuildIndirectParametersPipeline { + bind_group_layout: build_indexed_indirect_params_bind_group_layout.clone(), + pipeline_id: None, + }, + gpu_occlusion_culling_build_non_indexed_indirect_params: + BuildIndirectParametersPipeline { + bind_group_layout: build_non_indexed_indirect_params_bind_group_layout.clone(), 
+ pipeline_id: None, + }, + }; + PreprocessPipelines { direct_preprocess: PreprocessPipeline { bind_group_layout: direct_bind_group_layout, pipeline_id: None, }, - gpu_culling_preprocess: PreprocessPipeline { - bind_group_layout: gpu_culling_bind_group_layout, + gpu_frustum_culling_preprocess: PreprocessPipeline { + bind_group_layout: gpu_frustum_culling_bind_group_layout, pipeline_id: None, }, - build_indexed_indirect_params: BuildIndirectParametersPipeline { - bind_group_layout: build_indexed_indirect_params_bind_group_layout, + early_gpu_occlusion_culling_preprocess: PreprocessPipeline { + bind_group_layout: gpu_early_occlusion_culling_bind_group_layout, pipeline_id: None, }, - build_non_indexed_indirect_params: BuildIndirectParametersPipeline { - bind_group_layout: build_non_indexed_indirect_params_bind_group_layout, + late_gpu_occlusion_culling_preprocess: PreprocessPipeline { + bind_group_layout: gpu_late_occlusion_culling_bind_group_layout, pipeline_id: None, }, + gpu_frustum_culling_build_indexed_indirect_params: BuildIndirectParametersPipeline { + bind_group_layout: build_indexed_indirect_params_bind_group_layout.clone(), + pipeline_id: None, + }, + gpu_frustum_culling_build_non_indexed_indirect_params: + BuildIndirectParametersPipeline { + bind_group_layout: build_non_indexed_indirect_params_bind_group_layout.clone(), + pipeline_id: None, + }, + early_phase: preprocess_phase_pipelines.clone(), + late_phase: preprocess_phase_pipelines.clone(), + main_phase: preprocess_phase_pipelines.clone(), } } } fn preprocess_direct_bind_group_layout_entries() -> DynamicBindGroupLayoutEntries { - DynamicBindGroupLayoutEntries::sequential( + DynamicBindGroupLayoutEntries::new_with_indices( ShaderStages::COMPUTE, ( // `current_input` - storage_buffer_read_only::(false), + (3, storage_buffer_read_only::(false)), // `previous_input` - storage_buffer_read_only::(false), + (4, storage_buffer_read_only::(false)), // `indices` - storage_buffer_read_only::(false), + (5, 
storage_buffer_read_only::(false)), // `output` - storage_buffer::(false), + (6, storage_buffer::(false)), ), ) } @@ -713,37 +1326,103 @@ fn preprocess_direct_bind_group_layout_entries() -> DynamicBindGroupLayoutEntrie // Returns the first 3 bind group layout entries shared between all invocations // of the indirect parameters building shader. fn build_indirect_params_bind_group_layout_entries() -> DynamicBindGroupLayoutEntries { - DynamicBindGroupLayoutEntries::sequential( + DynamicBindGroupLayoutEntries::new_with_indices( ShaderStages::COMPUTE, ( - storage_buffer_read_only::(false), - storage_buffer_read_only::(false), - storage_buffer::(false), + (0, storage_buffer_read_only::(false)), + ( + 1, + storage_buffer_read_only::(false), + ), + (2, storage_buffer::(false)), ), ) } /// A system that specializes the `mesh_preprocess.wgsl` and /// `build_indirect_params.wgsl` pipelines if necessary. +fn gpu_culling_bind_group_layout_entries() -> DynamicBindGroupLayoutEntries { + // GPU culling bind group parameters are a superset of those in the CPU + // culling (direct) shader. + preprocess_direct_bind_group_layout_entries().extend_with_indices(( + // `indirect_parameters` + ( + 7, + storage_buffer::(/* has_dynamic_offset= */ false), + ), + // `mesh_culling_data` + ( + 8, + storage_buffer_read_only::(/* has_dynamic_offset= */ false), + ), + // `view` + ( + 0, + uniform_buffer::(/* has_dynamic_offset= */ true), + ), + )) +} + +fn gpu_occlusion_culling_bind_group_layout_entries() -> DynamicBindGroupLayoutEntries { + gpu_culling_bind_group_layout_entries().extend_with_indices(( + ( + 2, + uniform_buffer::(/*has_dynamic_offset=*/ false), + ), + ( + 10, + texture_2d(TextureSampleType::Float { filterable: true }), + ), + ( + 12, + storage_buffer::( + /*has_dynamic_offset=*/ false, + ), + ), + )) +} + +/// A system that specializes the `mesh_preprocess.wgsl` pipelines if necessary. 
pub fn prepare_preprocess_pipelines( pipeline_cache: Res, render_device: Res, mut specialized_preprocess_pipelines: ResMut>, + mut specialized_reset_indirect_batch_sets_pipelines: ResMut< + SpecializedComputePipelines, + >, mut specialized_build_indirect_parameters_pipelines: ResMut< SpecializedComputePipelines, >, - mut preprocess_pipelines: ResMut, + preprocess_pipelines: ResMut, ) { + let preprocess_pipelines = preprocess_pipelines.into_inner(); + preprocess_pipelines.direct_preprocess.prepare( &pipeline_cache, &mut specialized_preprocess_pipelines, PreprocessPipelineKey::empty(), ); - preprocess_pipelines.gpu_culling_preprocess.prepare( + preprocess_pipelines.gpu_frustum_culling_preprocess.prepare( &pipeline_cache, &mut specialized_preprocess_pipelines, - PreprocessPipelineKey::GPU_CULLING, + PreprocessPipelineKey::FRUSTUM_CULLING, ); + preprocess_pipelines + .early_gpu_occlusion_culling_preprocess + .prepare( + &pipeline_cache, + &mut specialized_preprocess_pipelines, + PreprocessPipelineKey::FRUSTUM_CULLING + | PreprocessPipelineKey::OCCLUSION_CULLING + | PreprocessPipelineKey::EARLY_PHASE, + ); + preprocess_pipelines + .late_gpu_occlusion_culling_preprocess + .prepare( + &pipeline_cache, + &mut specialized_preprocess_pipelines, + PreprocessPipelineKey::FRUSTUM_CULLING | PreprocessPipelineKey::OCCLUSION_CULLING, + ); let mut build_indirect_parameters_pipeline_key = BuildIndirectParametersPipelineKey::empty(); @@ -758,18 +1437,61 @@ pub fn prepare_preprocess_pipelines( .insert(BuildIndirectParametersPipelineKey::MULTI_DRAW_INDIRECT_COUNT_SUPPORTED); } - preprocess_pipelines.build_indexed_indirect_params.prepare( - &pipeline_cache, - &mut specialized_build_indirect_parameters_pipelines, - build_indirect_parameters_pipeline_key | BuildIndirectParametersPipelineKey::INDEXED, - ); preprocess_pipelines - .build_non_indexed_indirect_params + .gpu_frustum_culling_build_indexed_indirect_params + .prepare( + &pipeline_cache, + &mut 
specialized_build_indirect_parameters_pipelines, + build_indirect_parameters_pipeline_key | BuildIndirectParametersPipelineKey::INDEXED, + ); + preprocess_pipelines + .gpu_frustum_culling_build_non_indexed_indirect_params .prepare( &pipeline_cache, &mut specialized_build_indirect_parameters_pipelines, build_indirect_parameters_pipeline_key, ); + + for (preprocess_phase_pipelines, build_indirect_parameters_phase_pipeline_key) in [ + ( + &mut preprocess_pipelines.early_phase, + BuildIndirectParametersPipelineKey::EARLY_PHASE, + ), + ( + &mut preprocess_pipelines.late_phase, + BuildIndirectParametersPipelineKey::LATE_PHASE, + ), + ( + &mut preprocess_pipelines.main_phase, + BuildIndirectParametersPipelineKey::MAIN_PHASE, + ), + ] { + preprocess_phase_pipelines + .reset_indirect_batch_sets + .prepare( + &pipeline_cache, + &mut specialized_reset_indirect_batch_sets_pipelines, + ); + preprocess_phase_pipelines + .gpu_occlusion_culling_build_indexed_indirect_params + .prepare( + &pipeline_cache, + &mut specialized_build_indirect_parameters_pipelines, + build_indirect_parameters_pipeline_key + | build_indirect_parameters_phase_pipeline_key + | BuildIndirectParametersPipelineKey::INDEXED + | BuildIndirectParametersPipelineKey::OCCLUSION_CULLING, + ); + preprocess_phase_pipelines + .gpu_occlusion_culling_build_non_indexed_indirect_params + .prepare( + &pipeline_cache, + &mut specialized_build_indirect_parameters_pipelines, + build_indirect_parameters_pipeline_key + | build_indirect_parameters_phase_pipeline_key + | BuildIndirectParametersPipelineKey::OCCLUSION_CULLING, + ); + } } impl PreprocessPipeline { @@ -788,6 +1510,22 @@ impl PreprocessPipeline { } } +impl SpecializedComputePipeline for ResetIndirectBatchSetsPipeline { + type Key = (); + + fn specialize(&self, _: Self::Key) -> ComputePipelineDescriptor { + ComputePipelineDescriptor { + label: Some("reset indirect batch sets".into()), + layout: vec![self.bind_group_layout.clone()], + push_constant_ranges: vec![], + 
shader: RESET_INDIRECT_BATCH_SETS_SHADER_HANDLE, + shader_defs: vec![], + entry_point: "main".into(), + zero_initialize_workgroup_memory: false, + } + } +} + impl SpecializedComputePipeline for BuildIndirectParametersPipeline { type Key = BuildIndirectParametersPipelineKey; @@ -799,13 +1537,39 @@ impl SpecializedComputePipeline for BuildIndirectParametersPipeline { if key.contains(BuildIndirectParametersPipelineKey::MULTI_DRAW_INDIRECT_COUNT_SUPPORTED) { shader_defs.push("MULTI_DRAW_INDIRECT_COUNT_SUPPORTED".into()); } + if key.contains(BuildIndirectParametersPipelineKey::OCCLUSION_CULLING) { + shader_defs.push("OCCLUSION_CULLING".into()); + } + if key.contains(BuildIndirectParametersPipelineKey::EARLY_PHASE) { + shader_defs.push("EARLY_PHASE".into()); + } + if key.contains(BuildIndirectParametersPipelineKey::LATE_PHASE) { + shader_defs.push("LATE_PHASE".into()); + } + if key.contains(BuildIndirectParametersPipelineKey::MAIN_PHASE) { + shader_defs.push("MAIN_PHASE".into()); + } + + let label = format!( + "{} build {}indexed indirect parameters", + if !key.contains(BuildIndirectParametersPipelineKey::OCCLUSION_CULLING) { + "frustum culling" + } else if key.contains(BuildIndirectParametersPipelineKey::EARLY_PHASE) { + "early occlusion culling" + } else if key.contains(BuildIndirectParametersPipelineKey::LATE_PHASE) { + "late occlusion culling" + } else { + "main occlusion culling" + }, + if key.contains(BuildIndirectParametersPipelineKey::INDEXED) { + "" + } else { + "non-" + } + ); ComputePipelineDescriptor { - label: if key.contains(BuildIndirectParametersPipelineKey::INDEXED) { - Some("build indexed indirect parameters".into()) - } else { - Some("build non-indexed indirect parameters".into()) - }, + label: Some(label.into()), layout: vec![self.bind_group_layout.clone()], push_constant_ranges: vec![], shader: BUILD_INDIRECT_PARAMS_SHADER_HANDLE, @@ -816,6 +1580,21 @@ impl SpecializedComputePipeline for BuildIndirectParametersPipeline { } } +impl 
ResetIndirectBatchSetsPipeline { + fn prepare( + &mut self, + pipeline_cache: &PipelineCache, + pipelines: &mut SpecializedComputePipelines, + ) { + if self.pipeline_id.is_some() { + return; + } + + let reset_indirect_batch_sets_pipeline_id = pipelines.specialize(pipeline_cache, self, ()); + self.pipeline_id = Some(reset_indirect_batch_sets_pipeline_id); + } +} + impl BuildIndirectParametersPipeline { fn prepare( &mut self, @@ -834,55 +1613,117 @@ impl BuildIndirectParametersPipeline { /// A system that attaches the mesh uniform buffers to the bind groups for the /// variants of the mesh preprocessing compute shader. +#[expect( + clippy::too_many_arguments, + reason = "it's a system that needs a lot of arguments" +)] pub fn prepare_preprocess_bind_groups( mut commands: Commands, + view_depth_pyramids: Query<(&ViewDepthPyramid, &PreviousViewUniformOffset)>, render_device: Res, batched_instance_buffers: Res>, indirect_parameters_buffers: Res, mesh_culling_data_buffer: Res, view_uniforms: Res, + previous_view_uniforms: Res, pipelines: Res, ) { // Grab the `BatchedInstanceBuffers`. - let batched_instance_buffers = batched_instance_buffers.into_inner(); + let BatchedInstanceBuffers { + data_buffer: ref data_buffer_vec, + ref work_item_buffers, + current_input_buffer: ref current_input_buffer_vec, + previous_input_buffer: ref previous_input_buffer_vec, + ref late_indexed_indirect_parameters_buffer, + ref late_non_indexed_indirect_parameters_buffer, + } = batched_instance_buffers.into_inner(); - let Some(current_input_buffer) = batched_instance_buffers - .current_input_buffer - .buffer() - .buffer() - else { + let (Some(current_input_buffer), Some(previous_input_buffer), Some(data_buffer)) = ( + current_input_buffer_vec.buffer().buffer(), + previous_input_buffer_vec.buffer().buffer(), + data_buffer_vec.buffer(), + ) else { return; }; - // Keep track of whether any of the phases will be drawn indirectly. 
If - // they are, then we'll need bind groups for the indirect parameters - // building shader too. + // Record whether we have any meshes that are to be drawn indirectly. If we + // don't, then we can skip building indirect parameters. let mut any_indirect = false; - for (view, phase_work_item_buffers) in &batched_instance_buffers.work_item_buffers { + // Loop over each view. + for (view, phase_work_item_buffers) in work_item_buffers { let mut bind_groups = TypeIdMap::default(); + // Loop over each phase. for (&phase_id, work_item_buffers) in phase_work_item_buffers { - if let Some(bind_group) = prepare_preprocess_bind_group_for_phase( - &render_device, - &pipelines, - &view_uniforms, - &indirect_parameters_buffers, - &mesh_culling_data_buffer, - batched_instance_buffers, - work_item_buffers, - &mut any_indirect, - ) { + // Create the `PreprocessBindGroupBuilder`. + let preprocess_bind_group_builder = PreprocessBindGroupBuilder { + view: *view, + late_indexed_indirect_parameters_buffer, + late_non_indexed_indirect_parameters_buffer, + render_device: &render_device, + indirect_parameters_buffers: &indirect_parameters_buffers, + mesh_culling_data_buffer: &mesh_culling_data_buffer, + view_uniforms: &view_uniforms, + previous_view_uniforms: &previous_view_uniforms, + pipelines: &pipelines, + current_input_buffer, + previous_input_buffer, + data_buffer, + }; + + // Depending on the type of work items we have, construct the + // appropriate bind groups. 
+ let (was_indirect, bind_group) = match *work_item_buffers { + PreprocessWorkItemBuffers::Direct(ref work_item_buffer) => ( + false, + preprocess_bind_group_builder + .create_direct_preprocess_bind_groups(work_item_buffer), + ), + + PreprocessWorkItemBuffers::Indirect { + indexed: ref indexed_work_item_buffer, + non_indexed: ref non_indexed_work_item_buffer, + gpu_occlusion_culling: Some(ref gpu_occlusion_culling_work_item_buffers), + } => ( + true, + preprocess_bind_group_builder + .create_indirect_occlusion_culling_preprocess_bind_groups( + &view_depth_pyramids, + indexed_work_item_buffer, + non_indexed_work_item_buffer, + gpu_occlusion_culling_work_item_buffers, + ), + ), + + PreprocessWorkItemBuffers::Indirect { + indexed: ref indexed_work_item_buffer, + non_indexed: ref non_indexed_work_item_buffer, + gpu_occlusion_culling: None, + } => ( + true, + preprocess_bind_group_builder + .create_indirect_frustum_culling_preprocess_bind_groups( + indexed_work_item_buffer, + non_indexed_work_item_buffer, + ), + ), + }; + + // Write that bind group in. + if let Some(bind_group) = bind_group { + any_indirect = any_indirect || was_indirect; bind_groups.insert(phase_id, bind_group); } } + // Save the bind groups. commands .entity(*view) .insert(PreprocessBindGroups(bind_groups)); } - // If any of the phases will be drawn indirectly, create the bind groups for + // Now, if there were any indirect draw commands, create the bind groups for // the indirect parameters building shader. if any_indirect { create_build_indirect_parameters_bind_groups( @@ -895,164 +1736,599 @@ pub fn prepare_preprocess_bind_groups( } } -// Creates the bind group for the GPU preprocessing shader for a single phase -// for a single view. 
-#[expect( - clippy::too_many_arguments, - reason = "it's a system that needs a bunch of parameters" -)] -fn prepare_preprocess_bind_group_for_phase( - render_device: &RenderDevice, - pipelines: &PreprocessPipelines, - view_uniforms: &ViewUniforms, - indirect_parameters_buffers: &IndirectParametersBuffers, - mesh_culling_data_buffer: &MeshCullingDataBuffer, - batched_instance_buffers: &BatchedInstanceBuffers, - work_item_buffers: &PreprocessWorkItemBuffers, - any_indirect: &mut bool, -) -> Option { - // Get the current input buffers. +/// A temporary structure that stores all the information needed to construct +/// bind groups for the mesh preprocessing shader. +struct PreprocessBindGroupBuilder<'a> { + /// The render-world entity corresponding to the current view. + view: Entity, + /// The indirect compute dispatch parameters buffer for indexed meshes in + /// the late prepass. + late_indexed_indirect_parameters_buffer: + &'a RawBufferVec, + /// The indirect compute dispatch parameters buffer for non-indexed meshes + /// in the late prepass. + late_non_indexed_indirect_parameters_buffer: + &'a RawBufferVec, + /// The device. + render_device: &'a RenderDevice, + /// The buffers that store indirect draw parameters. + indirect_parameters_buffers: &'a IndirectParametersBuffers, + /// The GPU buffer that stores the information needed to cull each mesh. + mesh_culling_data_buffer: &'a MeshCullingDataBuffer, + /// The GPU buffer that stores information about the view. + view_uniforms: &'a ViewUniforms, + /// The GPU buffer that stores information about the view from last frame. + previous_view_uniforms: &'a PreviousViewUniforms, + /// The pipelines for the mesh preprocessing shader. + pipelines: &'a PreprocessPipelines, + /// The GPU buffer containing the list of [`MeshInputUniform`]s for the + /// current frame. + current_input_buffer: &'a Buffer, + /// The GPU buffer containing the list of [`MeshInputUniform`]s for the + /// previous frame. 
+ previous_input_buffer: &'a Buffer, + /// The GPU buffer containing the list of [`MeshUniform`]s for the current + /// frame. + /// + /// This is the buffer containing the mesh's final transforms that the + /// shaders will write to. + data_buffer: &'a Buffer, +} - let BatchedInstanceBuffers { - data_buffer: ref data_buffer_vec, - current_input_buffer: ref current_input_buffer_vec, - previous_input_buffer: ref previous_input_buffer_vec, - .. - } = batched_instance_buffers; +impl<'a> PreprocessBindGroupBuilder<'a> { + /// Creates the bind groups for mesh preprocessing when GPU frustum culling + /// and GPU occlusion culling are both disabled. + fn create_direct_preprocess_bind_groups( + &self, + work_item_buffer: &BufferVec, + ) -> Option { + // Don't use `as_entire_binding()` here; the shader reads the array + // length and the underlying buffer may be longer than the actual size + // of the vector. + let work_item_buffer_size = NonZero::::try_from( + work_item_buffer.len() as u64 * u64::from(PreprocessWorkItem::min_size()), + ) + .ok(); - let current_input_buffer = current_input_buffer_vec.buffer().buffer()?; - let previous_input_buffer = previous_input_buffer_vec.buffer().buffer()?; - let data_buffer = data_buffer_vec.buffer()?; - - // Build the appropriate bind group, depending on whether we're drawing - // directly or indirectly. - - match *work_item_buffers { - PreprocessWorkItemBuffers::Direct(ref work_item_buffer_vec) => { - let work_item_buffer = work_item_buffer_vec.buffer()?; - - // Don't use `as_entire_binding()` here; the shader reads the array - // length and the underlying buffer may be longer than the actual size - // of the vector. 
- let work_item_buffer_size = NonZero::::try_from( - work_item_buffer_vec.len() as u64 * u64::from(PreprocessWorkItem::min_size()), - ) - .ok(); - - Some(PhasePreprocessBindGroups::Direct( - render_device.create_bind_group( - "preprocess_direct_bind_group", - &pipelines.direct_preprocess.bind_group_layout, - &BindGroupEntries::sequential(( - current_input_buffer.as_entire_binding(), - previous_input_buffer.as_entire_binding(), + Some(PhasePreprocessBindGroups::Direct( + self.render_device.create_bind_group( + "preprocess_direct_bind_group", + &self.pipelines.direct_preprocess.bind_group_layout, + &BindGroupEntries::with_indices(( + (3, self.current_input_buffer.as_entire_binding()), + (4, self.previous_input_buffer.as_entire_binding()), + ( + 5, BindingResource::Buffer(BufferBinding { - buffer: work_item_buffer, + buffer: work_item_buffer.buffer()?, offset: 0, size: work_item_buffer_size, }), - data_buffer.as_entire_binding(), - )), - ), - )) + ), + (6, self.data_buffer.as_entire_binding()), + )), + ), + )) + } + + /// Creates the bind groups for mesh preprocessing when GPU occlusion + /// culling is enabled. + fn create_indirect_occlusion_culling_preprocess_bind_groups( + &self, + view_depth_pyramids: &Query<(&ViewDepthPyramid, &PreviousViewUniformOffset)>, + indexed_work_item_buffer: &BufferVec, + non_indexed_work_item_buffer: &BufferVec, + gpu_occlusion_culling_work_item_buffers: &GpuOcclusionCullingWorkItemBuffers, + ) -> Option { + let GpuOcclusionCullingWorkItemBuffers { + late_indexed: ref late_indexed_work_item_buffer, + late_non_indexed: ref late_non_indexed_work_item_buffer, + .. 
+ } = *gpu_occlusion_culling_work_item_buffers; + + let (view_depth_pyramid, previous_view_uniform_offset) = + view_depth_pyramids.get(self.view).ok()?; + + Some(PhasePreprocessBindGroups::IndirectOcclusionCulling { + early_indexed: self.create_indirect_occlusion_culling_early_indexed_bind_group( + view_depth_pyramid, + previous_view_uniform_offset, + indexed_work_item_buffer, + late_indexed_work_item_buffer, + ), + + early_non_indexed: self.create_indirect_occlusion_culling_early_non_indexed_bind_group( + view_depth_pyramid, + previous_view_uniform_offset, + non_indexed_work_item_buffer, + late_non_indexed_work_item_buffer, + ), + + late_indexed: self.create_indirect_occlusion_culling_late_indexed_bind_group( + view_depth_pyramid, + previous_view_uniform_offset, + late_indexed_work_item_buffer, + ), + + late_non_indexed: self.create_indirect_occlusion_culling_late_non_indexed_bind_group( + view_depth_pyramid, + previous_view_uniform_offset, + late_non_indexed_work_item_buffer, + ), + }) + } + + /// Creates the bind group for the first phase of mesh preprocessing of + /// indexed meshes when GPU occlusion culling is enabled. 
+ fn create_indirect_occlusion_culling_early_indexed_bind_group( + &self, + view_depth_pyramid: &ViewDepthPyramid, + previous_view_uniform_offset: &PreviousViewUniformOffset, + indexed_work_item_buffer: &BufferVec, + late_indexed_work_item_buffer: &UninitBufferVec, + ) -> Option { + let mesh_culling_data_buffer = self.mesh_culling_data_buffer.buffer()?; + let view_uniforms_binding = self.view_uniforms.uniforms.binding()?; + let previous_view_buffer = self.previous_view_uniforms.uniforms.buffer()?; + + match ( + self.indirect_parameters_buffers.indexed_metadata_buffer(), + indexed_work_item_buffer.buffer(), + late_indexed_work_item_buffer.buffer(), + self.late_indexed_indirect_parameters_buffer.buffer(), + ) { + ( + Some(indexed_metadata_buffer), + Some(indexed_work_item_gpu_buffer), + Some(late_indexed_work_item_gpu_buffer), + Some(late_indexed_indirect_parameters_buffer), + ) => { + // Don't use `as_entire_binding()` here; the shader reads the array + // length and the underlying buffer may be longer than the actual size + // of the vector. 
+ let indexed_work_item_buffer_size = NonZero::::try_from( + indexed_work_item_buffer.len() as u64 + * u64::from(PreprocessWorkItem::min_size()), + ) + .ok(); + + Some( + self.render_device.create_bind_group( + "preprocess_early_indexed_gpu_occlusion_culling_bind_group", + &self + .pipelines + .early_gpu_occlusion_culling_preprocess + .bind_group_layout, + &BindGroupEntries::with_indices(( + (3, self.current_input_buffer.as_entire_binding()), + (4, self.previous_input_buffer.as_entire_binding()), + ( + 5, + BindingResource::Buffer(BufferBinding { + buffer: indexed_work_item_gpu_buffer, + offset: 0, + size: indexed_work_item_buffer_size, + }), + ), + (6, self.data_buffer.as_entire_binding()), + (7, indexed_metadata_buffer.as_entire_binding()), + (8, mesh_culling_data_buffer.as_entire_binding()), + (0, view_uniforms_binding.clone()), + (10, &view_depth_pyramid.all_mips), + ( + 2, + BufferBinding { + buffer: previous_view_buffer, + offset: previous_view_uniform_offset.offset as u64, + size: NonZeroU64::new(size_of::() as u64), + }, + ), + ( + 11, + BufferBinding { + buffer: late_indexed_work_item_gpu_buffer, + offset: 0, + size: indexed_work_item_buffer_size, + }, + ), + ( + 12, + BufferBinding { + buffer: late_indexed_indirect_parameters_buffer, + offset: 0, + size: NonZeroU64::new( + late_indexed_indirect_parameters_buffer.size(), + ), + }, + ), + )), + ), + ) + } + _ => None, } + } - PreprocessWorkItemBuffers::Indirect { - indexed: ref indexed_buffer, - non_indexed: ref non_indexed_buffer, - } => { - // For indirect drawing, we need two separate bind groups, one for indexed meshes and one for non-indexed meshes. + /// Creates the bind group for the first phase of mesh preprocessing of + /// non-indexed meshes when GPU occlusion culling is enabled. 
+ fn create_indirect_occlusion_culling_early_non_indexed_bind_group( + &self, + view_depth_pyramid: &ViewDepthPyramid, + previous_view_uniform_offset: &PreviousViewUniformOffset, + non_indexed_work_item_buffer: &BufferVec, + late_non_indexed_work_item_buffer: &UninitBufferVec, + ) -> Option { + let mesh_culling_data_buffer = self.mesh_culling_data_buffer.buffer()?; + let view_uniforms_binding = self.view_uniforms.uniforms.binding()?; + let previous_view_buffer = self.previous_view_uniforms.uniforms.buffer()?; - let mesh_culling_data_buffer = mesh_culling_data_buffer.buffer()?; - let view_uniforms_binding = view_uniforms.uniforms.binding()?; + match ( + self.indirect_parameters_buffers + .non_indexed_metadata_buffer(), + non_indexed_work_item_buffer.buffer(), + late_non_indexed_work_item_buffer.buffer(), + self.late_non_indexed_indirect_parameters_buffer.buffer(), + ) { + ( + Some(non_indexed_metadata_buffer), + Some(non_indexed_work_item_gpu_buffer), + Some(late_non_indexed_work_item_buffer), + Some(late_non_indexed_indirect_parameters_buffer), + ) => { + // Don't use `as_entire_binding()` here; the shader reads the array + // length and the underlying buffer may be longer than the actual size + // of the vector. + let non_indexed_work_item_buffer_size = NonZero::::try_from( + non_indexed_work_item_buffer.len() as u64 + * u64::from(PreprocessWorkItem::min_size()), + ) + .ok(); - let indexed_bind_group = match ( - indexed_buffer.buffer(), - indirect_parameters_buffers.indexed_metadata_buffer(), - ) { - ( - Some(indexed_work_item_buffer), - Some(indexed_indirect_parameters_metadata_buffer), - ) => { - // Don't use `as_entire_binding()` here; the shader reads the array - // length and the underlying buffer may be longer than the actual size - // of the vector. 
- let indexed_work_item_buffer_size = NonZero::::try_from( - indexed_buffer.len() as u64 * u64::from(PreprocessWorkItem::min_size()), - ) - .ok(); - Some(render_device.create_bind_group( - "preprocess_indexed_indirect_gpu_culling_bind_group", - &pipelines.gpu_culling_preprocess.bind_group_layout, - &BindGroupEntries::sequential(( - current_input_buffer.as_entire_binding(), - previous_input_buffer.as_entire_binding(), - BindingResource::Buffer(BufferBinding { - buffer: indexed_work_item_buffer, - offset: 0, - size: indexed_work_item_buffer_size, - }), - data_buffer.as_entire_binding(), - indexed_indirect_parameters_metadata_buffer.as_entire_binding(), - mesh_culling_data_buffer.as_entire_binding(), - view_uniforms_binding.clone(), + Some( + self.render_device.create_bind_group( + "preprocess_early_non_indexed_gpu_occlusion_culling_bind_group", + &self + .pipelines + .early_gpu_occlusion_culling_preprocess + .bind_group_layout, + &BindGroupEntries::with_indices(( + (3, self.current_input_buffer.as_entire_binding()), + (4, self.previous_input_buffer.as_entire_binding()), + ( + 5, + BindingResource::Buffer(BufferBinding { + buffer: non_indexed_work_item_gpu_buffer, + offset: 0, + size: non_indexed_work_item_buffer_size, + }), + ), + (6, self.data_buffer.as_entire_binding()), + (7, non_indexed_metadata_buffer.as_entire_binding()), + (8, mesh_culling_data_buffer.as_entire_binding()), + (0, view_uniforms_binding.clone()), + (10, &view_depth_pyramid.all_mips), + ( + 2, + BufferBinding { + buffer: previous_view_buffer, + offset: previous_view_uniform_offset.offset as u64, + size: NonZeroU64::new(size_of::() as u64), + }, + ), + ( + 11, + BufferBinding { + buffer: late_non_indexed_work_item_buffer, + offset: 0, + size: non_indexed_work_item_buffer_size, + }, + ), + ( + 12, + BufferBinding { + buffer: late_non_indexed_indirect_parameters_buffer, + offset: 0, + size: NonZeroU64::new( + late_non_indexed_indirect_parameters_buffer.size(), + ), + }, + ), )), - )) - } - _ => None, 
- }; + ), + ) + } + _ => None, + } + } - let non_indexed_bind_group = match ( - non_indexed_buffer.buffer(), - indirect_parameters_buffers.non_indexed_metadata_buffer(), - ) { - ( - Some(non_indexed_work_item_buffer), - Some(non_indexed_indirect_parameters_metadata_buffer), - ) => { - // Don't use `as_entire_binding()` here; the shader reads the array - // length and the underlying buffer may be longer than the actual size - // of the vector. - let non_indexed_work_item_buffer_size = NonZero::::try_from( - non_indexed_buffer.len() as u64 * u64::from(PreprocessWorkItem::min_size()), - ) - .ok(); - Some(render_device.create_bind_group( - "preprocess_non_indexed_indirect_gpu_culling_bind_group", - &pipelines.gpu_culling_preprocess.bind_group_layout, - &BindGroupEntries::sequential(( - current_input_buffer.as_entire_binding(), - previous_input_buffer.as_entire_binding(), - BindingResource::Buffer(BufferBinding { - buffer: non_indexed_work_item_buffer, - offset: 0, - size: non_indexed_work_item_buffer_size, - }), - data_buffer.as_entire_binding(), - non_indexed_indirect_parameters_metadata_buffer.as_entire_binding(), - mesh_culling_data_buffer.as_entire_binding(), - view_uniforms_binding, + /// Creates the bind group for the second phase of mesh preprocessing of + /// indexed meshes when GPU occlusion culling is enabled. 
+ fn create_indirect_occlusion_culling_late_indexed_bind_group( + &self, + view_depth_pyramid: &ViewDepthPyramid, + previous_view_uniform_offset: &PreviousViewUniformOffset, + late_indexed_work_item_buffer: &UninitBufferVec, + ) -> Option { + let mesh_culling_data_buffer = self.mesh_culling_data_buffer.buffer()?; + let view_uniforms_binding = self.view_uniforms.uniforms.binding()?; + let previous_view_buffer = self.previous_view_uniforms.uniforms.buffer()?; + + match ( + self.indirect_parameters_buffers.indexed_metadata_buffer(), + late_indexed_work_item_buffer.buffer(), + self.late_indexed_indirect_parameters_buffer.buffer(), + ) { + ( + Some(indexed_metadata_buffer), + Some(late_indexed_work_item_gpu_buffer), + Some(late_indexed_indirect_parameters_buffer), + ) => { + // Don't use `as_entire_binding()` here; the shader reads the array + // length and the underlying buffer may be longer than the actual size + // of the vector. + let late_indexed_work_item_buffer_size = NonZero::::try_from( + late_indexed_work_item_buffer.len() as u64 + * u64::from(PreprocessWorkItem::min_size()), + ) + .ok(); + + Some( + self.render_device.create_bind_group( + "preprocess_late_indexed_gpu_occlusion_culling_bind_group", + &self + .pipelines + .late_gpu_occlusion_culling_preprocess + .bind_group_layout, + &BindGroupEntries::with_indices(( + (3, self.current_input_buffer.as_entire_binding()), + (4, self.previous_input_buffer.as_entire_binding()), + ( + 5, + BindingResource::Buffer(BufferBinding { + buffer: late_indexed_work_item_gpu_buffer, + offset: 0, + size: late_indexed_work_item_buffer_size, + }), + ), + (6, self.data_buffer.as_entire_binding()), + (7, indexed_metadata_buffer.as_entire_binding()), + (8, mesh_culling_data_buffer.as_entire_binding()), + (0, view_uniforms_binding.clone()), + (10, &view_depth_pyramid.all_mips), + ( + 2, + BufferBinding { + buffer: previous_view_buffer, + offset: previous_view_uniform_offset.offset as u64, + size: NonZeroU64::new(size_of::() as u64), 
+ }, + ), + ( + 12, + BufferBinding { + buffer: late_indexed_indirect_parameters_buffer, + offset: 0, + size: NonZeroU64::new( + late_indexed_indirect_parameters_buffer.size(), + ), + }, + ), )), - )) - } - _ => None, - }; + ), + ) + } + _ => None, + } + } - // Note that we found phases that will be drawn indirectly so that - // we remember to build the bind groups for the indirect parameter - // building shader. - *any_indirect = true; + /// Creates the bind group for the second phase of mesh preprocessing of + /// non-indexed meshes when GPU occlusion culling is enabled. + fn create_indirect_occlusion_culling_late_non_indexed_bind_group( + &self, + view_depth_pyramid: &ViewDepthPyramid, + previous_view_uniform_offset: &PreviousViewUniformOffset, + late_non_indexed_work_item_buffer: &UninitBufferVec, + ) -> Option { + let mesh_culling_data_buffer = self.mesh_culling_data_buffer.buffer()?; + let view_uniforms_binding = self.view_uniforms.uniforms.binding()?; + let previous_view_buffer = self.previous_view_uniforms.uniforms.buffer()?; - Some(PhasePreprocessBindGroups::Indirect { - indexed: indexed_bind_group, - non_indexed: non_indexed_bind_group, - }) + match ( + self.indirect_parameters_buffers + .non_indexed_metadata_buffer(), + late_non_indexed_work_item_buffer.buffer(), + self.late_non_indexed_indirect_parameters_buffer.buffer(), + ) { + ( + Some(non_indexed_metadata_buffer), + Some(non_indexed_work_item_gpu_buffer), + Some(late_non_indexed_indirect_parameters_buffer), + ) => { + // Don't use `as_entire_binding()` here; the shader reads the array + // length and the underlying buffer may be longer than the actual size + // of the vector. 
+ let non_indexed_work_item_buffer_size = NonZero::::try_from( + late_non_indexed_work_item_buffer.len() as u64 + * u64::from(PreprocessWorkItem::min_size()), + ) + .ok(); + + Some( + self.render_device.create_bind_group( + "preprocess_late_non_indexed_gpu_occlusion_culling_bind_group", + &self + .pipelines + .late_gpu_occlusion_culling_preprocess + .bind_group_layout, + &BindGroupEntries::with_indices(( + (3, self.current_input_buffer.as_entire_binding()), + (4, self.previous_input_buffer.as_entire_binding()), + ( + 5, + BindingResource::Buffer(BufferBinding { + buffer: non_indexed_work_item_gpu_buffer, + offset: 0, + size: non_indexed_work_item_buffer_size, + }), + ), + (6, self.data_buffer.as_entire_binding()), + (7, non_indexed_metadata_buffer.as_entire_binding()), + (8, mesh_culling_data_buffer.as_entire_binding()), + (0, view_uniforms_binding.clone()), + (10, &view_depth_pyramid.all_mips), + ( + 2, + BufferBinding { + buffer: previous_view_buffer, + offset: previous_view_uniform_offset.offset as u64, + size: NonZeroU64::new(size_of::() as u64), + }, + ), + ( + 12, + BufferBinding { + buffer: late_non_indexed_indirect_parameters_buffer, + offset: 0, + size: NonZeroU64::new( + late_non_indexed_indirect_parameters_buffer.size(), + ), + }, + ), + )), + ), + ) + } + _ => None, + } + } + + /// Creates the bind groups for mesh preprocessing when GPU frustum culling + /// is enabled, but GPU occlusion culling is disabled. 
+ fn create_indirect_frustum_culling_preprocess_bind_groups( + &self, + indexed_work_item_buffer: &BufferVec, + non_indexed_work_item_buffer: &BufferVec, + ) -> Option { + Some(PhasePreprocessBindGroups::IndirectFrustumCulling { + indexed: self + .create_indirect_frustum_culling_indexed_bind_group(indexed_work_item_buffer), + non_indexed: self.create_indirect_frustum_culling_non_indexed_bind_group( + non_indexed_work_item_buffer, + ), + }) + } + + /// Creates the bind group for mesh preprocessing of indexed meshes when GPU + /// frustum culling is enabled, but GPU occlusion culling is disabled. + fn create_indirect_frustum_culling_indexed_bind_group( + &self, + indexed_work_item_buffer: &BufferVec, + ) -> Option { + let mesh_culling_data_buffer = self.mesh_culling_data_buffer.buffer()?; + let view_uniforms_binding = self.view_uniforms.uniforms.binding()?; + + match ( + self.indirect_parameters_buffers.indexed_metadata_buffer(), + indexed_work_item_buffer.buffer(), + ) { + (Some(indexed_metadata_buffer), Some(indexed_work_item_gpu_buffer)) => { + // Don't use `as_entire_binding()` here; the shader reads the array + // length and the underlying buffer may be longer than the actual size + // of the vector. 
+ let indexed_work_item_buffer_size = NonZero::::try_from( + indexed_work_item_buffer.len() as u64 + * u64::from(PreprocessWorkItem::min_size()), + ) + .ok(); + + Some( + self.render_device.create_bind_group( + "preprocess_gpu_indexed_frustum_culling_bind_group", + &self + .pipelines + .gpu_frustum_culling_preprocess + .bind_group_layout, + &BindGroupEntries::with_indices(( + (3, self.current_input_buffer.as_entire_binding()), + (4, self.previous_input_buffer.as_entire_binding()), + ( + 5, + BindingResource::Buffer(BufferBinding { + buffer: indexed_work_item_gpu_buffer, + offset: 0, + size: indexed_work_item_buffer_size, + }), + ), + (6, self.data_buffer.as_entire_binding()), + (7, indexed_metadata_buffer.as_entire_binding()), + (8, mesh_culling_data_buffer.as_entire_binding()), + (0, view_uniforms_binding.clone()), + )), + ), + ) + } + _ => None, + } + } + + /// Creates the bind group for mesh preprocessing of non-indexed meshes when + /// GPU frustum culling is enabled, but GPU occlusion culling is disabled. + fn create_indirect_frustum_culling_non_indexed_bind_group( + &self, + non_indexed_work_item_buffer: &BufferVec, + ) -> Option { + let mesh_culling_data_buffer = self.mesh_culling_data_buffer.buffer()?; + let view_uniforms_binding = self.view_uniforms.uniforms.binding()?; + + match ( + self.indirect_parameters_buffers + .non_indexed_metadata_buffer(), + non_indexed_work_item_buffer.buffer(), + ) { + (Some(non_indexed_metadata_buffer), Some(non_indexed_work_item_gpu_buffer)) => { + // Don't use `as_entire_binding()` here; the shader reads the array + // length and the underlying buffer may be longer than the actual size + // of the vector. 
+ let non_indexed_work_item_buffer_size = NonZero::::try_from( + non_indexed_work_item_buffer.len() as u64 + * u64::from(PreprocessWorkItem::min_size()), + ) + .ok(); + + Some( + self.render_device.create_bind_group( + "preprocess_gpu_non_indexed_frustum_culling_bind_group", + &self + .pipelines + .gpu_frustum_culling_preprocess + .bind_group_layout, + &BindGroupEntries::with_indices(( + (3, self.current_input_buffer.as_entire_binding()), + (4, self.previous_input_buffer.as_entire_binding()), + ( + 5, + BindingResource::Buffer(BufferBinding { + buffer: non_indexed_work_item_gpu_buffer, + offset: 0, + size: non_indexed_work_item_buffer_size, + }), + ), + (6, self.data_buffer.as_entire_binding()), + (7, non_indexed_metadata_buffer.as_entire_binding()), + (8, mesh_culling_data_buffer.as_entire_binding()), + (0, view_uniforms_binding.clone()), + )), + ), + ) + } + _ => None, } } } /// A system that creates bind groups from the indirect parameters metadata and -/// data buffers for the indirect parameter building shader. +/// data buffers for the indirect batch set reset shader and the indirect +/// parameter building shader. fn create_build_indirect_parameters_bind_groups( commands: &mut Commands, render_device: &RenderDevice, @@ -1061,7 +2337,45 @@ fn create_build_indirect_parameters_bind_groups( indirect_parameters_buffer: &IndirectParametersBuffers, ) { commands.insert_resource(BuildIndirectParametersBindGroups { - indexed: match ( + reset_indexed_indirect_batch_sets: match ( + indirect_parameters_buffer.indexed_batch_sets_buffer(), + ) { + (Some(indexed_batch_sets_buffer),) => Some( + render_device.create_bind_group( + "reset_indexed_indirect_batch_sets_bind_group", + // The early bind group is good for the main phase and late + // phase too. They bind the same buffers. 
+ &pipelines + .early_phase + .reset_indirect_batch_sets + .bind_group_layout, + &BindGroupEntries::sequential((indexed_batch_sets_buffer.as_entire_binding(),)), + ), + ), + _ => None, + }, + + reset_non_indexed_indirect_batch_sets: match ( + indirect_parameters_buffer.non_indexed_batch_sets_buffer(), + ) { + (Some(non_indexed_batch_sets_buffer),) => Some( + render_device.create_bind_group( + "reset_non_indexed_indirect_batch_sets_bind_group", + // The early bind group is good for the main phase and late + // phase too. They bind the same buffers. + &pipelines + .early_phase + .reset_indirect_batch_sets + .bind_group_layout, + &BindGroupEntries::sequential(( + non_indexed_batch_sets_buffer.as_entire_binding(), + )), + ), + ), + _ => None, + }, + + build_indexed_indirect: match ( indirect_parameters_buffer.indexed_metadata_buffer(), indirect_parameters_buffer.indexed_data_buffer(), indirect_parameters_buffer.indexed_batch_sets_buffer(), @@ -1070,28 +2384,35 @@ fn create_build_indirect_parameters_bind_groups( Some(indexed_indirect_parameters_metadata_buffer), Some(indexed_indirect_parameters_data_buffer), Some(indexed_batch_sets_buffer), - ) => Some(render_device.create_bind_group( - "build_indexed_indirect_parameters_bind_group", - &pipelines.build_indexed_indirect_params.bind_group_layout, - &BindGroupEntries::sequential(( - current_input_buffer.as_entire_binding(), - // Don't use `as_entire_binding` here; the shader reads - // the length and `RawBufferVec` overallocates. - BufferBinding { - buffer: indexed_indirect_parameters_metadata_buffer, - offset: 0, - size: NonZeroU64::new( - indirect_parameters_buffer.indexed_batch_count() as u64 - * size_of::() as u64, - ), - }, - indexed_batch_sets_buffer.as_entire_binding(), - indexed_indirect_parameters_data_buffer.as_entire_binding(), - )), - )), + ) => Some( + render_device.create_bind_group( + "build_indexed_indirect_parameters_bind_group", + // The frustum culling bind group is good for occlusion culling + // too. 
They bind the same buffers. + &pipelines + .gpu_frustum_culling_build_indexed_indirect_params + .bind_group_layout, + &BindGroupEntries::sequential(( + current_input_buffer.as_entire_binding(), + // Don't use `as_entire_binding` here; the shader reads + // the length and `RawBufferVec` overallocates. + BufferBinding { + buffer: indexed_indirect_parameters_metadata_buffer, + offset: 0, + size: NonZeroU64::new( + indirect_parameters_buffer.indexed_batch_count() as u64 + * size_of::() as u64, + ), + }, + indexed_batch_sets_buffer.as_entire_binding(), + indexed_indirect_parameters_data_buffer.as_entire_binding(), + )), + ), + ), _ => None, }, - non_indexed: match ( + + build_non_indexed_indirect: match ( indirect_parameters_buffer.non_indexed_metadata_buffer(), indirect_parameters_buffer.non_indexed_data_buffer(), indirect_parameters_buffer.non_indexed_batch_sets_buffer(), @@ -1103,8 +2424,10 @@ fn create_build_indirect_parameters_bind_groups( ) => Some( render_device.create_bind_group( "build_non_indexed_indirect_parameters_bind_group", + // The frustum culling bind group is good for occlusion culling + // too. They bind the same buffers. 
&pipelines - .build_non_indexed_indirect_params + .gpu_frustum_culling_build_non_indexed_indirect_params .bind_group_layout, &BindGroupEntries::sequential(( current_input_buffer.as_entire_binding(), diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index 96b783ea66..b7e8fcf6f1 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -87,6 +87,7 @@ pub const MESH_FUNCTIONS_HANDLE: Handle = Handle::weak_from_u128(6300874 pub const MESH_SHADER_HANDLE: Handle = Handle::weak_from_u128(3252377289100772450); pub const SKINNING_HANDLE: Handle = Handle::weak_from_u128(13215291596265391738); pub const MORPH_HANDLE: Handle = Handle::weak_from_u128(970982813587607345); +pub const OCCLUSION_CULLING_HANDLE: Handle = Handle::weak_from_u128(285365001154292827); /// How many textures are allowed in the view bind group layout (`@group(0)`) before /// broader compatibility with WebGL and WebGPU is at risk, due to the minimum guaranteed @@ -134,6 +135,12 @@ impl Plugin for MeshRenderPlugin { load_internal_asset!(app, MESH_SHADER_HANDLE, "mesh.wgsl", Shader::from_wgsl); load_internal_asset!(app, SKINNING_HANDLE, "skinning.wgsl", Shader::from_wgsl); load_internal_asset!(app, MORPH_HANDLE, "morph.wgsl", Shader::from_wgsl); + load_internal_asset!( + app, + OCCLUSION_CULLING_HANDLE, + "occlusion_culling.wgsl", + Shader::from_wgsl + ); if app.get_sub_app(RenderApp).is_none() { return; @@ -1254,9 +1261,10 @@ pub fn extract_meshes_for_gpu_building( mut removed_visibilities_query: Extract>, mut removed_global_transforms_query: Extract>, mut removed_meshes_query: Extract>, - cameras_query: Extract, Without)>>, + gpu_culling_query: Extract, Without)>>, ) { - let any_gpu_culling = !cameras_query.is_empty(); + let any_gpu_culling = !gpu_culling_query.is_empty(); + for render_mesh_instance_queue in render_mesh_instance_queues.iter_mut() { render_mesh_instance_queue.init(any_gpu_culling); } @@ -1761,7 +1769,8 @@ impl 
GetFullBatchData for MeshPipeline { Some(batch_set_index) => u32::from(batch_set_index), None => !0, }, - instance_count: 0, + early_instance_count: 0, + late_instance_count: 0, }; if indexed { diff --git a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl index df73454a3e..315dd13d3f 100644 --- a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl +++ b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl @@ -1,14 +1,28 @@ -// GPU mesh uniform building. +// GPU mesh transforming and culling. // // This is a compute shader that expands each `MeshInputUniform` out to a full -// `MeshUniform` for each view before rendering. (Thus `MeshInputUniform` -// and `MeshUniform` are in a 1:N relationship.) It runs in parallel for all -// meshes for all views. As part of this process, the shader gathers each -// mesh's transform on the previous frame and writes it into the `MeshUniform` -// so that TAA works. +// `MeshUniform` for each view before rendering. (Thus `MeshInputUniform` and +// `MeshUniform` are in a 1:N relationship.) It runs in parallel for all meshes +// for all views. As part of this process, the shader gathers each mesh's +// transform on the previous frame and writes it into the `MeshUniform` so that +// TAA works. It also performs frustum culling and occlusion culling, if +// requested. +// +// If occlusion culling is on, this shader runs twice: once to prepare the +// meshes that were visible last frame, and once to prepare the meshes that +// weren't visible last frame but became visible this frame. The two invocations +// are known as *early mesh preprocessing* and *late mesh preprocessing* +// respectively. 
+#import bevy_pbr::mesh_preprocess_types::{IndirectParametersMetadata, MeshInput} #import bevy_pbr::mesh_types::{Mesh, MESH_FLAGS_NO_FRUSTUM_CULLING_BIT} -#import bevy_pbr::mesh_preprocess_types::{MeshInput, IndirectParametersMetadata} +#import bevy_pbr::mesh_view_bindings::view +#import bevy_pbr::occlusion_culling +#import bevy_pbr::prepass_bindings::previous_view_uniforms +#import bevy_pbr::view_transformations::{ + position_world_to_ndc, position_world_to_view, ndc_to_uv, view_z_to_depth_ndc, + position_world_to_prev_ndc, position_world_to_prev_view, prev_view_z_to_depth_ndc +} #import bevy_render::maths #import bevy_render::view::View @@ -36,20 +50,48 @@ struct PreprocessWorkItem { indirect_parameters_index: u32, } +// The parameters for the indirect compute dispatch for the late mesh +// preprocessing phase. +struct LatePreprocessWorkItemIndirectParameters { + // The number of workgroups we're going to dispatch. + // + // This value should always be equal to `ceil(work_item_count / 64)`. + dispatch_x: atomic, + // The number of workgroups in the Y direction; always 1. + dispatch_y: u32, + // The number of workgroups in the Z direction; always 1. + dispatch_z: u32, + // The precise number of work items. + work_item_count: atomic, + // Padding. + // + // This isn't the usual structure padding; it's needed because some hardware + // requires indirect compute dispatch parameters to be aligned on 64-byte + // boundaries. + pad: vec4, +} + +// These have to be in a structure because of Naga limitations on DX12. +struct PushConstants { + // The offset into the `late_preprocess_work_item_indirect_parameters` + // buffer. + late_preprocess_work_item_indirect_offset: u32, +} + // The current frame's `MeshInput`. -@group(0) @binding(0) var current_input: array; +@group(0) @binding(3) var current_input: array; // The `MeshInput` values from the previous frame. 
-@group(0) @binding(1) var previous_input: array; +@group(0) @binding(4) var previous_input: array; // Indices into the `MeshInput` buffer. // // There may be many indices that map to the same `MeshInput`. -@group(0) @binding(2) var work_items: array; +@group(0) @binding(5) var work_items: array; // The output array of `Mesh`es. -@group(0) @binding(3) var output: array; +@group(0) @binding(6) var output: array; #ifdef INDIRECT // The array of indirect parameters for drawcalls. -@group(0) @binding(4) var indirect_parameters_metadata: +@group(0) @binding(7) var indirect_parameters_metadata: array; #endif @@ -57,11 +99,24 @@ struct PreprocessWorkItem { // Data needed to cull the meshes. // // At the moment, this consists only of AABBs. -@group(0) @binding(5) var mesh_culling_data: array; +@group(0) @binding(8) var mesh_culling_data: array; +#endif // FRUSTUM_CULLING -// The view data, including the view matrix. -@group(0) @binding(6) var view: View; +#ifdef OCCLUSION_CULLING +@group(0) @binding(10) var depth_pyramid: texture_2d; +#ifdef EARLY_PHASE +@group(0) @binding(11) var late_preprocess_work_items: + array; +#endif // EARLY_PHASE + +@group(0) @binding(12) var late_preprocess_work_item_indirect_parameters: + array; + +var push_constants: PushConstants; +#endif // OCCLUSION_CULLING + +#ifdef FRUSTUM_CULLING // Returns true if the view frustum intersects an oriented bounding box (OBB). // // `aabb_center.w` should be 1.0. @@ -102,9 +157,17 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { // Figure out our instance index. If this thread doesn't correspond to any // index, bail. let instance_index = global_invocation_id.x; + +#ifdef LATE_PHASE + if (instance_index >= atomicLoad(&late_preprocess_work_item_indirect_parameters[ + push_constants.late_preprocess_work_item_indirect_offset].work_item_count)) { + return; + } +#else // LATE_PHASE if (instance_index >= arrayLength(&work_items)) { return; } +#endif // Unpack the work item. 
let input_index = work_items[instance_index].input_index; @@ -115,7 +178,7 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { let world_from_local_affine_transpose = current_input[input_index].world_from_local; let world_from_local = maths::affine3_to_square(world_from_local_affine_transpose); - // Cull if necessary. + // Frustum cull if necessary. #ifdef FRUSTUM_CULLING if ((current_input[input_index].flags & MESH_FLAGS_NO_FRUSTUM_CULLING_BIT) == 0u) { let aabb_center = mesh_culling_data[input_index].aabb_center.xyz; @@ -129,6 +192,112 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { } #endif + // Look up the previous model matrix. + let previous_input_index = current_input[input_index].previous_input_index; + var previous_world_from_local_affine_transpose: mat3x4; + if (previous_input_index == 0xffffffff) { + previous_world_from_local_affine_transpose = world_from_local_affine_transpose; + } else { + previous_world_from_local_affine_transpose = + previous_input[previous_input_index].world_from_local; + } + let previous_world_from_local = + maths::affine3_to_square(previous_world_from_local_affine_transpose); + + // Occlusion cull if necessary. This is done by calculating the screen-space + // axis-aligned bounding box (AABB) of the mesh and testing it against the + // appropriate level of the depth pyramid (a.k.a. hierarchical Z-buffer). If + // no part of the AABB is in front of the corresponding pixel quad in the + // hierarchical Z-buffer, then this mesh must be occluded, and we can skip + // rendering it. +#ifdef OCCLUSION_CULLING + let aabb_center = mesh_culling_data[input_index].aabb_center.xyz; + let aabb_half_extents = mesh_culling_data[input_index].aabb_half_extents.xyz; + + // Initialize the AABB and the maximum depth. 
+ let infinity = bitcast(0x7f800000u); + let neg_infinity = bitcast(0xff800000u); + var aabb = vec4(infinity, infinity, neg_infinity, neg_infinity); + var max_depth_view = neg_infinity; + + // Build up the AABB by taking each corner of this mesh's OBB, transforming + // it, and updating the AABB and depth accordingly. + for (var i = 0u; i < 8u; i += 1u) { + let local_pos = aabb_center + select( + vec3(-1.0), + vec3(1.0), + vec3((i & 1) != 0, (i & 2) != 0, (i & 4) != 0) + ) * aabb_half_extents; + +#ifdef EARLY_PHASE + // If we're in the early phase, we're testing against the last frame's + // depth buffer, so we need to use the previous frame's transform. + let prev_world_pos = (previous_world_from_local * vec4(local_pos, 1.0)).xyz; + let view_pos = position_world_to_prev_view(prev_world_pos); + let ndc_pos = position_world_to_prev_ndc(prev_world_pos); +#else // EARLY_PHASE + // Otherwise, if this is the late phase, we use the current frame's + // transform. + let world_pos = (world_from_local * vec4(local_pos, 1.0)).xyz; + let view_pos = position_world_to_view(world_pos); + let ndc_pos = position_world_to_ndc(world_pos); +#endif // EARLY_PHASE + + let uv_pos = ndc_to_uv(ndc_pos.xy); + + // Update the AABB and maximum view-space depth. + aabb = vec4(min(aabb.xy, uv_pos), max(aabb.zw, uv_pos)); + max_depth_view = max(max_depth_view, view_pos.z); + } + + // Clip to the near plane to avoid the NDC depth becoming negative. +#ifdef EARLY_PHASE + max_depth_view = min(-previous_view_uniforms.clip_from_view[3][2], max_depth_view); +#else // EARLY_PHASE + max_depth_view = min(-view.clip_from_view[3][2], max_depth_view); +#endif // EARLY_PHASE + + // Figure out the depth of the occluder, and compare it to our own depth. 
+ + let aabb_pixel_size = occlusion_culling::get_aabb_size_in_pixels(aabb, depth_pyramid); + let occluder_depth_ndc = + occlusion_culling::get_occluder_depth(aabb, aabb_pixel_size, depth_pyramid); + +#ifdef EARLY_PHASE + let max_depth_ndc = prev_view_z_to_depth_ndc(max_depth_view); +#else // EARLY_PHASE + let max_depth_ndc = view_z_to_depth_ndc(max_depth_view); +#endif + + // Are we culled out? + if (max_depth_ndc < occluder_depth_ndc) { +#ifdef EARLY_PHASE + // If this is the early phase, we need to make a note of this mesh so + // that we examine it again in the late phase, so that we handle the + // case in which a mesh that was invisible last frame became visible in + // this frame. + let output_work_item_index = atomicAdd(&late_preprocess_work_item_indirect_parameters[ + push_constants.late_preprocess_work_item_indirect_offset].work_item_count, 1u); + if (output_work_item_index % 64u == 0u) { + // Our workgroup size is 64, and the indirect parameters for the + // late mesh preprocessing phase are counted in workgroups, so if + // we're the first thread in this workgroup, bump the workgroup + // count. + atomicAdd(&late_preprocess_work_item_indirect_parameters[ + push_constants.late_preprocess_work_item_indirect_offset].dispatch_x, 1u); + } + + // Enqueue a work item for the late prepass phase. + late_preprocess_work_items[output_work_item_index].input_index = input_index; + late_preprocess_work_items[output_work_item_index].output_index = output_index; + late_preprocess_work_items[output_work_item_index].indirect_parameters_index = + indirect_parameters_index; +#endif // EARLY_PHASE + // This mesh is culled. Skip it. + return; + } +#endif // OCCLUSION_CULLING + // Calculate inverse transpose. 
let local_from_world_transpose = transpose(maths::inverse_affine3(transpose( world_from_local_affine_transpose))); @@ -139,32 +308,34 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { vec4(local_from_world_transpose[1].yz, local_from_world_transpose[2].xy)); let local_from_world_transpose_b = local_from_world_transpose[2].z; - // Look up the previous model matrix. - let previous_input_index = current_input[input_index].previous_input_index; - var previous_world_from_local: mat3x4; - if (previous_input_index == 0xffffffff) { - previous_world_from_local = world_from_local_affine_transpose; - } else { - previous_world_from_local = previous_input[previous_input_index].world_from_local; - } - // Figure out the output index. In indirect mode, this involves bumping the // instance index in the indirect parameters metadata, which // `build_indirect_params.wgsl` will use to generate the actual indirect // parameters. Otherwise, this index was directly supplied to us. #ifdef INDIRECT +#ifdef LATE_PHASE let batch_output_index = - atomicAdd(&indirect_parameters_metadata[indirect_parameters_index].instance_count, 1u); + atomicLoad(&indirect_parameters_metadata[indirect_parameters_index].early_instance_count) + + atomicAdd(&indirect_parameters_metadata[indirect_parameters_index].late_instance_count, 1u); +#else // LATE_PHASE + let batch_output_index = atomicAdd( + &indirect_parameters_metadata[indirect_parameters_index].early_instance_count, + 1u + ); +#endif // LATE_PHASE + let mesh_output_index = indirect_parameters_metadata[indirect_parameters_index].base_output_index + batch_output_index; + #else // INDIRECT let mesh_output_index = output_index; #endif // INDIRECT // Write the output. 
output[mesh_output_index].world_from_local = world_from_local_affine_transpose; - output[mesh_output_index].previous_world_from_local = previous_world_from_local; + output[mesh_output_index].previous_world_from_local = + previous_world_from_local_affine_transpose; output[mesh_output_index].local_from_world_transpose_a = local_from_world_transpose_a; output[mesh_output_index].local_from_world_transpose_b = local_from_world_transpose_b; output[mesh_output_index].flags = current_input[input_index].flags; diff --git a/crates/bevy_pbr/src/render/mesh_preprocess_types.wgsl b/crates/bevy_pbr/src/render/mesh_preprocess_types.wgsl deleted file mode 100644 index 974a9d303a..0000000000 --- a/crates/bevy_pbr/src/render/mesh_preprocess_types.wgsl +++ /dev/null @@ -1,98 +0,0 @@ -// Types needed for GPU mesh uniform building. - -#define_import_path bevy_pbr::mesh_preprocess_types - -// Per-frame data that the CPU supplies to the GPU. -struct MeshInput { - // The model transform. - world_from_local: mat3x4, - // The lightmap UV rect, packed into 64 bits. - lightmap_uv_rect: vec2, - // A set of bitflags corresponding to `MeshFlags` on the Rust side. See the - // `MESH_FLAGS_` flags in `mesh_types.wgsl` for a list of these. - flags: u32, - // The index of this mesh's `MeshInput` in the `previous_input` array, if - // applicable. If not present, this is `u32::MAX`. - previous_input_index: u32, - // The index of the first vertex in the vertex slab. - first_vertex_index: u32, - // The index of the first vertex index in the index slab. - // - // If this mesh isn't indexed, this value is ignored. - first_index_index: u32, - // For indexed meshes, the number of indices that this mesh has; for - // non-indexed meshes, the number of vertices that this mesh consists of. - index_count: u32, - current_skin_index: u32, - previous_skin_index: u32, - // Low 16 bits: index of the material inside the bind group data. - // High 16 bits: index of the lightmap in the binding array. 
- material_and_lightmap_bind_group_slot: u32, -} - -// The `wgpu` indirect parameters structure for indexed meshes. -// -// The `build_indirect_params.wgsl` shader generates these. -struct IndirectParametersIndexed { - // The number of indices that this mesh has. - index_count: u32, - // The number of instances we are to draw. - instance_count: u32, - // The offset of the first index for this mesh in the index buffer slab. - first_index: u32, - // The offset of the first vertex for this mesh in the vertex buffer slab. - base_vertex: u32, - // The index of the first mesh instance in the `Mesh` buffer. - first_instance: u32, -} - -// The `wgpu` indirect parameters structure for non-indexed meshes. -// -// The `build_indirect_params.wgsl` shader generates these. -struct IndirectParametersNonIndexed { - // The number of vertices that this mesh has. - vertex_count: u32, - // The number of instances we are to draw. - instance_count: u32, - // The offset of the first vertex for this mesh in the vertex buffer slab. - base_vertex: u32, - // The index of the first mesh instance in the `Mesh` buffer. - first_instance: u32, -} - -// Information needed to generate the `IndirectParametersIndexed` and -// `IndirectParametersNonIndexed` draw commands. -struct IndirectParametersMetadata { - // The index of the mesh in the `MeshInput` buffer. - mesh_index: u32, - // The index of the first instance corresponding to this batch in the `Mesh` - // buffer. - base_output_index: u32, - // The index of the batch set in the `IndirectBatchSet` buffer. - batch_set_index: u32, - // The number of instances that are to be drawn. - // - // The `mesh_preprocess.wgsl` shader determines this, and the - // `build_indirect_params.wgsl` shader copies this value into the indirect - // draw command. - instance_count: atomic, -} - -// Information about each batch set. -// -// A *batch set* is a set of meshes that might be multi-drawn together. 
-// -// The CPU creates this structure, and the `build_indirect_params.wgsl` shader -// modifies it. If `multi_draw_indirect_count` is in use, the GPU reads this -// value when multi-drawing a batch set in order to determine how many commands -// make up the batch set. -struct IndirectBatchSet { - // The number of commands that make up this batch set. - // - // The CPU initializes this value to zero. The `build_indirect_params.wgsl` - // shader increments this value as it processes batches. - indirect_parameters_count: atomic, - // The offset of the first batch corresponding to this batch set within the - // `IndirectParametersIndexed` or `IndirectParametersNonIndexed` arrays. - indirect_parameters_base: u32, -} diff --git a/crates/bevy_pbr/src/render/occlusion_culling.wgsl b/crates/bevy_pbr/src/render/occlusion_culling.wgsl new file mode 100644 index 0000000000..1be999cc6a --- /dev/null +++ b/crates/bevy_pbr/src/render/occlusion_culling.wgsl @@ -0,0 +1,30 @@ +// Occlusion culling utility functions. 
+ +#define_import_path bevy_pbr::occlusion_culling + +fn get_aabb_size_in_pixels(aabb: vec4, depth_pyramid: texture_2d) -> vec2 { + let depth_pyramid_size_mip_0 = vec2(textureDimensions(depth_pyramid, 0)); + let aabb_width_pixels = (aabb.z - aabb.x) * depth_pyramid_size_mip_0.x; + let aabb_height_pixels = (aabb.w - aabb.y) * depth_pyramid_size_mip_0.y; + return vec2(aabb_width_pixels, aabb_height_pixels); +} + +fn get_occluder_depth( + aabb: vec4, + aabb_pixel_size: vec2, + depth_pyramid: texture_2d +) -> f32 { + let aabb_width_pixels = aabb_pixel_size.x; + let aabb_height_pixels = aabb_pixel_size.y; + + let depth_pyramid_size_mip_0 = vec2(textureDimensions(depth_pyramid, 0)); + let depth_level = max(0, i32(ceil(log2(max(aabb_width_pixels, aabb_height_pixels))))); // TODO: Naga doesn't like this being a u32 + let depth_pyramid_size = vec2(textureDimensions(depth_pyramid, depth_level)); + let aabb_top_left = vec2(aabb.xy * depth_pyramid_size); + + let depth_quad_a = textureLoad(depth_pyramid, aabb_top_left, depth_level).x; + let depth_quad_b = textureLoad(depth_pyramid, aabb_top_left + vec2(1u, 0u), depth_level).x; + let depth_quad_c = textureLoad(depth_pyramid, aabb_top_left + vec2(0u, 1u), depth_level).x; + let depth_quad_d = textureLoad(depth_pyramid, aabb_top_left + vec2(1u, 1u), depth_level).x; + return min(min(depth_quad_a, depth_quad_b), min(depth_quad_c, depth_quad_d)); +} diff --git a/crates/bevy_pbr/src/render/reset_indirect_batch_sets.wgsl b/crates/bevy_pbr/src/render/reset_indirect_batch_sets.wgsl new file mode 100644 index 0000000000..9309594725 --- /dev/null +++ b/crates/bevy_pbr/src/render/reset_indirect_batch_sets.wgsl @@ -0,0 +1,25 @@ +// Resets the indirect draw counts to zero. +// +// This shader is needed because we reuse the same indirect batch set count +// buffer (i.e. the buffer that gets passed to `multi_draw_indirect_count` to +// determine how many objects to draw) between phases (early, late, and main). 
+// Before launching `build_indirect_params.wgsl`, we need to reinitialize the +// value to 0. + +#import bevy_pbr::mesh_preprocess_types::IndirectBatchSet + +@group(0) @binding(0) var indirect_batch_sets: array; + +@compute +@workgroup_size(64) +fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { + // Figure out our instance index. If this thread doesn't correspond to any + // index, bail. + let instance_index = global_invocation_id.x; + if (instance_index >= arrayLength(&indirect_batch_sets)) { + return; + } + + // Reset the number of batch sets to 0. + atomicStore(&indirect_batch_sets[instance_index].indirect_parameters_count, 0u); +} diff --git a/crates/bevy_pbr/src/render/view_transformations.wgsl b/crates/bevy_pbr/src/render/view_transformations.wgsl index 63ee78a0c0..80c26d7b69 100644 --- a/crates/bevy_pbr/src/render/view_transformations.wgsl +++ b/crates/bevy_pbr/src/render/view_transformations.wgsl @@ -1,6 +1,7 @@ #define_import_path bevy_pbr::view_transformations #import bevy_pbr::mesh_view_bindings as view_bindings +#import bevy_pbr::prepass_bindings /// World space: /// +y is up @@ -93,6 +94,22 @@ fn direction_clip_to_view(clip_dir: vec4) -> vec3 { return view_dir.xyz; } +// ----------------- +// TO PREV. 
VIEW --- +// ----------------- + +fn position_world_to_prev_view(world_pos: vec3) -> vec3 { + let view_pos = prepass_bindings::previous_view_uniforms.view_from_world * + vec4(world_pos, 1.0); + return view_pos.xyz; +} + +fn position_world_to_prev_ndc(world_pos: vec3) -> vec3 { + let ndc_pos = prepass_bindings::previous_view_uniforms.clip_from_world * + vec4(world_pos, 1.0); + return ndc_pos.xyz / ndc_pos.w; +} + // ----------------- // TO CLIP --------- // ----------------- @@ -172,6 +189,19 @@ fn view_z_to_depth_ndc(view_z: f32) -> f32 { #endif } +fn prev_view_z_to_depth_ndc(view_z: f32) -> f32 { +#ifdef VIEW_PROJECTION_PERSPECTIVE + return -perspective_camera_near() / view_z; +#else ifdef VIEW_PROJECTION_ORTHOGRAPHIC + return prepass_bindings::previous_view_uniforms.clip_from_view[3][2] + + view_z * prepass_bindings::previous_view_uniforms.clip_from_view[2][2]; +#else + let ndc_pos = prepass_bindings::previous_view_uniforms.clip_from_view * + vec4(0.0, 0.0, view_z, 1.0); + return ndc_pos.z / ndc_pos.w; +#endif +} + // ----------------- // UV -------------- // ----------------- diff --git a/crates/bevy_render/src/batching/gpu_preprocessing.rs b/crates/bevy_render/src/batching/gpu_preprocessing.rs index a650951793..7effe9d1d7 100644 --- a/crates/bevy_render/src/batching/gpu_preprocessing.rs +++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs @@ -12,6 +12,8 @@ use bevy_ecs::{ world::{FromWorld, World}, }; use bevy_encase_derive::ShaderType; +use bevy_math::UVec4; +use bevy_platform_support::collections::hash_map::Entry; use bevy_utils::{default, TypeIdMap}; use bytemuck::{Pod, Zeroable}; use nonmax::NonMaxU32; @@ -19,6 +21,7 @@ use tracing::error; use wgpu::{BindingResource, BufferUsages, DownlevelFlags, Features}; use crate::{ + experimental::occlusion_culling::OcclusionCulling, render_phase::{ BinnedPhaseItem, BinnedRenderPhaseBatch, BinnedRenderPhaseBatchSet, BinnedRenderPhaseBatchSets, CachedRenderPipelinePhaseItem, PhaseItemBatchSetKey as _, @@ -33,7 
+36,15 @@ use crate::{
 
 use super::{BatchMeta, GetBatchData, GetFullBatchData};
 
-pub struct BatchingPlugin;
+#[derive(Default)]
+pub struct BatchingPlugin {
+    /// If true, this sets the `COPY_SRC` flag on indirect draw parameters so
+    /// that they can be read back to CPU.
+    ///
+    /// This is a debugging feature that may reduce performance. It primarily
+    /// exists for the `occlusion_culling` example.
+    pub allow_copies_from_indirect_parameters: bool,
+}
 
 impl Plugin for BatchingPlugin {
     fn build(&self, app: &mut App) {
@@ -42,7 +53,9 @@ impl Plugin for BatchingPlugin {
         };
 
         render_app
-            .insert_resource(IndirectParametersBuffers::new())
+            .insert_resource(IndirectParametersBuffers::new(
+                self.allow_copies_from_indirect_parameters,
+            ))
             .add_systems(
                 Render,
                 write_indirect_parameters_buffers.in_set(RenderSet::PrepareResourcesFlush),
@@ -159,6 +172,26 @@ where
     /// data input uniform is expected to contain the index of the
     /// corresponding buffer data input uniform in this list.
     pub previous_input_buffer: InstanceInputUniformBuffer,
+
+    /// A buffer that holds the number of indexed meshes that weren't visible in
+    /// the previous frame, when GPU occlusion culling is in use.
+    ///
+    /// There's one set of [`LatePreprocessWorkItemIndirectParameters`] per
+    /// view. Bevy uses this value to determine how many threads to dispatch to
+    /// check meshes that weren't visible last frame to see if they became newly
+    /// visible this frame.
+    pub late_indexed_indirect_parameters_buffer:
+        RawBufferVec,
+
+    /// A buffer that holds the number of non-indexed meshes that weren't
+    /// visible in the previous frame, when GPU occlusion culling is in use.
+    ///
+    /// There's one set of [`LatePreprocessWorkItemIndirectParameters`] per
+    /// view. Bevy uses this value to determine how many threads to dispatch to
+    /// check meshes that weren't visible last frame to see if they became newly
+    /// visible this frame.
+ pub late_non_indexed_indirect_parameters_buffer: + RawBufferVec, } /// Holds the GPU buffer of instance input data, which is the data about each @@ -260,6 +293,23 @@ where self.buffer.push(default()); } } + + /// Returns the number of instances in this buffer. + pub fn len(&self) -> usize { + self.buffer.len() + } + + /// Returns true if this buffer has no instances or false if it contains any + /// instances. + pub fn is_empty(&self) -> bool { + self.buffer.is_empty() + } + + /// Consumes this [`InstanceInputUniformBuffer`] and returns the raw buffer + /// ready to be uploaded to the GPU. + pub fn into_buffer(self) -> RawBufferVec { + self.buffer + } } impl Default for InstanceInputUniformBuffer @@ -289,25 +339,130 @@ pub enum PreprocessWorkItemBuffers { indexed: BufferVec, /// The buffer of work items corresponding to non-indexed meshes. non_indexed: BufferVec, + /// The work item buffers we use when GPU occlusion culling is in use. + gpu_occlusion_culling: Option, }, } -impl PreprocessWorkItemBuffers { - /// Creates a new set of buffers. +/// The work item buffers we use when GPU occlusion culling is in use. +pub struct GpuOcclusionCullingWorkItemBuffers { + /// The buffer of work items corresponding to indexed meshes. + pub late_indexed: UninitBufferVec, + /// The buffer of work items corresponding to non-indexed meshes. + pub late_non_indexed: UninitBufferVec, + /// The offset into the + /// [`BatchedInstanceBuffers::late_indexed_indirect_parameters_buffer`] + /// where this view's indirect dispatch counts for indexed meshes live. + pub late_indirect_parameters_indexed_offset: u32, + /// The offset into the + /// [`BatchedInstanceBuffers::late_non_indexed_indirect_parameters_buffer`] + /// where this view's indirect dispatch counts for non-indexed meshes live. + pub late_indirect_parameters_non_indexed_offset: u32, +} + +/// A GPU-side data structure that stores the number of workgroups to dispatch +/// for the second phase of GPU occlusion culling. 
+///
+/// The late mesh preprocessing phase checks meshes that weren't visible last frame
+/// to determine if they're potentially visible this frame.
+#[derive(Clone, Copy, ShaderType, Pod, Zeroable)]
+#[repr(C)]
+pub struct LatePreprocessWorkItemIndirectParameters {
+    /// The number of workgroups to dispatch.
     ///
-    /// `no_indirect_drawing` specifies whether we're drawing directly or
-    /// indirectly.
-    pub fn new(no_indirect_drawing: bool) -> Self {
-        if no_indirect_drawing {
-            PreprocessWorkItemBuffers::Direct(BufferVec::new(BufferUsages::STORAGE))
-        } else {
-            PreprocessWorkItemBuffers::Indirect {
-                indexed: BufferVec::new(BufferUsages::STORAGE),
-                non_indexed: BufferVec::new(BufferUsages::STORAGE),
+    /// This will be equal to `work_item_count / 64`, rounded *up*.
+    dispatch_x: u32,
+    /// The number of workgroups along the abstract Y axis to dispatch: always
+    /// 1.
+    dispatch_y: u32,
+    /// The number of workgroups along the abstract Z axis to dispatch: always
+    /// 1.
+    dispatch_z: u32,
+    /// The actual number of work items.
+    ///
+    /// The GPU indirect dispatch doesn't read this, but it's used internally to
+    /// determine the actual number of work items that exist in the late
+    /// preprocessing work item buffer.
+    work_item_count: u32,
+    /// Padding to 64-byte boundaries for some hardware.
+    pad: UVec4,
+}
+
+impl Default for LatePreprocessWorkItemIndirectParameters {
+    fn default() -> LatePreprocessWorkItemIndirectParameters {
+        LatePreprocessWorkItemIndirectParameters {
+            dispatch_x: 0,
+            dispatch_y: 1,
+            dispatch_z: 1,
+            work_item_count: 0,
+            pad: default(),
+        }
+    }
+}
+
+/// Returns the set of work item buffers for the given view, first creating it
+/// if necessary.
+///
+/// Bevy uses work item buffers to tell the mesh preprocessing compute shader
+/// which meshes are to be drawn.
+///
+/// You may need to call this function if you're implementing your own custom
+/// render phases. See the `specialized_mesh_pipeline` example.
+pub fn get_or_create_work_item_buffer<'a, I>( + work_item_buffers: &'a mut EntityHashMap>, + view: Entity, + no_indirect_drawing: bool, + gpu_occlusion_culling: bool, + late_indexed_indirect_parameters_buffer: &'_ mut RawBufferVec< + LatePreprocessWorkItemIndirectParameters, + >, + late_non_indexed_indirect_parameters_buffer: &'_ mut RawBufferVec< + LatePreprocessWorkItemIndirectParameters, + >, +) -> &'a mut PreprocessWorkItemBuffers +where + I: 'static, +{ + match work_item_buffers + .entry(view) + .or_default() + .entry(TypeId::of::()) + { + Entry::Occupied(occupied_entry) => occupied_entry.into_mut(), + Entry::Vacant(vacant_entry) => { + if no_indirect_drawing { + vacant_entry.insert(PreprocessWorkItemBuffers::Direct(BufferVec::new( + BufferUsages::STORAGE, + ))) + } else { + vacant_entry.insert(PreprocessWorkItemBuffers::Indirect { + indexed: BufferVec::new(BufferUsages::STORAGE), + non_indexed: BufferVec::new(BufferUsages::STORAGE), + gpu_occlusion_culling: if gpu_occlusion_culling { + let late_indirect_parameters_indexed_offset = + late_indexed_indirect_parameters_buffer + .push(LatePreprocessWorkItemIndirectParameters::default()); + let late_indirect_parameters_non_indexed_offset = + late_non_indexed_indirect_parameters_buffer + .push(LatePreprocessWorkItemIndirectParameters::default()); + Some(GpuOcclusionCullingWorkItemBuffers { + late_indexed: UninitBufferVec::new(BufferUsages::STORAGE), + late_non_indexed: UninitBufferVec::new(BufferUsages::STORAGE), + late_indirect_parameters_indexed_offset: + late_indirect_parameters_indexed_offset as u32, + late_indirect_parameters_non_indexed_offset: + late_indirect_parameters_non_indexed_offset as u32, + }) + } else { + None + }, + }) } } } +} +impl PreprocessWorkItemBuffers { /// Adds a new work item to the appropriate buffer. 
/// /// `indexed` specifies whether the work item corresponds to an indexed @@ -320,12 +475,21 @@ impl PreprocessWorkItemBuffers { PreprocessWorkItemBuffers::Indirect { indexed: ref mut indexed_buffer, non_indexed: ref mut non_indexed_buffer, + ref mut gpu_occlusion_culling, } => { if indexed { indexed_buffer.push(preprocess_work_item); } else { non_indexed_buffer.push(preprocess_work_item); } + + if let Some(ref mut gpu_occlusion_culling) = *gpu_occlusion_culling { + if indexed { + gpu_occlusion_culling.late_indexed.add(); + } else { + gpu_occlusion_culling.late_non_indexed.add(); + } + } } } } @@ -353,7 +517,7 @@ pub struct PreprocessWorkItem { /// /// This is the variant for indexed meshes. We generate the instances of this /// structure in the `build_indirect_params.wgsl` compute shader. -#[derive(Clone, Copy, Pod, Zeroable, ShaderType)] +#[derive(Clone, Copy, Debug, Pod, Zeroable, ShaderType)] #[repr(C)] pub struct IndirectParametersIndexed { /// The number of indices that this mesh has. @@ -372,7 +536,7 @@ pub struct IndirectParametersIndexed { /// /// This is the variant for non-indexed meshes. We generate the instances of /// this structure in the `build_indirect_params.wgsl` compute shader. -#[derive(Clone, Copy, Pod, Zeroable, ShaderType)] +#[derive(Clone, Copy, Debug, Pod, Zeroable, ShaderType)] #[repr(C)] pub struct IndirectParametersNonIndexed { /// The number of vertices that this mesh has. @@ -389,10 +553,11 @@ pub struct IndirectParametersNonIndexed { /// each mesh are actually to be drawn. /// /// The CPU writes to this structure in order to initialize the fields other -/// than [`Self::instance_count`]. The GPU mesh preprocessing shader increments -/// the [`Self::instance_count`] as it determines that meshes are visible. The -/// indirect parameter building shader reads this metadata in order to construct -/// the indirect draw parameters. +/// than [`Self::early_instance_count`] and [`Self::late_instance_count`]. 
The +/// GPU mesh preprocessing shader increments the [`Self::early_instance_count`] +/// and [`Self::late_instance_count`] as it determines that meshes are visible. +/// The indirect parameter building shader reads this metadata in order to +/// construct the indirect draw parameters. /// /// Each batch will have one instance of this structure. #[derive(Clone, Copy, Default, Pod, Zeroable, ShaderType)] @@ -419,11 +584,18 @@ pub struct IndirectParametersMetadata { /// set. pub batch_set_index: u32, - /// The number of instances that have been judged potentially visible. + /// The number of instances that were judged visible last frame. /// /// The CPU sets this value to 0, and the GPU mesh preprocessing shader /// increments it as it culls mesh instances. - pub instance_count: u32, + pub early_instance_count: u32, + + /// The number of instances that have been judged potentially visible this + /// frame that weren't in the last frame's potentially visible set. + /// + /// The CPU sets this value to 0, and the GPU mesh preprocessing shader + /// increments it as it culls mesh instances. + pub late_instance_count: u32, } /// A structure, shared between CPU and GPU, that holds the number of on-GPU @@ -520,16 +692,19 @@ pub struct IndirectParametersBuffers { impl IndirectParametersBuffers { /// Creates the indirect parameters buffers. 
- pub fn new() -> IndirectParametersBuffers { + pub fn new(allow_copies_from_indirect_parameter_buffers: bool) -> IndirectParametersBuffers { + let mut indirect_parameter_buffer_usages = BufferUsages::STORAGE | BufferUsages::INDIRECT; + if allow_copies_from_indirect_parameter_buffers { + indirect_parameter_buffer_usages |= BufferUsages::COPY_SRC; + } + IndirectParametersBuffers { - non_indexed_data: UninitBufferVec::new(BufferUsages::STORAGE | BufferUsages::INDIRECT), + non_indexed_data: UninitBufferVec::new(indirect_parameter_buffer_usages), non_indexed_metadata: RawBufferVec::new(BufferUsages::STORAGE), - non_indexed_batch_sets: RawBufferVec::new( - BufferUsages::STORAGE | BufferUsages::INDIRECT, - ), - indexed_data: UninitBufferVec::new(BufferUsages::STORAGE | BufferUsages::INDIRECT), + non_indexed_batch_sets: RawBufferVec::new(indirect_parameter_buffer_usages), + indexed_data: UninitBufferVec::new(indirect_parameter_buffer_usages), indexed_metadata: RawBufferVec::new(BufferUsages::STORAGE), - indexed_batch_sets: RawBufferVec::new(BufferUsages::STORAGE | BufferUsages::INDIRECT), + indexed_batch_sets: RawBufferVec::new(indirect_parameter_buffer_usages), } } @@ -617,9 +792,9 @@ impl IndirectParametersBuffers { /// Reserves space for `count` new batches corresponding to non-indexed /// meshes. /// - /// This allocates in both the [`Self::non_indexed_metadata`] and - /// [`Self::non_indexed_data`] buffers. - fn allocate_non_indexed(&mut self, count: u32) -> u32 { + /// This allocates in both the `non_indexed_metadata` and `non_indexed_data` + /// buffers. 
+ pub fn allocate_non_indexed(&mut self, count: u32) -> u32 { let length = self.non_indexed_data.len(); self.non_indexed_metadata.reserve_internal(count as usize); for _ in 0..count { @@ -712,11 +887,17 @@ impl IndirectParametersBuffers { }); } } + + pub fn get_next_batch_set_index(&self, indexed: bool) -> Option { + NonMaxU32::new(self.batch_set_count(indexed) as u32) + } } impl Default for IndirectParametersBuffers { fn default() -> Self { - Self::new() + // By default, we don't allow GPU indirect parameter mapping, since + // that's a debugging option. + Self::new(false) } } @@ -755,7 +936,7 @@ impl FromWorld for GpuPreprocessingSupport { impl BatchedInstanceBuffers where BD: GpuArrayBufferable + Sync + Send + 'static, - BDI: Pod + Default, + BDI: Pod + Sync + Send + Default + 'static, { /// Creates new buffers. pub fn new() -> Self { @@ -764,6 +945,12 @@ where work_item_buffers: EntityHashMap::default(), current_input_buffer: InstanceInputUniformBuffer::new(), previous_input_buffer: InstanceInputUniformBuffer::new(), + late_indexed_indirect_parameters_buffer: RawBufferVec::new( + BufferUsages::STORAGE | BufferUsages::INDIRECT, + ), + late_non_indexed_indirect_parameters_buffer: RawBufferVec::new( + BufferUsages::STORAGE | BufferUsages::INDIRECT, + ), } } @@ -779,28 +966,16 @@ where /// Clears out the buffers in preparation for a new frame. 
pub fn clear(&mut self) { self.data_buffer.clear(); - - for view_work_item_buffers in self.work_item_buffers.values_mut() { - for phase_work_item_buffers in view_work_item_buffers.values_mut() { - match *phase_work_item_buffers { - PreprocessWorkItemBuffers::Direct(ref mut buffer_vec) => buffer_vec.clear(), - PreprocessWorkItemBuffers::Indirect { - ref mut indexed, - ref mut non_indexed, - } => { - indexed.clear(); - non_indexed.clear(); - } - } - } - } + self.late_indexed_indirect_parameters_buffer.clear(); + self.late_non_indexed_indirect_parameters_buffer.clear(); + self.work_item_buffers.clear(); } } impl Default for BatchedInstanceBuffers where BD: GpuArrayBufferable + Sync + Send + 'static, - BDI: Pod + Default, + BDI: Pod + Default + Sync + Send + 'static, { fn default() -> Self { Self::new() @@ -856,9 +1031,14 @@ where let (batch_range, batch_extra_index) = phase.items[self.phase_item_start_index as usize].batch_range_and_extra_index_mut(); *batch_range = self.instance_start_index..instance_end_index; - *batch_extra_index = - PhaseItemExtraIndex::maybe_indirect_parameters_index(self.indirect_parameters_index); - + *batch_extra_index = match self.indirect_parameters_index { + Some(indirect_parameters_index) => PhaseItemExtraIndex::IndirectParametersIndex { + range: u32::from(indirect_parameters_index) + ..(u32::from(indirect_parameters_index) + 1), + batch_set_index: None, + }, + None => PhaseItemExtraIndex::None, + }; if let Some(indirect_parameters_index) = self.indirect_parameters_index { indirect_parameters_buffers .add_batch_set(self.indexed, indirect_parameters_index.into()); @@ -910,7 +1090,12 @@ pub fn batch_and_prepare_sorted_render_phase( gpu_array_buffer: ResMut>, mut indirect_parameters_buffers: ResMut, mut sorted_render_phases: ResMut>, - mut views: Query<(Entity, &ExtractedView, Has)>, + mut views: Query<( + Entity, + &ExtractedView, + Has, + Has, + )>, system_param_item: StaticSystemParam, ) where I: CachedRenderPipelinePhaseItem + 
SortedPhaseItem, @@ -920,20 +1105,25 @@ pub fn batch_and_prepare_sorted_render_phase( let BatchedInstanceBuffers { ref mut data_buffer, ref mut work_item_buffers, + ref mut late_indexed_indirect_parameters_buffer, + ref mut late_non_indexed_indirect_parameters_buffer, .. } = gpu_array_buffer.into_inner(); - for (view, extracted_view, no_indirect_drawing) in &mut views { + for (view, extracted_view, no_indirect_drawing, gpu_occlusion_culling) in &mut views { let Some(phase) = sorted_render_phases.get_mut(&extracted_view.retained_view_entity) else { continue; }; // Create the work item buffer if necessary. - let work_item_buffer = work_item_buffers - .entry(view) - .or_insert_with(TypeIdMap::default) - .entry(TypeId::of::()) - .or_insert_with(|| PreprocessWorkItemBuffers::new(no_indirect_drawing)); + let work_item_buffer = get_or_create_work_item_buffer::( + work_item_buffers, + view, + no_indirect_drawing, + gpu_occlusion_culling, + late_indexed_indirect_parameters_buffer, + late_non_indexed_indirect_parameters_buffer, + ); // Walk through the list of phase items, building up batches as we go. let mut batch: Option> = None; @@ -1056,7 +1246,15 @@ pub fn batch_and_prepare_binned_render_phase( gpu_array_buffer: ResMut>, mut indirect_parameters_buffers: ResMut, mut binned_render_phases: ResMut>, - mut views: Query<(Entity, &ExtractedView, Has)>, + mut views: Query< + ( + Entity, + &ExtractedView, + Has, + Has, + ), + With, + >, param: StaticSystemParam, ) where BPI: BinnedPhaseItem, @@ -1067,21 +1265,26 @@ pub fn batch_and_prepare_binned_render_phase( let BatchedInstanceBuffers { ref mut data_buffer, ref mut work_item_buffers, + ref mut late_indexed_indirect_parameters_buffer, + ref mut late_non_indexed_indirect_parameters_buffer, .. 
} = gpu_array_buffer.into_inner(); - for (view, extracted_view, no_indirect_drawing) in &mut views { + for (view, extracted_view, no_indirect_drawing, gpu_occlusion_culling) in &mut views { let Some(phase) = binned_render_phases.get_mut(&extracted_view.retained_view_entity) else { continue; }; // Create the work item buffer if necessary; otherwise, just mark it as // used this frame. - let work_item_buffer = work_item_buffers - .entry(view) - .or_insert_with(TypeIdMap::default) - .entry(TypeId::of::()) - .or_insert_with(|| PreprocessWorkItemBuffers::new(no_indirect_drawing)); + let work_item_buffer = get_or_create_work_item_buffer::( + work_item_buffers, + view, + no_indirect_drawing, + gpu_occlusion_culling, + late_indexed_indirect_parameters_buffer, + late_non_indexed_indirect_parameters_buffer, + ); // Prepare multidrawables. @@ -1125,10 +1328,9 @@ pub fn batch_and_prepare_binned_render_phase( // Start a new batch, in indirect mode. let indirect_parameters_index = indirect_parameters_buffers.allocate(batch_set_key.indexed(), 1); - let batch_set_index = NonMaxU32::new( - indirect_parameters_buffers.batch_set_count(batch_set_key.indexed()) - as u32, - ); + let batch_set_index = indirect_parameters_buffers + .get_next_batch_set_index(batch_set_key.indexed()); + GFBD::write_batch_indirect_parameters_metadata( input_index.into(), batch_set_key.indexed(), @@ -1234,9 +1436,9 @@ pub fn batch_and_prepare_binned_render_phase( // Start a new batch, in indirect mode. 
let indirect_parameters_index = indirect_parameters_buffers.allocate(key.0.indexed(), 1); - let batch_set_index = NonMaxU32::new( - indirect_parameters_buffers.batch_set_count(key.0.indexed()) as u32, - ); + let batch_set_index = + indirect_parameters_buffers.get_next_batch_set_index(key.0.indexed()); + GFBD::write_batch_indirect_parameters_metadata( input_index.into(), key.0.indexed(), @@ -1256,9 +1458,10 @@ pub fn batch_and_prepare_binned_render_phase( batch = Some(BinnedRenderPhaseBatch { representative_entity: (entity, main_entity), instance_range: output_index..output_index + 1, - extra_index: PhaseItemExtraIndex::maybe_indirect_parameters_index( - NonMaxU32::new(indirect_parameters_index), - ), + extra_index: PhaseItemExtraIndex::IndirectParametersIndex { + range: indirect_parameters_index..(indirect_parameters_index + 1), + batch_set_index: None, + }, }); } @@ -1396,6 +1599,8 @@ pub fn write_batched_instance_buffers( ref mut work_item_buffers, ref mut current_input_buffer, ref mut previous_input_buffer, + ref mut late_indexed_indirect_parameters_buffer, + ref mut late_non_indexed_indirect_parameters_buffer, } = gpu_array_buffer.into_inner(); data_buffer.write_buffer(&render_device); @@ -1405,6 +1610,8 @@ pub fn write_batched_instance_buffers( previous_input_buffer .buffer .write_buffer(&render_device, &render_queue); + late_indexed_indirect_parameters_buffer.write_buffer(&render_device, &render_queue); + late_non_indexed_indirect_parameters_buffer.write_buffer(&render_device, &render_queue); for view_work_item_buffers in work_item_buffers.values_mut() { for phase_work_item_buffers in view_work_item_buffers.values_mut() { @@ -1415,9 +1622,25 @@ pub fn write_batched_instance_buffers( PreprocessWorkItemBuffers::Indirect { ref mut indexed, ref mut non_indexed, + ref mut gpu_occlusion_culling, } => { indexed.write_buffer(&render_device, &render_queue); non_indexed.write_buffer(&render_device, &render_queue); + + if let Some(GpuOcclusionCullingWorkItemBuffers { + 
ref mut late_indexed, + ref mut late_non_indexed, + late_indirect_parameters_indexed_offset: _, + late_indirect_parameters_non_indexed_offset: _, + }) = *gpu_occlusion_culling + { + if !late_indexed.is_empty() { + late_indexed.write_buffer(&render_device); + } + if !late_non_indexed.is_empty() { + late_non_indexed.write_buffer(&render_device); + } + } } } } diff --git a/crates/bevy_render/src/experimental/mod.rs b/crates/bevy_render/src/experimental/mod.rs new file mode 100644 index 0000000000..40bb6cf1dc --- /dev/null +++ b/crates/bevy_render/src/experimental/mod.rs @@ -0,0 +1,6 @@ +//! Experimental rendering features. +//! +//! Experimental features are features with known problems, but are included +//! nonetheless for testing purposes. + +pub mod occlusion_culling; diff --git a/crates/bevy_render/src/experimental/occlusion_culling/mesh_preprocess_types.wgsl b/crates/bevy_render/src/experimental/occlusion_culling/mesh_preprocess_types.wgsl new file mode 100644 index 0000000000..7f4dd71f61 --- /dev/null +++ b/crates/bevy_render/src/experimental/occlusion_culling/mesh_preprocess_types.wgsl @@ -0,0 +1,60 @@ +// Types needed for GPU mesh uniform building. + +#define_import_path bevy_pbr::mesh_preprocess_types + +// Per-frame data that the CPU supplies to the GPU. +struct MeshInput { + // The model transform. + world_from_local: mat3x4, + // The lightmap UV rect, packed into 64 bits. + lightmap_uv_rect: vec2, + // Various flags. + flags: u32, + previous_input_index: u32, + first_vertex_index: u32, + first_index_index: u32, + index_count: u32, + current_skin_index: u32, + previous_skin_index: u32, + // Low 16 bits: index of the material inside the bind group data. + // High 16 bits: index of the lightmap in the binding array. + material_and_lightmap_bind_group_slot: u32, + pad_a: u32, + pad_b: u32, +} + +// The `wgpu` indirect parameters structure. This is a union of two structures. +// For more information, see the corresponding comment in +// `gpu_preprocessing.rs`. 
+struct IndirectParametersIndexed { + // `vertex_count` or `index_count`. + index_count: u32, + // `instance_count` in both structures. + instance_count: u32, + // `first_vertex` or `first_index`. + first_index: u32, + // `base_vertex` or `first_instance`. + base_vertex: u32, + // A read-only copy of `instance_index`. + first_instance: u32, +} + +struct IndirectParametersNonIndexed { + vertex_count: u32, + instance_count: u32, + base_vertex: u32, + first_instance: u32, +} + +struct IndirectParametersMetadata { + mesh_index: u32, + base_output_index: u32, + batch_set_index: u32, + early_instance_count: atomic, + late_instance_count: atomic, +} + +struct IndirectBatchSet { + indirect_parameters_count: atomic, + indirect_parameters_base: u32, +} diff --git a/crates/bevy_render/src/experimental/occlusion_culling/mod.rs b/crates/bevy_render/src/experimental/occlusion_culling/mod.rs new file mode 100644 index 0000000000..1c7fae1515 --- /dev/null +++ b/crates/bevy_render/src/experimental/occlusion_culling/mod.rs @@ -0,0 +1,87 @@ +//! GPU occlusion culling. +//! +//! See [`OcclusionCulling`] for a detailed description of occlusion culling in +//! Bevy. + +use bevy_app::{App, Plugin}; +use bevy_asset::{load_internal_asset, Handle}; +use bevy_ecs::{component::Component, prelude::ReflectComponent}; +use bevy_reflect::{prelude::ReflectDefault, Reflect}; + +use crate::{extract_component::ExtractComponent, render_resource::Shader}; + +/// The handle to the `mesh_preprocess_types.wgsl` compute shader. +pub const MESH_PREPROCESS_TYPES_SHADER_HANDLE: Handle = + Handle::weak_from_u128(2720440370122465935); + +/// Enables GPU occlusion culling. +/// +/// See [`OcclusionCulling`] for a detailed description of occlusion culling in +/// Bevy. 
+pub struct OcclusionCullingPlugin; + +impl Plugin for OcclusionCullingPlugin { + fn build(&self, app: &mut App) { + load_internal_asset!( + app, + MESH_PREPROCESS_TYPES_SHADER_HANDLE, + "mesh_preprocess_types.wgsl", + Shader::from_wgsl + ); + } +} + +/// Add this component to a view in order to enable experimental GPU occlusion +/// culling. +/// +/// *Bevy's occlusion culling is currently marked as experimental.* There are +/// known issues whereby, in rare circumstances, occlusion culling can result in +/// meshes being culled that shouldn't be (i.e. meshes that turn invisible). +/// Please try it out and report issues. +/// +/// *Occlusion culling* allows Bevy to avoid rendering objects that are fully +/// behind other opaque or alpha tested objects. This is different from, and +/// complements, depth fragment rejection as the `DepthPrepass` enables. While +/// depth rejection allows Bevy to avoid rendering *pixels* that are behind +/// other objects, the GPU still has to examine those pixels to reject them, +/// which requires transforming the vertices of the objects and performing +/// skinning if the objects were skinned. Occlusion culling allows the GPU to go +/// a step further, avoiding even transforming the vertices of objects that it +/// can quickly prove to be behind other objects. +/// +/// Occlusion culling inherently has some overhead, because Bevy must examine +/// the objects' bounding boxes, and create an acceleration structure +/// (hierarchical Z-buffer) to perform the occlusion tests. Therefore, occlusion +/// culling is disabled by default. Only enable it if you measure it to be a +/// speedup on your scene. Note that, because Bevy's occlusion culling runs on +/// the GPU and is quite efficient, it's rare for occlusion culling to result in +/// a significant slowdown. +/// +/// Occlusion culling currently requires a `DepthPrepass`. If no depth prepass +/// is present on the view, the [`OcclusionCulling`] component will be ignored. 
+/// Additionally, occlusion culling is currently incompatible with deferred +/// shading; including both `DeferredPrepass` and [`OcclusionCulling`] results +/// in unspecified behavior. +/// +/// The algorithm that Bevy uses is known as [*two-phase occlusion culling*]. +/// When you enable occlusion culling, Bevy splits the depth prepass into two: +/// an *early* depth prepass and a *late* depth prepass. The early depth prepass +/// renders all the meshes that were visible last frame to produce a +/// conservative approximation of the depth buffer. Then, after producing an +/// acceleration structure known as a hierarchical Z-buffer or depth pyramid, +/// Bevy tests the bounding boxes of all meshes against that depth buffer. Those +/// that can be quickly proven to be behind the geometry rendered during the +/// early depth prepass are skipped entirely. The other potentially-visible +/// meshes are rendered during the late prepass, and finally all the visible +/// meshes are rendered as usual during the opaque, transparent, etc. passes. +/// +/// Unlike other occlusion culling systems you may be familiar with, Bevy's +/// occlusion culling is fully dynamic and requires no baking step. The CPU +/// overhead is minimal. Large skinned meshes and other dynamic objects can +/// occlude other objects. 
+/// +/// [*two-phase occlusion culling*]: +/// https://medium.com/@mil_kru/two-pass-occlusion-culling-4100edcad501 +#[derive(Component, ExtractComponent, Clone, Copy, Default, Reflect)] +#[reflect(Component, Default)] +pub struct OcclusionCulling; diff --git a/crates/bevy_render/src/lib.rs b/crates/bevy_render/src/lib.rs index f98efcf1e7..0f9c504ede 100644 --- a/crates/bevy_render/src/lib.rs +++ b/crates/bevy_render/src/lib.rs @@ -23,6 +23,7 @@ pub mod alpha; pub mod batching; pub mod camera; pub mod diagnostic; +pub mod experimental; pub mod extract_component; pub mod extract_instances; mod extract_param; @@ -73,6 +74,7 @@ use bevy_utils::prelude::default; pub use extract_param::Extract; use bevy_window::{PrimaryWindow, RawHandleWrapperHolder}; +use experimental::occlusion_culling::OcclusionCullingPlugin; use extract_resource::ExtractResourcePlugin; use globals::GlobalsPlugin; use render_asset::RenderAssetBytesPerFrame; @@ -115,6 +117,12 @@ pub struct RenderPlugin { /// If `true`, disables asynchronous pipeline compilation. /// This has no effect on macOS, Wasm, iOS, or without the `multi_threaded` feature. pub synchronous_pipeline_compilation: bool, + /// If true, this sets the `COPY_SRC` flag on indirect draw parameters so + /// that they can be read back to CPU. + /// + /// This is a debugging feature that may reduce performance. It primarily + /// exists for the `occlusion_culling` example. + pub allow_copies_from_indirect_parameters: bool, } /// The systems sets of the default [`App`] rendering schedule. 
@@ -357,10 +365,13 @@ impl Plugin for RenderPlugin { MeshPlugin, GlobalsPlugin, MorphPlugin, - BatchingPlugin, + BatchingPlugin { + allow_copies_from_indirect_parameters: self.allow_copies_from_indirect_parameters, + }, SyncWorldPlugin, StoragePlugin, GpuReadbackPlugin::default(), + OcclusionCullingPlugin, )); app.init_resource::() diff --git a/crates/bevy_render/src/render_graph/context.rs b/crates/bevy_render/src/render_graph/context.rs index c27f269d0b..4c6ecd30c1 100644 --- a/crates/bevy_render/src/render_graph/context.rs +++ b/crates/bevy_render/src/render_graph/context.rs @@ -3,10 +3,10 @@ use crate::{ render_resource::{Buffer, Sampler, TextureView}, }; use alloc::borrow::Cow; -use bevy_ecs::entity::Entity; +use bevy_ecs::{entity::Entity, intern::Interned}; use thiserror::Error; -use super::{InternedRenderSubGraph, RenderSubGraph}; +use super::{InternedRenderSubGraph, RenderLabel, RenderSubGraph}; /// A command that signals the graph runner to run the sub graph corresponding to the `sub_graph` /// with the specified `inputs` next. @@ -224,6 +224,11 @@ impl<'a> RenderGraphContext<'a> { Ok(()) } + /// Returns a human-readable label for this node, for debugging purposes. + pub fn label(&self) -> Interned { + self.node.label + } + /// Finishes the context for this [`Node`](super::Node) by /// returning the sub graphs to run next. 
pub fn finish(self) -> Vec { diff --git a/crates/bevy_render/src/render_resource/buffer_vec.rs b/crates/bevy_render/src/render_resource/buffer_vec.rs index 8191671c15..d3f16ac341 100644 --- a/crates/bevy_render/src/render_resource/buffer_vec.rs +++ b/crates/bevy_render/src/render_resource/buffer_vec.rs @@ -202,6 +202,18 @@ impl RawBufferVec { } } +impl RawBufferVec +where + T: NoUninit + Default, +{ + pub fn grow_set(&mut self, index: u32, value: T) { + while index as usize + 1 > self.len() { + self.values.push(T::default()); + } + self.values[index as usize] = value; + } +} + impl Extend for RawBufferVec { #[inline] fn extend>(&mut self, iter: I) { diff --git a/crates/bevy_render/src/view/mod.rs b/crates/bevy_render/src/view/mod.rs index 57d7562c45..a3a5118c30 100644 --- a/crates/bevy_render/src/view/mod.rs +++ b/crates/bevy_render/src/view/mod.rs @@ -10,6 +10,7 @@ use crate::{ CameraMainTextureUsages, ClearColor, ClearColorConfig, Exposure, ExtractedCamera, ManualTextureViews, MipBias, NormalizedRenderTarget, TemporalJitter, }, + experimental::occlusion_culling::OcclusionCulling, extract_component::ExtractComponentPlugin, prelude::Shader, primitives::Frustum, @@ -109,9 +110,11 @@ impl Plugin for ViewPlugin { .register_type::() .register_type::() .register_type::() + .register_type::() // NOTE: windows.is_changed() handles cases where a window was resized .add_plugins(( ExtractComponentPlugin::::default(), + ExtractComponentPlugin::::default(), VisibilityPlugin, VisibilityRangePlugin, )); diff --git a/crates/bevy_sprite/src/mesh2d/material.rs b/crates/bevy_sprite/src/mesh2d/material.rs index 3566c74a5e..e44b27a005 100644 --- a/crates/bevy_sprite/src/mesh2d/material.rs +++ b/crates/bevy_sprite/src/mesh2d/material.rs @@ -17,6 +17,7 @@ use bevy_ecs::{ }; use bevy_math::FloatOrd; use bevy_reflect::{prelude::ReflectDefault, Reflect}; +use bevy_render::view::RenderVisibleEntities; use bevy_render::{ mesh::{MeshVertexBufferLayoutRef, RenderMesh}, render_asset::{ @@ -34,7 
+35,7 @@ use bevy_render::{ }, renderer::RenderDevice, sync_world::{MainEntity, MainEntityHashMap}, - view::{ExtractedView, Msaa, RenderVisibleEntities, ViewVisibility}, + view::{ExtractedView, Msaa, ViewVisibility}, Extract, ExtractSchedule, Render, RenderApp, RenderSet, }; use core::{hash::Hash, marker::PhantomData}; diff --git a/crates/bevy_sprite/src/mesh2d/mesh.rs b/crates/bevy_sprite/src/mesh2d/mesh.rs index 52309fd492..86d02d5680 100644 --- a/crates/bevy_sprite/src/mesh2d/mesh.rs +++ b/crates/bevy_sprite/src/mesh2d/mesh.rs @@ -421,7 +421,8 @@ impl GetFullBatchData for Mesh2dPipeline { None => !0, Some(batch_set_index) => u32::from(batch_set_index), }, - instance_count: 0, + early_instance_count: 0, + late_instance_count: 0, }; if indexed { diff --git a/examples/3d/occlusion_culling.rs b/examples/3d/occlusion_culling.rs new file mode 100644 index 0000000000..11bdde698a --- /dev/null +++ b/examples/3d/occlusion_culling.rs @@ -0,0 +1,669 @@ +//! Demonstrates occlusion culling. +//! +//! This demo rotates many small cubes around a rotating large cube at the +//! origin. At all times, the large cube will be occluding several of the small +//! cubes. The demo displays the number of cubes that were actually rendered, so +//! the effects of occlusion culling can be seen. 
+ +use std::{ + f32::consts::PI, + fmt::Write as _, + result::Result, + sync::{Arc, Mutex}, +}; + +use bevy::{ + color::palettes::css::{SILVER, WHITE}, + core_pipeline::{ + core_3d::graph::{Core3d, Node3d}, + prepass::DepthPrepass, + }, + prelude::*, + render::{ + batching::gpu_preprocessing::{ + GpuPreprocessingMode, GpuPreprocessingSupport, IndirectParametersBuffers, + IndirectParametersIndexed, + }, + experimental::occlusion_culling::OcclusionCulling, + render_graph::{self, NodeRunError, RenderGraphApp, RenderGraphContext, RenderLabel}, + render_resource::{Buffer, BufferDescriptor, BufferUsages, MapMode}, + renderer::{RenderAdapter, RenderContext, RenderDevice}, + settings::WgpuFeatures, + Render, RenderApp, RenderPlugin, RenderSet, + }, +}; +use bytemuck::Pod; + +/// The radius of the spinning sphere of cubes. +const OUTER_RADIUS: f32 = 3.0; + +/// The density of cubes in the outer sphere. +const OUTER_SUBDIVISION_COUNT: u32 = 5; + +/// The speed at which the outer sphere and large cube rotate in radians per +/// frame. +const ROTATION_SPEED: f32 = 0.01; + +/// The length of each side of the small cubes, in meters. +const SMALL_CUBE_SIZE: f32 = 0.1; + +/// The length of each side of the large cube, in meters. +const LARGE_CUBE_SIZE: f32 = 2.0; + +/// A marker component for the immediate parent of the large sphere of cubes. +#[derive(Default, Component)] +struct SphereParent; + +/// A marker component for the large spinning cube at the origin. +#[derive(Default, Component)] +struct LargeCube; + +/// A plugin for the render app that reads the number of culled meshes from the +/// GPU back to the CPU. +struct ReadbackIndirectParametersPlugin; + +/// The node that we insert into the render graph in order to read the number of +/// culled meshes from the GPU back to the CPU. +#[derive(Default)] +struct ReadbackIndirectParametersNode; + +/// The [`RenderLabel`] that we use to identify the +/// [`ReadbackIndirectParametersNode`]. 
+#[derive(Clone, PartialEq, Eq, Hash, Debug, RenderLabel)] +struct ReadbackIndirectParameters; + +/// The intermediate staging buffers that we use to read back the indirect +/// parameters from the GPU to the CPU. +/// +/// We read back the GPU indirect parameters so that we can determine the number +/// of meshes that were culled. +/// +/// `wgpu` doesn't allow us to read indirect buffers back from the GPU to the +/// CPU directly. Instead, we have to copy them to a temporary staging buffer +/// first, and then read *those* buffers back from the GPU to the CPU. This +/// resource holds those temporary buffers. +#[derive(Resource, Default)] +struct IndirectParametersStagingBuffers { + /// The buffer that stores the indirect draw commands. + /// + /// See [`IndirectParametersIndexed`] for more information about the memory + /// layout of this buffer. + data: Option, + /// The buffer that stores the *number* of indirect draw commands. + /// + /// We only care about the first `u32` in this buffer. + batch_sets: Option, +} + +/// A resource, shared between the main world and the render world, that saves a +/// CPU-side copy of the GPU buffer that stores the indirect draw parameters. +/// +/// This is needed so that we can display the number of meshes that were culled. +/// It's reference counted, and protected by a lock, because we don't precisely +/// know when the GPU will be ready to present the CPU with the buffer copy. +/// Even though the rendering runs at least a frame ahead of the main app logic, +/// we don't require more precise synchronization than the lock because we don't +/// really care how up-to-date the counter of culled meshes is. If it's off by a +/// few frames, that's no big deal. +#[derive(Clone, Resource, Deref, DerefMut)] +struct SavedIndirectParameters(Arc>); + +/// A CPU-side copy of the GPU buffer that stores the indirect draw parameters. +/// +/// This is needed so that we can display the number of meshes that were culled. 
+struct SavedIndirectParametersData { + /// The CPU-side copy of the GPU buffer that stores the indirect draw + /// parameters. + data: Vec, + /// The CPU-side copy of the GPU buffer that stores the *number* of indirect + /// draw parameters that we have. + /// + /// All we care about is the number of indirect draw parameters for a single + /// view, so this is only one word in size. + count: u32, + /// True if occlusion culling is supported at all; false if it's not. + occlusion_culling_supported: bool, + /// True if we support inspecting the number of meshes that were culled on + /// this platform; false if we don't. + /// + /// If `multi_draw_indirect_count` isn't supported, then we would have to + /// employ a more complicated approach in order to determine the number of + /// meshes that are occluded, and that would be out of scope for this + /// example. + occlusion_culling_introspection_supported: bool, +} + +impl FromWorld for SavedIndirectParameters { + fn from_world(world: &mut World) -> SavedIndirectParameters { + let render_adapter = world.resource::(); + SavedIndirectParameters(Arc::new(Mutex::new(SavedIndirectParametersData { + data: vec![], + count: 0, + // This gets set to false in `readback_indirect_buffers` if we don't + // support GPU preprocessing. + occlusion_culling_supported: true, + // In order to determine how many meshes were culled, we look at the + // indirect count buffer that Bevy only populates if the platform + // supports `multi_draw_indirect_count`. So, if we don't have that + // feature, then we don't bother to display how many meshes were + // culled. + occlusion_culling_introspection_supported: render_adapter + .features() + .contains(WgpuFeatures::MULTI_DRAW_INDIRECT_COUNT), + }))) + } +} + +/// The demo's current settings. +#[derive(Resource)] +struct AppStatus { + /// Whether occlusion culling is presently enabled. + /// + /// By default, this is set to true. 
+ occlusion_culling: bool, +} + +impl Default for AppStatus { + fn default() -> Self { + AppStatus { + occlusion_culling: true, + } + } +} + +fn main() { + App::new() + .add_plugins( + DefaultPlugins + .set(WindowPlugin { + primary_window: Some(Window { + title: "Bevy Occlusion Culling Example".into(), + ..default() + }), + ..default() + }) + .set(RenderPlugin { + allow_copies_from_indirect_parameters: true, + ..default() + }), + ) + .add_plugins(ReadbackIndirectParametersPlugin) + .init_resource::() + .add_systems(Startup, setup) + .add_systems(Update, spin_small_cubes) + .add_systems(Update, spin_large_cube) + .add_systems(Update, update_status_text) + .add_systems(Update, toggle_occlusion_culling_on_request) + .run(); +} + +impl Plugin for ReadbackIndirectParametersPlugin { + fn build(&self, app: &mut App) { + // Fetch the render app. + let Some(render_app) = app.get_sub_app_mut(RenderApp) else { + return; + }; + + render_app + .init_resource::() + .add_systems(ExtractSchedule, readback_indirect_parameters) + .add_systems( + Render, + create_indirect_parameters_staging_buffers.in_set(RenderSet::PrepareResourcesFlush), + ) + // Add the node that allows us to read the indirect parameters back + // from the GPU to the CPU, which allows us to determine how many + // meshes were culled. + .add_render_graph_node::( + Core3d, + ReadbackIndirectParameters, + ) + // We read back the indirect parameters any time after + // `EndMainPass`. Readback doesn't particularly need to execute + // before `EndMainPassPostProcessing`, but we specify that anyway + // because we want to make the indirect parameters run before + // *something* in the graph, and `EndMainPassPostProcessing` is as + // good a node as any other. 
+ .add_render_graph_edges( + Core3d, + ( + Node3d::EndMainPass, + ReadbackIndirectParameters, + Node3d::EndMainPassPostProcessing, + ), + ); + } + + fn finish(&self, app: &mut App) { + // Create the `SavedIndirectParameters` resource that we're going to use + // to communicate between the thread that the GPU-to-CPU readback + // callback runs on and the main application threads. This resource is + // atomically reference counted. We store one reference to the + // `SavedIndirectParameters` in the main app and another reference in + // the render app. + let saved_indirect_parameters = SavedIndirectParameters::from_world(app.world_mut()); + app.insert_resource(saved_indirect_parameters.clone()); + + // Fetch the render app. + let Some(render_app) = app.get_sub_app_mut(RenderApp) else { + return; + }; + + render_app + // Insert another reference to the `SavedIndirectParameters`. + .insert_resource(saved_indirect_parameters); + } +} + +/// Spawns all the objects in the scene. +fn setup( + mut commands: Commands, + asset_server: Res, + mut meshes: ResMut>, + mut materials: ResMut>, +) { + spawn_small_cubes(&mut commands, &mut meshes, &mut materials); + spawn_large_cube(&mut commands, &asset_server, &mut meshes, &mut materials); + spawn_light(&mut commands); + spawn_camera(&mut commands); + spawn_help_text(&mut commands); +} + +/// Spawns the rotating sphere of small cubes. +fn spawn_small_cubes( + commands: &mut Commands, + meshes: &mut Assets, + materials: &mut Assets, +) { + // Add the cube mesh. + let small_cube = meshes.add(Cuboid::new( + SMALL_CUBE_SIZE, + SMALL_CUBE_SIZE, + SMALL_CUBE_SIZE, + )); + + // Add the cube material. + let small_cube_material = materials.add(StandardMaterial { + base_color: SILVER.into(), + ..default() + }); + + // Create the entity that the small cubes will be parented to. This is the + // entity that we rotate. 
+ let sphere_parent = commands + .spawn(Transform::from_translation(Vec3::ZERO)) + .insert(Visibility::default()) + .insert(SphereParent) + .id(); + + // Now we have to figure out where to place the cubes. To do that, we create + // a sphere mesh, but we don't add it to the scene. Instead, we inspect the + // sphere mesh to find the positions of its vertices, and spawn a small cube + // at each one. That way, we end up with a bunch of cubes arranged in a + // spherical shape. + + // Create the sphere mesh, and extract the positions of its vertices. + let sphere = Sphere::new(OUTER_RADIUS) + .mesh() + .ico(OUTER_SUBDIVISION_COUNT) + .unwrap(); + let sphere_positions = sphere.attribute(Mesh::ATTRIBUTE_POSITION).unwrap(); + + // At each vertex, create a small cube. + for sphere_position in sphere_positions.as_float3().unwrap() { + let sphere_position = Vec3::from_slice(sphere_position); + let small_cube = commands + .spawn(Mesh3d(small_cube.clone())) + .insert(MeshMaterial3d(small_cube_material.clone())) + .insert(Transform::from_translation(sphere_position)) + .id(); + commands.entity(sphere_parent).add_child(small_cube); + } +} + +/// Spawns the large cube at the center of the screen. +/// +/// This cube rotates chaotically and occludes small cubes behind it. +fn spawn_large_cube( + commands: &mut Commands, + asset_server: &AssetServer, + meshes: &mut Assets, + materials: &mut Assets, +) { + commands + .spawn(Mesh3d(meshes.add(Cuboid::new( + LARGE_CUBE_SIZE, + LARGE_CUBE_SIZE, + LARGE_CUBE_SIZE, + )))) + .insert(MeshMaterial3d(materials.add(StandardMaterial { + base_color: WHITE.into(), + base_color_texture: Some(asset_server.load("branding/icon.png")), + ..default() + }))) + .insert(Transform::IDENTITY) + .insert(LargeCube); +} + +// Spins the outer sphere a bit every frame. +// +// This ensures that the set of cubes that are hidden and shown varies over +// time. 
+fn spin_small_cubes(mut sphere_parents: Query<&mut Transform, With>) { + for mut sphere_parent_transform in &mut sphere_parents { + sphere_parent_transform.rotate_y(ROTATION_SPEED); + } +} + +/// Spins the large cube a bit every frame. +/// +/// The chaotic rotation adds a bit of randomness to the scene to better +/// demonstrate the dynamicity of the occlusion culling. +fn spin_large_cube(mut large_cubes: Query<&mut Transform, With>) { + for mut transform in &mut large_cubes { + transform.rotate(Quat::from_euler( + EulerRot::XYZ, + 0.13 * ROTATION_SPEED, + 0.29 * ROTATION_SPEED, + 0.35 * ROTATION_SPEED, + )); + } +} + +/// Spawns a directional light to illuminate the scene. +fn spawn_light(commands: &mut Commands) { + commands + .spawn(DirectionalLight::default()) + .insert(Transform::from_rotation(Quat::from_euler( + EulerRot::ZYX, + 0.0, + PI * -0.15, + PI * -0.15, + ))); +} + +/// Spawns a camera that includes the depth prepass and occlusion culling. +fn spawn_camera(commands: &mut Commands) { + commands + .spawn(Camera3d::default()) + .insert(Transform::from_xyz(0.0, 0.0, 9.0).looking_at(Vec3::ZERO, Vec3::Y)) + .insert(DepthPrepass) + .insert(OcclusionCulling); +} + +/// Spawns the help text at the upper left of the screen. +fn spawn_help_text(commands: &mut Commands) { + commands.spawn(( + Text::new(""), + Node { + position_type: PositionType::Absolute, + top: Val::Px(12.0), + left: Val::Px(12.0), + ..default() + }, + )); +} + +impl render_graph::Node for ReadbackIndirectParametersNode { + fn run<'w>( + &self, + _: &mut RenderGraphContext, + render_context: &mut RenderContext<'w>, + world: &'w World, + ) -> Result<(), NodeRunError> { + // Extract the buffers that hold the GPU indirect draw parameters from + // the world resources. We're going to read those buffers to determine + // how many meshes were actually drawn. 
+ let (Some(indirect_parameters_buffers), Some(indirect_parameters_mapping_buffers)) = ( + world.get_resource::(), + world.get_resource::(), + ) else { + return Ok(()); + }; + + // Grab both the buffers we're copying from and the staging buffers + // we're copying to. Remember that we can't map the indirect parameters + // buffers directly, so we have to copy their contents to a staging + // buffer. + let ( + Some(indexed_data_buffer), + Some(indexed_batch_sets_buffer), + Some(indirect_parameters_staging_data_buffer), + Some(indirect_parameters_staging_batch_sets_buffer), + ) = ( + indirect_parameters_buffers.indexed_data_buffer(), + indirect_parameters_buffers.indexed_batch_sets_buffer(), + indirect_parameters_mapping_buffers.data.as_ref(), + indirect_parameters_mapping_buffers.batch_sets.as_ref(), + ) + else { + return Ok(()); + }; + + // Copy from the indirect parameters buffers to the staging buffers. + render_context.command_encoder().copy_buffer_to_buffer( + indexed_data_buffer, + 0, + indirect_parameters_staging_data_buffer, + 0, + indexed_data_buffer.size(), + ); + render_context.command_encoder().copy_buffer_to_buffer( + indexed_batch_sets_buffer, + 0, + indirect_parameters_staging_batch_sets_buffer, + 0, + indexed_batch_sets_buffer.size(), + ); + + Ok(()) + } +} + +/// Creates the staging buffers that we use to read back the indirect parameters +/// from the GPU to the CPU. +/// +/// We read the indirect parameters from the GPU to the CPU in order to display +/// the number of meshes that were culled each frame. +/// +/// We need these staging buffers because `wgpu` doesn't allow us to read the +/// contents of the indirect parameters buffers directly. We must first copy +/// them from the GPU to a staging buffer, and then read the staging buffer. 
+fn create_indirect_parameters_staging_buffers( + mut indirect_parameters_staging_buffers: ResMut, + indirect_parameters_buffers: Res, + render_device: Res, +) { + // Fetch the indirect parameters buffers that we're going to copy from. + let (Some(indexed_data_buffer), Some(indexed_batch_set_buffer)) = ( + indirect_parameters_buffers.indexed_data_buffer(), + indirect_parameters_buffers.indexed_batch_sets_buffer(), + ) else { + return; + }; + + // Build the staging buffers. Make sure they have the same sizes as the + // buffers we're copying from. + indirect_parameters_staging_buffers.data = + Some(render_device.create_buffer(&BufferDescriptor { + label: Some("indexed data staging buffer"), + size: indexed_data_buffer.size(), + usage: BufferUsages::MAP_READ | BufferUsages::COPY_DST, + mapped_at_creation: false, + })); + indirect_parameters_staging_buffers.batch_sets = + Some(render_device.create_buffer(&BufferDescriptor { + label: Some("indexed batch set staging buffer"), + size: indexed_batch_set_buffer.size(), + usage: BufferUsages::MAP_READ | BufferUsages::COPY_DST, + mapped_at_creation: false, + })); +} + +/// Updates the app status text at the top of the screen. +fn update_status_text( + saved_indirect_parameters: Res, + mut texts: Query<&mut Text>, + meshes: Query>, + app_status: Res, +) { + // How many meshes are in the scene? + let total_mesh_count = meshes.iter().count(); + + // Sample the rendered object count. Note that we don't synchronize beyond + // locking the data and therefore this value will generally be at least + // one frame behind. This is fine; this app is just a demonstration after + // all. 
+ let ( + rendered_object_count, + occlusion_culling_supported, + occlusion_culling_introspection_supported, + ): (u32, bool, bool) = { + let saved_indirect_parameters = saved_indirect_parameters.lock().unwrap(); + ( + saved_indirect_parameters + .data + .iter() + .take(saved_indirect_parameters.count as usize) + .map(|indirect_parameters| indirect_parameters.instance_count) + .sum(), + saved_indirect_parameters.occlusion_culling_supported, + saved_indirect_parameters.occlusion_culling_introspection_supported, + ) + }; + + // Change the text. + for mut text in &mut texts { + text.0 = String::new(); + if !occlusion_culling_supported { + text.0 + .push_str("Occlusion culling not supported on this platform"); + continue; + } + + let _ = writeln!( + &mut text.0, + "Occlusion culling {} (Press Space to toggle)", + if app_status.occlusion_culling { + "ON" + } else { + "OFF" + }, + ); + + if !occlusion_culling_introspection_supported { + continue; + } + + let _ = write!( + &mut text.0, + "{}/{} meshes rendered", + rendered_object_count, total_mesh_count + ); + } +} + +/// A system that reads the indirect parameters back from the GPU so that we can +/// report how many meshes were culled. +fn readback_indirect_parameters( + mut indirect_parameters_staging_buffers: ResMut, + saved_indirect_parameters: Res, + gpu_preprocessing_support: Res, +) { + // If culling isn't supported on this platform, note that, and bail. + if gpu_preprocessing_support.max_supported_mode != GpuPreprocessingMode::Culling { + saved_indirect_parameters + .lock() + .unwrap() + .occlusion_culling_supported = false; + return; + } + + // Grab the staging buffers. + let (Some(data_buffer), Some(batch_sets_buffer)) = ( + indirect_parameters_staging_buffers.data.take(), + indirect_parameters_staging_buffers.batch_sets.take(), + ) else { + return; + }; + + // Read the GPU buffers back. 
+ let saved_indirect_parameters_0 = (**saved_indirect_parameters).clone(); + let saved_indirect_parameters_1 = (**saved_indirect_parameters).clone(); + readback_buffer::(data_buffer, move |indirect_parameters| { + saved_indirect_parameters_0.lock().unwrap().data = indirect_parameters.to_vec(); + }); + readback_buffer::(batch_sets_buffer, move |indirect_parameters_count| { + saved_indirect_parameters_1.lock().unwrap().count = indirect_parameters_count[0]; + }); +} + +// A helper function to asynchronously read an array of [`Pod`] values back from +// the GPU to the CPU. +// +// The given callback is invoked when the data is ready. The buffer will +// automatically be unmapped after the callback executes. +fn readback_buffer(buffer: Buffer, callback: impl FnOnce(&[T]) + Send + 'static) +where + T: Pod, +{ + // We need to make another reference to the buffer so that we can move the + // original reference into the closure below. + let original_buffer = buffer.clone(); + original_buffer + .slice(..) + .map_async(MapMode::Read, move |result| { + // Make sure we succeeded. + if result.is_err() { + return; + } + + { + // Cast the raw bytes in the GPU buffer to the appropriate type. + let buffer_view = buffer.slice(..).get_mapped_range(); + let indirect_parameters: &[T] = bytemuck::cast_slice( + &buffer_view[0..(buffer_view.len() / size_of::() * size_of::())], + ); + + // Invoke the callback. + callback(indirect_parameters); + } + + // Unmap the buffer. We have to do this before submitting any more + // GPU command buffers, or `wgpu` will assert. + buffer.unmap(); + }); +} + +/// Adds or removes the [`OcclusionCulling`] and [`DepthPrepass`] components +/// when the user presses the spacebar. +fn toggle_occlusion_culling_on_request( + mut commands: Commands, + input: Res>, + mut app_status: ResMut, + cameras: Query>, +) { + // Only run when the user presses the spacebar. 
+ if !input.just_pressed(KeyCode::Space) { + return; + } + + // Toggle the occlusion culling flag in `AppStatus`. + app_status.occlusion_culling = !app_status.occlusion_culling; + + // Add or remove the `OcclusionCulling` and `DepthPrepass` components as + // requested. + for camera in &cameras { + if app_status.occlusion_culling { + commands + .entity(camera) + .insert(DepthPrepass) + .insert(OcclusionCulling); + } else { + commands + .entity(camera) + .remove::() + .remove::(); + } + } +} diff --git a/examples/README.md b/examples/README.md index 8963590365..858d5b3f59 100644 --- a/examples/README.md +++ b/examples/README.md @@ -164,6 +164,7 @@ Example | Description [Meshlet](../examples/3d/meshlet.rs) | Meshlet rendering for dense high-poly scenes (experimental) [Mixed lighting](../examples/3d/mixed_lighting.rs) | Demonstrates how to combine baked and dynamic lighting [Motion Blur](../examples/3d/motion_blur.rs) | Demonstrates per-pixel motion blur +[Occlusion Culling](../examples/3d/occlusion_culling.rs) | Demonstration of Occlusion Culling [Order Independent Transparency](../examples/3d/order_independent_transparency.rs) | Demonstrates how to use OIT [Orthographic View](../examples/3d/orthographic.rs) | Shows how to create a 3D orthographic view (for isometric-look in games or CAD applications) [Parallax Mapping](../examples/3d/parallax_mapping.rs) | Demonstrates use of a normal map and depth map for parallax mapping diff --git a/examples/shader/specialized_mesh_pipeline.rs b/examples/shader/specialized_mesh_pipeline.rs index bc1ecf113c..9cc414e26a 100644 --- a/examples/shader/specialized_mesh_pipeline.rs +++ b/examples/shader/specialized_mesh_pipeline.rs @@ -6,8 +6,6 @@ //! //! [`SpecializedMeshPipeline`] let's you customize the entire pipeline used when rendering a mesh. 
-use std::any::TypeId; - use bevy::{ core_pipeline::core_3d::{Opaque3d, Opaque3dBatchSetKey, Opaque3dBinKey, CORE_3D_DEPTH_FORMAT}, ecs::system::StaticSystemParam, @@ -21,11 +19,11 @@ use bevy::{ batching::GetFullBatchData, batching::{ gpu_preprocessing::{ - BatchedInstanceBuffers, IndirectParametersBuffers, PreprocessWorkItem, - PreprocessWorkItemBuffers, + self, BatchedInstanceBuffers, IndirectParametersBuffers, PreprocessWorkItem, }, GetBatchData, }, + experimental::occlusion_culling::OcclusionCulling, extract_component::{ExtractComponent, ExtractComponentPlugin}, mesh::{Indices, MeshVertexBufferLayoutRef, PrimitiveTopology, RenderMesh}, render_asset::{RenderAssetUsages, RenderAssets}, @@ -43,7 +41,6 @@ use bevy::{ view::{self, ExtractedView, RenderVisibleEntities, ViewTarget, VisibilityClass}, Render, RenderApp, RenderSet, }, - utils::TypeIdMap, }; const SHADER_ASSET_PATH: &str = "shaders/specialized_mesh_pipeline.wgsl"; @@ -288,6 +285,7 @@ fn queue_custom_mesh_pipeline( &ExtractedView, &Msaa, Has, + Has, )>, (render_meshes, render_mesh_instances): ( Res>, @@ -307,6 +305,8 @@ fn queue_custom_mesh_pipeline( let BatchedInstanceBuffers { ref mut data_buffer, ref mut work_item_buffers, + ref mut late_indexed_indirect_parameters_buffer, + ref mut late_non_indexed_indirect_parameters_buffer, .. } = gpu_array_buffer.into_inner(); @@ -318,7 +318,15 @@ fn queue_custom_mesh_pipeline( // Render phases are per-view, so we need to iterate over all views so that // the entity appears in them. (In this example, we have only one view, but // it's good practice to loop over all views anyway.) 
- for (view_entity, view_visible_entities, view, msaa, no_indirect_drawing) in views.iter() { + for ( + view_entity, + view_visible_entities, + view, + msaa, + no_indirect_drawing, + gpu_occlusion_culling, + ) in views.iter() + { let Some(opaque_phase) = opaque_render_phases.get_mut(&view.retained_view_entity) else { continue; }; @@ -326,11 +334,14 @@ fn queue_custom_mesh_pipeline( // Create a *work item buffer* if necessary. Work item buffers store the // indices of meshes that are to be rendered when indirect drawing is // enabled. - let work_item_buffer = work_item_buffers - .entry(view_entity) - .or_insert_with(TypeIdMap::default) - .entry(TypeId::of::()) - .or_insert_with(|| PreprocessWorkItemBuffers::new(no_indirect_drawing)); + let work_item_buffer = gpu_preprocessing::get_or_create_work_item_buffer::( + work_item_buffers, + view_entity, + no_indirect_drawing, + gpu_occlusion_culling, + late_indexed_indirect_parameters_buffer, + late_non_indexed_indirect_parameters_buffer, + ); // Create the key based on the view. In this case we only care about MSAA and HDR let view_key = MeshPipelineKey::from_msaa_samples(msaa.samples()) diff --git a/examples/tools/scene_viewer/main.rs b/examples/tools/scene_viewer/main.rs index af8272a13b..6935798169 100644 --- a/examples/tools/scene_viewer/main.rs +++ b/examples/tools/scene_viewer/main.rs @@ -4,12 +4,19 @@ //! replacing the path as appropriate. //! In case of multiple scenes, you can select which to display by adapting the file path: `/path/to/model.gltf#Scene1`. //! With no arguments it will load the `FlightHelmet` glTF model from the repository assets subdirectory. +//! Pass `--help` to see all the supported arguments. //! //! If you want to hot reload asset changes, enable the `file_watcher` cargo feature. 
+use argh::FromArgs; use bevy::{ + core_pipeline::prepass::{DeferredPrepass, DepthPrepass}, + pbr::DefaultOpaqueRendererMethod, prelude::*, - render::primitives::{Aabb, Sphere}, + render::{ + experimental::occlusion_culling::OcclusionCulling, + primitives::{Aabb, Sphere}, + }, }; #[path = "../../helpers/camera_controller.rs"] @@ -24,7 +31,34 @@ use camera_controller::{CameraController, CameraControllerPlugin}; use morph_viewer_plugin::MorphViewerPlugin; use scene_viewer_plugin::{SceneHandle, SceneViewerPlugin}; +/// A simple glTF scene viewer made with Bevy +#[derive(FromArgs, Resource)] +struct Args { + /// the path to the glTF scene + #[argh( + positional, + default = "\"assets/models/FlightHelmet/FlightHelmet.gltf\".to_string()" + )] + scene_path: String, + /// enable a depth prepass + #[argh(switch)] + depth_prepass: Option, + /// enable occlusion culling + #[argh(switch)] + occlusion_culling: Option, + /// enable deferred shading + #[argh(switch)] + deferred: Option, +} + fn main() { + #[cfg(not(target_arch = "wasm32"))] + let args: Args = argh::from_env(); + #[cfg(target_arch = "wasm32")] + let args: Args = Args::from_args(&[], &[]).unwrap(); + + let deferred = args.deferred; + let mut app = App::new(); app.add_plugins(( DefaultPlugins @@ -43,9 +77,15 @@ fn main() { SceneViewerPlugin, MorphViewerPlugin, )) + .insert_resource(args) .add_systems(Startup, setup) .add_systems(PreUpdate, setup_scene_after_load); + // If deferred shading was requested, turn it on. 
+ if deferred == Some(true) { + app.insert_resource(DefaultOpaqueRendererMethod::deferred()); + } + #[cfg(feature = "animation")] app.add_plugins(animation_plugin::AnimationManipulationPlugin); @@ -67,12 +107,10 @@ fn parse_scene(scene_path: String) -> (String, usize) { (scene_path, 0) } -fn setup(mut commands: Commands, asset_server: Res) { - let scene_path = std::env::args() - .nth(1) - .unwrap_or_else(|| "assets/models/FlightHelmet/FlightHelmet.gltf".to_string()); +fn setup(mut commands: Commands, asset_server: Res, args: Res) { + let scene_path = &args.scene_path; info!("Loading {}", scene_path); - let (file_path, scene_index) = parse_scene(scene_path); + let (file_path, scene_index) = parse_scene((*scene_path).clone()); commands.insert_resource(SceneHandle::new(asset_server.load(file_path), scene_index)); } @@ -82,6 +120,7 @@ fn setup_scene_after_load( mut setup: Local, mut scene_handle: ResMut, asset_server: Res, + args: Res, meshes: Query<(&GlobalTransform, Option<&Aabb>), With>, ) { if scene_handle.is_loaded && !*setup { @@ -125,7 +164,7 @@ fn setup_scene_after_load( info!("{}", camera_controller); info!("{}", *scene_handle); - commands.spawn(( + let mut camera = commands.spawn(( Camera3d::default(), Projection::from(projection), Transform::from_translation(Vec3::from(aabb.center) + size * Vec3::new(0.5, 0.25, 0.5)) @@ -145,6 +184,25 @@ fn setup_scene_after_load( camera_controller, )); + // If occlusion culling was requested, include the relevant components. + // The Z-prepass is currently required. + if args.occlusion_culling == Some(true) { + camera.insert((DepthPrepass, OcclusionCulling)); + } + + // If the depth prepass was requested, include it. + if args.depth_prepass == Some(true) { + camera.insert(DepthPrepass); + } + + // If deferred shading was requested, include the prepass. 
+ if args.deferred == Some(true) { + camera + .insert(Msaa::Off) + .insert(DepthPrepass) + .insert(DeferredPrepass); + } + // Spawn a default light if the scene does not have one if !scene_handle.has_light { info!("Spawning a directional light");