Meshlet fill cluster buffers rewritten (#15955)

# Objective
- Make the meshlet fill cluster buffers pass slightly faster
- Address https://github.com/bevyengine/bevy/issues/15920 for meshlets
- Added PreviousGlobalTransform as a required meshlet component to avoid
extra archetype moves, slightly alleviating
https://github.com/bevyengine/bevy/issues/14681 for meshlets
- Enforce that MeshletPlugin::cluster_buffer_slots is not greater than
2^25 (glitches will occur otherwise). Technically this field controls
post-lod/culling cluster count, and the issue is on pre-lod/culling
cluster count, but it's still valid now, and in the future this will be
more true.

Needs to be merged after https://github.com/bevyengine/bevy/pull/15846
and https://github.com/bevyengine/bevy/pull/15886

## Solution

- Old pass dispatched a thread per cluster, and did a binary search over
the instances to find which instance the cluster belongs to, and what
meshlet index within the instance it is.
- New pass dispatches a workgroup per instance, and has the workgroup
loop over all meshlets in the instance in order to write out the cluster
data.
- Use a push constant instead of arrayLength to fix the linked bug
- Remap 1d->2d dispatch for software raster only if actually needed to
save on spawning excess workgroups

## Testing

- Did you test these changes? If so, how?
- Ran the meshlet example, and an example with 1041 instances of 32217
meshlets per instance. Profiled the second scene with nsight, went from
0.55ms -> 0.40ms. Small savings. We're pretty much VRAM bandwidth bound
at this point.
- How can other people (reviewers) test your changes? Is there anything
specific they need to know?
  - Run the meshlet example

## Changelog (non-meshlets)
- PreviousGlobalTransform now implements the Default trait
This commit is contained in:
JMS55 2024-10-23 12:18:49 -07:00 committed by GitHub
parent 6d42830c7f
commit 3fb6cefb2f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 139 additions and 82 deletions

View File

@ -13,7 +13,7 @@
meshlet_software_raster_indirect_args, meshlet_software_raster_indirect_args,
meshlet_hardware_raster_indirect_args, meshlet_hardware_raster_indirect_args,
meshlet_raster_clusters, meshlet_raster_clusters,
meshlet_raster_cluster_rightmost_slot, constants,
MeshletBoundingSphere, MeshletBoundingSphere,
} }
#import bevy_render::maths::affine3_to_square #import bevy_render::maths::affine3_to_square
@ -32,7 +32,7 @@ fn cull_clusters(
) { ) {
// Calculate the cluster ID for this thread // Calculate the cluster ID for this thread
let cluster_id = local_invocation_index + 128u * dot(workgroup_id, vec3(num_workgroups.x * num_workgroups.x, num_workgroups.x, 1u)); let cluster_id = local_invocation_index + 128u * dot(workgroup_id, vec3(num_workgroups.x * num_workgroups.x, num_workgroups.x, 1u));
if cluster_id >= arrayLength(&meshlet_cluster_meshlet_ids) { return; } if cluster_id >= constants.scene_cluster_count { return; }
#ifdef MESHLET_SECOND_CULLING_PASS #ifdef MESHLET_SECOND_CULLING_PASS
if !cluster_is_second_pass_candidate(cluster_id) { return; } if !cluster_is_second_pass_candidate(cluster_id) { return; }
@ -138,7 +138,7 @@ fn cull_clusters(
} else { } else {
// Append this cluster to the list for hardware rasterization // Append this cluster to the list for hardware rasterization
buffer_slot = atomicAdd(&meshlet_hardware_raster_indirect_args.instance_count, 1u); buffer_slot = atomicAdd(&meshlet_hardware_raster_indirect_args.instance_count, 1u);
buffer_slot = meshlet_raster_cluster_rightmost_slot - buffer_slot; buffer_slot = constants.meshlet_raster_cluster_rightmost_slot - buffer_slot;
} }
meshlet_raster_clusters[buffer_slot] = cluster_id; meshlet_raster_clusters[buffer_slot] = cluster_id;
} }

View File

@ -1,6 +1,7 @@
#import bevy_pbr::meshlet_bindings::{ #import bevy_pbr::meshlet_bindings::{
cluster_count, scene_instance_count,
meshlet_instance_meshlet_counts_prefix_sum, meshlet_global_cluster_count,
meshlet_instance_meshlet_counts,
meshlet_instance_meshlet_slice_starts, meshlet_instance_meshlet_slice_starts,
meshlet_cluster_instance_ids, meshlet_cluster_instance_ids,
meshlet_cluster_meshlet_ids, meshlet_cluster_meshlet_ids,
@ -8,37 +9,42 @@
/// Writes out instance_id and meshlet_id to the global buffers for each cluster in the scene. /// Writes out instance_id and meshlet_id to the global buffers for each cluster in the scene.
var<workgroup> cluster_slice_start_workgroup: u32;
@compute @compute
@workgroup_size(128, 1, 1) // 128 threads per workgroup, 1 cluster per thread @workgroup_size(1024, 1, 1) // 1024 threads per workgroup, 1 instance per workgroup
fn fill_cluster_buffers( fn fill_cluster_buffers(
@builtin(workgroup_id) workgroup_id: vec3<u32>, @builtin(workgroup_id) workgroup_id: vec3<u32>,
@builtin(num_workgroups) num_workgroups: vec3<u32>, @builtin(num_workgroups) num_workgroups: vec3<u32>,
@builtin(local_invocation_index) local_invocation_index: u32, @builtin(local_invocation_index) local_invocation_index: u32,
) { ) {
// Calculate the cluster ID for this thread // Calculate the instance ID for this workgroup
let cluster_id = local_invocation_index + 128u * dot(workgroup_id, vec3(num_workgroups.x * num_workgroups.x, num_workgroups.x, 1u)); var instance_id = workgroup_id.x + (workgroup_id.y * num_workgroups.x);
if cluster_id >= cluster_count { return; } // TODO: Could be an arrayLength? if instance_id >= scene_instance_count { return; }
// Binary search to find the instance this cluster belongs to let instance_meshlet_count = meshlet_instance_meshlet_counts[instance_id];
var left = 0u; let instance_meshlet_slice_start = meshlet_instance_meshlet_slice_starts[instance_id];
var right = arrayLength(&meshlet_instance_meshlet_counts_prefix_sum) - 1u;
while left <= right { // Reserve cluster slots for the instance and broadcast to the workgroup
let mid = (left + right) / 2u; if local_invocation_index == 0u {
if meshlet_instance_meshlet_counts_prefix_sum[mid] <= cluster_id { cluster_slice_start_workgroup = atomicAdd(&meshlet_global_cluster_count, instance_meshlet_count);
left = mid + 1u;
} else {
right = mid - 1u;
}
} }
let instance_id = right; let cluster_slice_start = workgroupUniformLoad(&cluster_slice_start_workgroup);
// Find the meshlet ID for this cluster within the instance's MeshletMesh // Loop enough times to write out all the meshlets for the instance given that each thread writes 1 meshlet in each iteration
let meshlet_id_local = cluster_id - meshlet_instance_meshlet_counts_prefix_sum[instance_id]; for (var clusters_written = 0u; clusters_written < instance_meshlet_count; clusters_written += 1024u) {
// Calculate meshlet ID within this instance's MeshletMesh to process for this thread
let meshlet_id_local = clusters_written + local_invocation_index;
if meshlet_id_local >= instance_meshlet_count { return; }
// Find the overall meshlet ID in the global meshlet buffer // Find the overall cluster ID in the global cluster buffer
let meshlet_id = meshlet_id_local + meshlet_instance_meshlet_slice_starts[instance_id]; let cluster_id = cluster_slice_start + meshlet_id_local;
// Write results to buffers // Find the overall meshlet ID in the global meshlet buffer
meshlet_cluster_instance_ids[cluster_id] = instance_id; let meshlet_id = instance_meshlet_slice_start + meshlet_id_local;
meshlet_cluster_meshlet_ids[cluster_id] = meshlet_id;
// Write results to buffers
meshlet_cluster_instance_ids[cluster_id] = instance_id;
meshlet_cluster_meshlet_ids[cluster_id] = meshlet_id;
}
} }

View File

@ -10,8 +10,9 @@ use bevy_ecs::{
query::Has, query::Has,
system::{Local, Query, Res, ResMut, Resource, SystemState}, system::{Local, Query, Res, ResMut, Resource, SystemState},
}; };
use bevy_render::sync_world::MainEntity; use bevy_render::{
use bevy_render::{render_resource::StorageBuffer, view::RenderLayers, MainWorld}; render_resource::StorageBuffer, sync_world::MainEntity, view::RenderLayers, MainWorld,
};
use bevy_transform::components::GlobalTransform; use bevy_transform::components::GlobalTransform;
use bevy_utils::{HashMap, HashSet}; use bevy_utils::{HashMap, HashSet};
use core::ops::{DerefMut, Range}; use core::ops::{DerefMut, Range};
@ -19,33 +20,36 @@ use core::ops::{DerefMut, Range};
/// Manages data for each entity with a [`MeshletMesh`]. /// Manages data for each entity with a [`MeshletMesh`].
#[derive(Resource)] #[derive(Resource)]
pub struct InstanceManager { pub struct InstanceManager {
/// Amount of clusters in the scene (sum of all meshlet counts across all instances) /// Amount of instances in the scene.
pub scene_instance_count: u32,
/// Amount of clusters in the scene.
pub scene_cluster_count: u32, pub scene_cluster_count: u32,
/// Per-instance [`MainEntity`], [`RenderLayers`], and [`NotShadowCaster`] /// Per-instance [`MainEntity`], [`RenderLayers`], and [`NotShadowCaster`].
pub instances: Vec<(MainEntity, RenderLayers, bool)>, pub instances: Vec<(MainEntity, RenderLayers, bool)>,
/// Per-instance [`MeshUniform`] /// Per-instance [`MeshUniform`].
pub instance_uniforms: StorageBuffer<Vec<MeshUniform>>, pub instance_uniforms: StorageBuffer<Vec<MeshUniform>>,
/// Per-instance material ID /// Per-instance material ID.
pub instance_material_ids: StorageBuffer<Vec<u32>>, pub instance_material_ids: StorageBuffer<Vec<u32>>,
/// Prefix-sum of meshlet counts per instance /// Per-instance count of meshlets in the instance's [`MeshletMesh`].
pub instance_meshlet_counts_prefix_sum: StorageBuffer<Vec<u32>>, pub instance_meshlet_counts: StorageBuffer<Vec<u32>>,
/// Per-instance index to the start of the instance's slice of the meshlets buffer /// Per-instance index to the start of the instance's slice of the meshlets buffer.
pub instance_meshlet_slice_starts: StorageBuffer<Vec<u32>>, pub instance_meshlet_slice_starts: StorageBuffer<Vec<u32>>,
/// Per-view per-instance visibility bit. Used for [`RenderLayers`] and [`NotShadowCaster`] support. /// Per-view per-instance visibility bit. Used for [`RenderLayers`] and [`NotShadowCaster`] support.
pub view_instance_visibility: EntityHashMap<StorageBuffer<Vec<u32>>>, pub view_instance_visibility: EntityHashMap<StorageBuffer<Vec<u32>>>,
/// Next material ID available for a [`Material`] /// Next material ID available for a [`Material`].
next_material_id: u32, next_material_id: u32,
/// Map of [`Material`] to material ID /// Map of [`Material`] to material ID.
material_id_lookup: HashMap<UntypedAssetId, u32>, material_id_lookup: HashMap<UntypedAssetId, u32>,
/// Set of material IDs used in the scene /// Set of material IDs used in the scene.
material_ids_present_in_scene: HashSet<u32>, material_ids_present_in_scene: HashSet<u32>,
} }
impl InstanceManager { impl InstanceManager {
pub fn new() -> Self { pub fn new() -> Self {
Self { Self {
scene_instance_count: 0,
scene_cluster_count: 0, scene_cluster_count: 0,
instances: Vec::new(), instances: Vec::new(),
@ -59,9 +63,9 @@ impl InstanceManager {
buffer.set_label(Some("meshlet_instance_material_ids")); buffer.set_label(Some("meshlet_instance_material_ids"));
buffer buffer
}, },
instance_meshlet_counts_prefix_sum: { instance_meshlet_counts: {
let mut buffer = StorageBuffer::default(); let mut buffer = StorageBuffer::default();
buffer.set_label(Some("meshlet_instance_meshlet_counts_prefix_sum")); buffer.set_label(Some("meshlet_instance_meshlet_counts"));
buffer buffer
}, },
instance_meshlet_slice_starts: { instance_meshlet_slice_starts: {
@ -80,7 +84,7 @@ impl InstanceManager {
#[allow(clippy::too_many_arguments)] #[allow(clippy::too_many_arguments)]
pub fn add_instance( pub fn add_instance(
&mut self, &mut self,
instance: Entity, instance: MainEntity,
meshlets_slice: Range<u32>, meshlets_slice: Range<u32>,
transform: &GlobalTransform, transform: &GlobalTransform,
previous_transform: Option<&PreviousGlobalTransform>, previous_transform: Option<&PreviousGlobalTransform>,
@ -108,20 +112,21 @@ impl InstanceManager {
// Append instance data // Append instance data
self.instances.push(( self.instances.push((
instance.into(), instance,
render_layers.cloned().unwrap_or(RenderLayers::default()), render_layers.cloned().unwrap_or(RenderLayers::default()),
not_shadow_caster, not_shadow_caster,
)); ));
self.instance_uniforms.get_mut().push(mesh_uniform); self.instance_uniforms.get_mut().push(mesh_uniform);
self.instance_material_ids.get_mut().push(0); self.instance_material_ids.get_mut().push(0);
self.instance_meshlet_counts_prefix_sum self.instance_meshlet_counts
.get_mut() .get_mut()
.push(self.scene_cluster_count); .push(meshlets_slice.len() as u32);
self.instance_meshlet_slice_starts self.instance_meshlet_slice_starts
.get_mut() .get_mut()
.push(meshlets_slice.start); .push(meshlets_slice.start);
self.scene_cluster_count += meshlets_slice.end - meshlets_slice.start; self.scene_instance_count += 1;
self.scene_cluster_count += meshlets_slice.len() as u32;
} }
/// Get the material ID for a [`crate::Material`]. /// Get the material ID for a [`crate::Material`].
@ -140,12 +145,13 @@ impl InstanceManager {
} }
pub fn reset(&mut self, entities: &Entities) { pub fn reset(&mut self, entities: &Entities) {
self.scene_instance_count = 0;
self.scene_cluster_count = 0; self.scene_cluster_count = 0;
self.instances.clear(); self.instances.clear();
self.instance_uniforms.get_mut().clear(); self.instance_uniforms.get_mut().clear();
self.instance_material_ids.get_mut().clear(); self.instance_material_ids.get_mut().clear();
self.instance_meshlet_counts_prefix_sum.get_mut().clear(); self.instance_meshlet_counts.get_mut().clear();
self.instance_meshlet_slice_starts.get_mut().clear(); self.instance_meshlet_slice_starts.get_mut().clear();
self.view_instance_visibility self.view_instance_visibility
.retain(|view_entity, _| entities.contains(*view_entity)); .retain(|view_entity, _| entities.contains(*view_entity));
@ -227,7 +233,7 @@ pub fn extract_meshlet_mesh_entities(
// Add the instance's data to the instance manager // Add the instance's data to the instance manager
instance_manager.add_instance( instance_manager.add_instance(
instance, instance.into(),
meshlets_slice, meshlets_slice,
transform, transform,
previous_transform, previous_transform,

View File

@ -51,15 +51,17 @@ struct DrawIndirectArgs {
const CENTIMETERS_PER_METER = 100.0; const CENTIMETERS_PER_METER = 100.0;
#ifdef MESHLET_FILL_CLUSTER_BUFFERS_PASS #ifdef MESHLET_FILL_CLUSTER_BUFFERS_PASS
var<push_constant> cluster_count: u32; var<push_constant> scene_instance_count: u32;
@group(0) @binding(0) var<storage, read> meshlet_instance_meshlet_counts_prefix_sum: array<u32>; // Per entity instance @group(0) @binding(0) var<storage, read> meshlet_instance_meshlet_counts: array<u32>; // Per entity instance
@group(0) @binding(1) var<storage, read> meshlet_instance_meshlet_slice_starts: array<u32>; // Per entity instance @group(0) @binding(1) var<storage, read> meshlet_instance_meshlet_slice_starts: array<u32>; // Per entity instance
@group(0) @binding(2) var<storage, read_write> meshlet_cluster_instance_ids: array<u32>; // Per cluster @group(0) @binding(2) var<storage, read_write> meshlet_cluster_instance_ids: array<u32>; // Per cluster
@group(0) @binding(3) var<storage, read_write> meshlet_cluster_meshlet_ids: array<u32>; // Per cluster @group(0) @binding(3) var<storage, read_write> meshlet_cluster_meshlet_ids: array<u32>; // Per cluster
@group(0) @binding(4) var<storage, read_write> meshlet_global_cluster_count: atomic<u32>; // Single object shared between all workgroups
#endif #endif
#ifdef MESHLET_CULLING_PASS #ifdef MESHLET_CULLING_PASS
var<push_constant> meshlet_raster_cluster_rightmost_slot: u32; struct Constants { scene_cluster_count: u32, meshlet_raster_cluster_rightmost_slot: u32 }
var<push_constant> constants: Constants;
@group(0) @binding(0) var<storage, read> meshlet_cluster_meshlet_ids: array<u32>; // Per cluster @group(0) @binding(0) var<storage, read> meshlet_cluster_meshlet_ids: array<u32>; // Per cluster
@group(0) @binding(1) var<storage, read> meshlet_bounding_spheres: array<MeshletBoundingSpheres>; // Per meshlet @group(0) @binding(1) var<storage, read> meshlet_bounding_spheres: array<MeshletBoundingSpheres>; // Per meshlet
@group(0) @binding(2) var<storage, read> meshlet_simplification_errors: array<u32>; // Per meshlet @group(0) @binding(2) var<storage, read> meshlet_simplification_errors: array<u32>; // Per meshlet
@ -67,9 +69,9 @@ var<push_constant> meshlet_raster_cluster_rightmost_slot: u32;
@group(0) @binding(4) var<storage, read> meshlet_instance_uniforms: array<Mesh>; // Per entity instance @group(0) @binding(4) var<storage, read> meshlet_instance_uniforms: array<Mesh>; // Per entity instance
@group(0) @binding(5) var<storage, read> meshlet_view_instance_visibility: array<u32>; // 1 bit per entity instance, packed as a bitmask @group(0) @binding(5) var<storage, read> meshlet_view_instance_visibility: array<u32>; // 1 bit per entity instance, packed as a bitmask
@group(0) @binding(6) var<storage, read_write> meshlet_second_pass_candidates: array<atomic<u32>>; // 1 bit per cluster , packed as a bitmask @group(0) @binding(6) var<storage, read_write> meshlet_second_pass_candidates: array<atomic<u32>>; // 1 bit per cluster , packed as a bitmask
@group(0) @binding(7) var<storage, read_write> meshlet_software_raster_indirect_args: DispatchIndirectArgs; // Single object shared between all workgroups/clusters/triangles @group(0) @binding(7) var<storage, read_write> meshlet_software_raster_indirect_args: DispatchIndirectArgs; // Single object shared between all workgroups
@group(0) @binding(8) var<storage, read_write> meshlet_hardware_raster_indirect_args: DrawIndirectArgs; // Single object shared between all workgroups/clusters/triangles @group(0) @binding(8) var<storage, read_write> meshlet_hardware_raster_indirect_args: DrawIndirectArgs; // Single object shared between all workgroups
@group(0) @binding(9) var<storage, read_write> meshlet_raster_clusters: array<u32>; // Single object shared between all workgroups/clusters/triangles @group(0) @binding(9) var<storage, read_write> meshlet_raster_clusters: array<u32>; // Single object shared between all workgroups
@group(0) @binding(10) var depth_pyramid: texture_2d<f32>; // From the end of the last frame for the first culling pass, and from the first raster pass for the second culling pass @group(0) @binding(10) var depth_pyramid: texture_2d<f32>; // From the end of the last frame for the first culling pass, and from the first raster pass for the second culling pass
@group(0) @binding(11) var<uniform> view: View; @group(0) @binding(11) var<uniform> view: View;
@group(0) @binding(12) var<uniform> previous_view: PreviousViewUniforms; @group(0) @binding(12) var<uniform> previous_view: PreviousViewUniforms;
@ -95,7 +97,7 @@ fn cluster_is_second_pass_candidate(cluster_id: u32) -> bool {
@group(0) @binding(3) var<storage, read> meshlet_vertex_positions: array<u32>; // Many per meshlet @group(0) @binding(3) var<storage, read> meshlet_vertex_positions: array<u32>; // Many per meshlet
@group(0) @binding(4) var<storage, read> meshlet_cluster_instance_ids: array<u32>; // Per cluster @group(0) @binding(4) var<storage, read> meshlet_cluster_instance_ids: array<u32>; // Per cluster
@group(0) @binding(5) var<storage, read> meshlet_instance_uniforms: array<Mesh>; // Per entity instance @group(0) @binding(5) var<storage, read> meshlet_instance_uniforms: array<Mesh>; // Per entity instance
@group(0) @binding(6) var<storage, read> meshlet_raster_clusters: array<u32>; // Single object shared between all workgroups/clusters/triangles @group(0) @binding(6) var<storage, read> meshlet_raster_clusters: array<u32>; // Single object shared between all workgroups
@group(0) @binding(7) var<storage, read> meshlet_software_raster_cluster_count: u32; @group(0) @binding(7) var<storage, read> meshlet_software_raster_cluster_count: u32;
#ifdef MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT #ifdef MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT
@group(0) @binding(8) var<storage, read_write> meshlet_visibility_buffer: array<atomic<u64>>; // Per pixel @group(0) @binding(8) var<storage, read_write> meshlet_visibility_buffer: array<atomic<u64>>; // Per pixel

View File

@ -56,7 +56,7 @@ use self::{
}, },
visibility_buffer_raster_node::MeshletVisibilityBufferRasterPassNode, visibility_buffer_raster_node::MeshletVisibilityBufferRasterPassNode,
}; };
use crate::{graph::NodePbr, Material, MeshMaterial3d}; use crate::{graph::NodePbr, Material, MeshMaterial3d, PreviousGlobalTransform};
use bevy_app::{App, Plugin, PostUpdate}; use bevy_app::{App, Plugin, PostUpdate};
use bevy_asset::{load_internal_asset, AssetApp, AssetId, Handle}; use bevy_asset::{load_internal_asset, AssetApp, AssetId, Handle};
use bevy_core_pipeline::{ use bevy_core_pipeline::{
@ -129,6 +129,8 @@ pub struct MeshletPlugin {
/// If this number is too low, you'll see rendering artifacts like missing or blinking meshes. /// If this number is too low, you'll see rendering artifacts like missing or blinking meshes.
/// ///
/// Each cluster slot costs 4 bytes of VRAM. /// Each cluster slot costs 4 bytes of VRAM.
///
/// Must not be greater than 2^25.
pub cluster_buffer_slots: u32, pub cluster_buffer_slots: u32,
} }
@ -147,6 +149,11 @@ impl Plugin for MeshletPlugin {
#[cfg(target_endian = "big")] #[cfg(target_endian = "big")]
compile_error!("MeshletPlugin is only supported on little-endian processors."); compile_error!("MeshletPlugin is only supported on little-endian processors.");
if self.cluster_buffer_slots > 2_u32.pow(25) {
error!("MeshletPlugin::cluster_buffer_slots must not be greater than 2^25.");
std::process::exit(1);
}
load_internal_asset!( load_internal_asset!(
app, app,
MESHLET_BINDINGS_SHADER_HANDLE, MESHLET_BINDINGS_SHADER_HANDLE,
@ -293,7 +300,7 @@ impl Plugin for MeshletPlugin {
/// The meshlet mesh equivalent of [`bevy_render::mesh::Mesh3d`]. /// The meshlet mesh equivalent of [`bevy_render::mesh::Mesh3d`].
#[derive(Component, Clone, Debug, Default, Deref, DerefMut, Reflect, PartialEq, Eq, From)] #[derive(Component, Clone, Debug, Default, Deref, DerefMut, Reflect, PartialEq, Eq, From)]
#[reflect(Component, Default)] #[reflect(Component, Default)]
#[require(Transform, Visibility)] #[require(Transform, PreviousGlobalTransform, Visibility)]
pub struct MeshletMesh3d(pub Handle<MeshletMesh>); pub struct MeshletMesh3d(pub Handle<MeshletMesh>);
impl From<MeshletMesh3d> for AssetId<MeshletMesh> { impl From<MeshletMesh3d> for AssetId<MeshletMesh> {

View File

@ -84,7 +84,7 @@ impl FromWorld for MeshletPipelines {
layout: vec![cull_layout.clone()], layout: vec![cull_layout.clone()],
push_constant_ranges: vec![PushConstantRange { push_constant_ranges: vec![PushConstantRange {
stages: ShaderStages::COMPUTE, stages: ShaderStages::COMPUTE,
range: 0..4, range: 0..8,
}], }],
shader: MESHLET_CULLING_SHADER_HANDLE, shader: MESHLET_CULLING_SHADER_HANDLE,
shader_defs: vec![ shader_defs: vec![
@ -99,7 +99,7 @@ impl FromWorld for MeshletPipelines {
layout: vec![cull_layout], layout: vec![cull_layout],
push_constant_ranges: vec![PushConstantRange { push_constant_ranges: vec![PushConstantRange {
stages: ShaderStages::COMPUTE, stages: ShaderStages::COMPUTE,
range: 0..4, range: 0..8,
}], }],
shader: MESHLET_CULLING_SHADER_HANDLE, shader: MESHLET_CULLING_SHADER_HANDLE,
shader_defs: vec![ shader_defs: vec![
@ -441,7 +441,10 @@ impl FromWorld for MeshletPipelines {
pipeline_cache.queue_compute_pipeline(ComputePipelineDescriptor { pipeline_cache.queue_compute_pipeline(ComputePipelineDescriptor {
label: Some("meshlet_remap_1d_to_2d_dispatch_pipeline".into()), label: Some("meshlet_remap_1d_to_2d_dispatch_pipeline".into()),
layout: vec![layout], layout: vec![layout],
push_constant_ranges: vec![], push_constant_ranges: vec![PushConstantRange {
stages: ShaderStages::COMPUTE,
range: 0..4,
}],
shader: MESHLET_REMAP_1D_TO_2D_DISPATCH_SHADER_HANDLE, shader: MESHLET_REMAP_1D_TO_2D_DISPATCH_SHADER_HANDLE,
shader_defs: vec![], shader_defs: vec![],
entry_point: "remap_dispatch".into(), entry_point: "remap_dispatch".into(),

View File

@ -8,13 +8,16 @@ struct DispatchIndirectArgs {
@group(0) @binding(0) var<storage, read_write> meshlet_software_raster_indirect_args: DispatchIndirectArgs; @group(0) @binding(0) var<storage, read_write> meshlet_software_raster_indirect_args: DispatchIndirectArgs;
@group(0) @binding(1) var<storage, read_write> meshlet_software_raster_cluster_count: u32; @group(0) @binding(1) var<storage, read_write> meshlet_software_raster_cluster_count: u32;
var<push_constant> max_compute_workgroups_per_dimension: u32;
@compute @compute
@workgroup_size(1, 1, 1) @workgroup_size(1, 1, 1)
fn remap_dispatch() { fn remap_dispatch() {
meshlet_software_raster_cluster_count = meshlet_software_raster_indirect_args.x; meshlet_software_raster_cluster_count = meshlet_software_raster_indirect_args.x;
let n = u32(ceil(sqrt(f32(meshlet_software_raster_indirect_args.x)))); if meshlet_software_raster_cluster_count > max_compute_workgroups_per_dimension {
meshlet_software_raster_indirect_args.x = n; let n = u32(ceil(sqrt(f32(meshlet_software_raster_cluster_count))));
meshlet_software_raster_indirect_args.y = n; meshlet_software_raster_indirect_args.x = n;
meshlet_software_raster_indirect_args.y = n;
}
} }

View File

@ -122,6 +122,7 @@ impl ResourceManager {
storage_buffer_read_only_sized(false, None), storage_buffer_read_only_sized(false, None),
storage_buffer_sized(false, None), storage_buffer_sized(false, None),
storage_buffer_sized(false, None), storage_buffer_sized(false, None),
storage_buffer_sized(false, None),
), ),
), ),
), ),
@ -246,6 +247,7 @@ impl ResourceManager {
#[derive(Component)] #[derive(Component)]
pub struct MeshletViewResources { pub struct MeshletViewResources {
pub scene_instance_count: u32,
pub scene_cluster_count: u32, pub scene_cluster_count: u32,
pub second_pass_candidates_buffer: Buffer, pub second_pass_candidates_buffer: Buffer,
instance_visibility: Buffer, instance_visibility: Buffer,
@ -330,7 +332,7 @@ pub fn prepare_meshlet_per_frame_resources(
&render_queue, &render_queue,
); );
upload_storage_buffer( upload_storage_buffer(
&mut instance_manager.instance_meshlet_counts_prefix_sum, &mut instance_manager.instance_meshlet_counts,
&render_device, &render_device,
&render_queue, &render_queue,
); );
@ -340,9 +342,6 @@ pub fn prepare_meshlet_per_frame_resources(
&render_queue, &render_queue,
); );
// Early submission for GPU data uploads to start while the render graph records commands
render_queue.submit([]);
let needed_buffer_size = 4 * instance_manager.scene_cluster_count as u64; let needed_buffer_size = 4 * instance_manager.scene_cluster_count as u64;
match &mut resource_manager.cluster_instance_ids { match &mut resource_manager.cluster_instance_ids {
Some(buffer) if buffer.size() >= needed_buffer_size => buffer.clone(), Some(buffer) if buffer.size() >= needed_buffer_size => buffer.clone(),
@ -553,6 +552,7 @@ pub fn prepare_meshlet_per_frame_resources(
}; };
commands.entity(view_entity).insert(MeshletViewResources { commands.entity(view_entity).insert(MeshletViewResources {
scene_instance_count: instance_manager.scene_instance_count,
scene_cluster_count: instance_manager.scene_cluster_count, scene_cluster_count: instance_manager.scene_cluster_count,
second_pass_candidates_buffer, second_pass_candidates_buffer,
instance_visibility, instance_visibility,
@ -602,19 +602,25 @@ pub fn prepare_meshlet_view_bind_groups(
let first_node = Arc::new(AtomicBool::new(true)); let first_node = Arc::new(AtomicBool::new(true));
let fill_cluster_buffers_global_cluster_count =
render_device.create_buffer(&BufferDescriptor {
label: Some("meshlet_fill_cluster_buffers_global_cluster_count"),
size: 4,
usage: BufferUsages::STORAGE,
mapped_at_creation: false,
});
// TODO: Some of these bind groups can be reused across multiple views // TODO: Some of these bind groups can be reused across multiple views
for (view_entity, view_resources) in &views { for (view_entity, view_resources) in &views {
let entries = BindGroupEntries::sequential(( let entries = BindGroupEntries::sequential((
instance_manager instance_manager.instance_meshlet_counts.binding().unwrap(),
.instance_meshlet_counts_prefix_sum
.binding()
.unwrap(),
instance_manager instance_manager
.instance_meshlet_slice_starts .instance_meshlet_slice_starts
.binding() .binding()
.unwrap(), .unwrap(),
cluster_instance_ids.as_entire_binding(), cluster_instance_ids.as_entire_binding(),
cluster_meshlet_ids.as_entire_binding(), cluster_meshlet_ids.as_entire_binding(),
fill_cluster_buffers_global_cluster_count.as_entire_binding(),
)); ));
let fill_cluster_buffers = render_device.create_bind_group( let fill_cluster_buffers = render_device.create_bind_group(
"meshlet_fill_cluster_buffers", "meshlet_fill_cluster_buffers",

View File

@ -118,8 +118,7 @@ impl Node for MeshletVisibilityBufferRasterPassNode {
render_context, render_context,
&meshlet_view_bind_groups.fill_cluster_buffers, &meshlet_view_bind_groups.fill_cluster_buffers,
fill_cluster_buffers_pipeline, fill_cluster_buffers_pipeline,
thread_per_cluster_workgroups, meshlet_view_resources.scene_instance_count,
meshlet_view_resources.scene_cluster_count,
); );
} }
cull_pass( cull_pass(
@ -130,6 +129,7 @@ impl Node for MeshletVisibilityBufferRasterPassNode {
previous_view_offset, previous_view_offset,
culling_first_pipeline, culling_first_pipeline,
thread_per_cluster_workgroups, thread_per_cluster_workgroups,
meshlet_view_resources.scene_cluster_count,
meshlet_view_resources.raster_cluster_rightmost_slot, meshlet_view_resources.raster_cluster_rightmost_slot,
meshlet_view_bind_groups meshlet_view_bind_groups
.remap_1d_to_2d_dispatch .remap_1d_to_2d_dispatch
@ -165,6 +165,7 @@ impl Node for MeshletVisibilityBufferRasterPassNode {
previous_view_offset, previous_view_offset,
culling_second_pipeline, culling_second_pipeline,
thread_per_cluster_workgroups, thread_per_cluster_workgroups,
meshlet_view_resources.scene_cluster_count,
meshlet_view_resources.raster_cluster_rightmost_slot, meshlet_view_resources.raster_cluster_rightmost_slot,
meshlet_view_bind_groups meshlet_view_bind_groups
.remap_1d_to_2d_dispatch .remap_1d_to_2d_dispatch
@ -253,6 +254,7 @@ impl Node for MeshletVisibilityBufferRasterPassNode {
previous_view_offset, previous_view_offset,
culling_first_pipeline, culling_first_pipeline,
thread_per_cluster_workgroups, thread_per_cluster_workgroups,
meshlet_view_resources.scene_cluster_count,
meshlet_view_resources.raster_cluster_rightmost_slot, meshlet_view_resources.raster_cluster_rightmost_slot,
meshlet_view_bind_groups meshlet_view_bind_groups
.remap_1d_to_2d_dispatch .remap_1d_to_2d_dispatch
@ -288,6 +290,7 @@ impl Node for MeshletVisibilityBufferRasterPassNode {
previous_view_offset, previous_view_offset,
culling_second_pipeline, culling_second_pipeline,
thread_per_cluster_workgroups, thread_per_cluster_workgroups,
meshlet_view_resources.scene_cluster_count,
meshlet_view_resources.raster_cluster_rightmost_slot, meshlet_view_resources.raster_cluster_rightmost_slot,
meshlet_view_bind_groups meshlet_view_bind_groups
.remap_1d_to_2d_dispatch .remap_1d_to_2d_dispatch
@ -334,21 +337,32 @@ fn fill_cluster_buffers_pass(
render_context: &mut RenderContext, render_context: &mut RenderContext,
fill_cluster_buffers_bind_group: &BindGroup, fill_cluster_buffers_bind_group: &BindGroup,
fill_cluster_buffers_pass_pipeline: &ComputePipeline, fill_cluster_buffers_pass_pipeline: &ComputePipeline,
fill_cluster_buffers_pass_workgroups: u32, scene_instance_count: u32,
cluster_count: u32,
) { ) {
let mut fill_cluster_buffers_pass_workgroups_x = scene_instance_count;
let mut fill_cluster_buffers_pass_workgroups_y = 1;
if scene_instance_count
> render_context
.render_device()
.limits()
.max_compute_workgroups_per_dimension
{
fill_cluster_buffers_pass_workgroups_x = (scene_instance_count as f32).sqrt().ceil() as u32;
fill_cluster_buffers_pass_workgroups_y = fill_cluster_buffers_pass_workgroups_x;
}
let command_encoder = render_context.command_encoder(); let command_encoder = render_context.command_encoder();
let mut fill_pass = command_encoder.begin_compute_pass(&ComputePassDescriptor { let mut fill_pass = command_encoder.begin_compute_pass(&ComputePassDescriptor {
label: Some("fill_cluster_buffers"), label: Some("fill_cluster_buffers"),
timestamp_writes: None, timestamp_writes: None,
}); });
fill_pass.set_pipeline(fill_cluster_buffers_pass_pipeline); fill_pass.set_pipeline(fill_cluster_buffers_pass_pipeline);
fill_pass.set_push_constants(0, &cluster_count.to_le_bytes()); fill_pass.set_push_constants(0, &scene_instance_count.to_le_bytes());
fill_pass.set_bind_group(0, fill_cluster_buffers_bind_group, &[]); fill_pass.set_bind_group(0, fill_cluster_buffers_bind_group, &[]);
fill_pass.dispatch_workgroups( fill_pass.dispatch_workgroups(
fill_cluster_buffers_pass_workgroups, fill_cluster_buffers_pass_workgroups_x,
fill_cluster_buffers_pass_workgroups, fill_cluster_buffers_pass_workgroups_y,
fill_cluster_buffers_pass_workgroups, 1,
); );
} }
@ -361,17 +375,26 @@ fn cull_pass(
previous_view_offset: &PreviousViewUniformOffset, previous_view_offset: &PreviousViewUniformOffset,
culling_pipeline: &ComputePipeline, culling_pipeline: &ComputePipeline,
culling_workgroups: u32, culling_workgroups: u32,
scene_cluster_count: u32,
raster_cluster_rightmost_slot: u32, raster_cluster_rightmost_slot: u32,
remap_1d_to_2d_dispatch_bind_group: Option<&BindGroup>, remap_1d_to_2d_dispatch_bind_group: Option<&BindGroup>,
remap_1d_to_2d_dispatch_pipeline: Option<&ComputePipeline>, remap_1d_to_2d_dispatch_pipeline: Option<&ComputePipeline>,
) { ) {
let max_compute_workgroups_per_dimension = render_context
.render_device()
.limits()
.max_compute_workgroups_per_dimension;
let command_encoder = render_context.command_encoder(); let command_encoder = render_context.command_encoder();
let mut cull_pass = command_encoder.begin_compute_pass(&ComputePassDescriptor { let mut cull_pass = command_encoder.begin_compute_pass(&ComputePassDescriptor {
label: Some(label), label: Some(label),
timestamp_writes: None, timestamp_writes: None,
}); });
cull_pass.set_pipeline(culling_pipeline); cull_pass.set_pipeline(culling_pipeline);
cull_pass.set_push_constants(0, &raster_cluster_rightmost_slot.to_le_bytes()); cull_pass.set_push_constants(
0,
bytemuck::cast_slice(&[scene_cluster_count, raster_cluster_rightmost_slot]),
);
cull_pass.set_bind_group( cull_pass.set_bind_group(
0, 0,
culling_bind_group, culling_bind_group,
@ -384,6 +407,7 @@ fn cull_pass(
remap_1d_to_2d_dispatch_bind_group, remap_1d_to_2d_dispatch_bind_group,
) { ) {
cull_pass.set_pipeline(remap_1d_to_2d_dispatch_pipeline); cull_pass.set_pipeline(remap_1d_to_2d_dispatch_pipeline);
cull_pass.set_push_constants(0, &max_compute_workgroups_per_dimension.to_be_bytes());
cull_pass.set_bind_group(0, remap_1d_to_2d_dispatch_bind_group, &[]); cull_pass.set_bind_group(0, remap_1d_to_2d_dispatch_bind_group, &[]);
cull_pass.dispatch_workgroups(1, 1, 1); cull_pass.dispatch_workgroups(1, 1, 1);
} }

View File

@ -172,7 +172,7 @@ fn resolve_vertex_output(frag_coord: vec4<f32>) -> VertexOutput {
ddy_uv, ddy_uv,
world_tangent, world_tangent,
instance_uniform.flags, instance_uniform.flags,
cluster_id, instance_id ^ meshlet_id,
#ifdef PREPASS_FRAGMENT #ifdef PREPASS_FRAGMENT
#ifdef MOTION_VECTOR_PREPASS #ifdef MOTION_VECTOR_PREPASS
motion_vector, motion_vector,

View File

@ -216,7 +216,7 @@ pub fn update_previous_view_data(
} }
} }
#[derive(Component)] #[derive(Component, Default)]
pub struct PreviousGlobalTransform(pub Affine3A); pub struct PreviousGlobalTransform(pub Affine3A);
#[cfg(not(feature = "meshlet"))] #[cfg(not(feature = "meshlet"))]