bevy/crates/bevy_pbr/src/meshlet/cull_bvh.wgsl
atlv 57e58ef997
Meshlet BVH Culling (#19318)
# Objective

- Merge @SparkyPotato 's efforts to implement BVH-accelerated meshlet
culling.

## Solution

- Add hot reloading support
- Fix near-plane overculling
- Fix hzb sampling
- Fix orthographic error metric

## Testing

- Meshlet example, Nsight, hot-reloading and careful thinking

---------

Co-authored-by: SparkyPotato <noob.sparkypotato@gmail.com>
Co-authored-by: JMS55 <47158642+JMS55@users.noreply.github.com>
Co-authored-by: charlotte <charlotte.c.mcelwain@gmail.com>
2025-06-29 00:04:21 +00:00

111 lines
4.2 KiB
WebGPU Shading Language

#import bevy_pbr::meshlet_bindings::{
InstancedOffset,
get_aabb,
get_aabb_error,
get_aabb_child_offset,
constants,
meshlet_bvh_nodes,
meshlet_bvh_cull_count_read,
meshlet_bvh_cull_count_write,
meshlet_bvh_cull_dispatch,
meshlet_bvh_cull_queue,
meshlet_meshlet_cull_count_early,
meshlet_meshlet_cull_count_late,
meshlet_meshlet_cull_dispatch_early,
meshlet_meshlet_cull_dispatch_late,
meshlet_meshlet_cull_queue,
meshlet_second_pass_bvh_count,
meshlet_second_pass_bvh_dispatch,
meshlet_second_pass_bvh_queue,
}
#import bevy_pbr::meshlet_cull_shared::{
lod_error_is_imperceptible,
aabb_in_frustum,
should_occlusion_cull_aabb,
}
@compute
@workgroup_size(128, 1, 1) // 8 threads per node, 16 nodes per workgroup
fn cull_bvh(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {
// Calculate the queue ID for this thread
let dispatch_id = global_invocation_id.x;
var node = dispatch_id >> 3u;
let subnode = dispatch_id & 7u;
if node >= meshlet_bvh_cull_count_read { return; }
node = select(node, constants.rightmost_slot - node, constants.read_from_front == 0u);
let instanced_offset = meshlet_bvh_cull_queue[node];
let instance_id = instanced_offset.instance_id;
let bvh_node = &meshlet_bvh_nodes[instanced_offset.offset];
var aabb_error_offset = (*bvh_node).aabbs[subnode];
let aabb = get_aabb(&aabb_error_offset);
let parent_error = get_aabb_error(&aabb_error_offset);
let lod_sphere = (*bvh_node).lod_bounds[subnode];
let parent_is_imperceptible = lod_error_is_imperceptible(lod_sphere, parent_error, instance_id);
// Error and frustum cull, in both passes
if parent_is_imperceptible || !aabb_in_frustum(aabb, instance_id) { return; }
let child_offset = get_aabb_child_offset(&aabb_error_offset);
let index = subnode >> 2u;
let bit_offset = subnode & 3u;
let packed_child_count = (*bvh_node).child_counts[index];
let child_count = extractBits(packed_child_count, bit_offset * 8u, 8u);
var value = InstancedOffset(instance_id, child_offset);
// If we pass, try occlusion culling
// If this node was occluded, push it's children to the second pass to check against this frame's HZB
if should_occlusion_cull_aabb(aabb, instance_id) {
#ifdef MESHLET_FIRST_CULLING_PASS
if child_count == 255u {
let id = atomicAdd(&meshlet_second_pass_bvh_count, 1u);
meshlet_second_pass_bvh_queue[id] = value;
if ((id & 15u) == 0u) {
atomicAdd(&meshlet_second_pass_bvh_dispatch.x, 1u);
}
} else {
let base = atomicAdd(&meshlet_meshlet_cull_count_late, child_count);
let start = constants.rightmost_slot - base;
for (var i = start; i < start - child_count; i--) {
meshlet_meshlet_cull_queue[i] = value;
value.offset += 1u;
}
let req = (base + child_count + 127u) >> 7u;
atomicMax(&meshlet_meshlet_cull_dispatch_late.x, req);
}
#endif
return;
}
// If we pass, push the children to the next BVH cull
if child_count == 255u {
let id = atomicAdd(&meshlet_bvh_cull_count_write, 1u);
let index = select(constants.rightmost_slot - id, id, constants.read_from_front == 0u);
meshlet_bvh_cull_queue[index] = value;
if ((id & 15u) == 0u) {
atomicAdd(&meshlet_bvh_cull_dispatch.x, 1u);
}
} else {
#ifdef MESHLET_FIRST_CULLING_PASS
let base = atomicAdd(&meshlet_meshlet_cull_count_early, child_count);
let end = base + child_count;
for (var i = base; i < end; i++) {
meshlet_meshlet_cull_queue[i] = value;
value.offset += 1u;
}
let req = (end + 127u) >> 7u;
atomicMax(&meshlet_meshlet_cull_dispatch_early.x, req);
#else
let base = atomicAdd(&meshlet_meshlet_cull_count_late, child_count);
let start = constants.rightmost_slot - base;
for (var i = start; i < start - child_count; i--) {
meshlet_meshlet_cull_queue[i] = value;
value.offset += 1u;
}
let req = (base + child_count + 127u) >> 7u;
atomicMax(&meshlet_meshlet_cull_dispatch_late.x, req);
#endif
}
}