// GPU mesh transforming and culling.
//
// This is a compute shader that expands each `MeshInputUniform` out to a full
// `MeshUniform` for each view before rendering. (Thus `MeshInputUniform` and
// `MeshUniform` are in a 1:N relationship.) It runs in parallel for all meshes
// for all views. As part of this process, the shader gathers each mesh's
// transform on the previous frame and writes it into the `MeshUniform` so that
// TAA works. It also performs frustum culling and occlusion culling, if
// requested.
//
// If occlusion culling is on, this shader runs twice: once to prepare the
// meshes that were visible last frame, and once to prepare the meshes that
// weren't visible last frame but became visible this frame. The two invocations
// are known as *early mesh preprocessing* and *late mesh preprocessing*
// respectively.

#import bevy_pbr::mesh_preprocess_types::{
    IndirectParametersCpuMetadata, IndirectParametersGpuMetadata, MeshInput
}
#import bevy_pbr::mesh_types::{Mesh, MESH_FLAGS_NO_FRUSTUM_CULLING_BIT}
#import bevy_pbr::mesh_view_bindings::view
#import bevy_pbr::occlusion_culling
#import bevy_pbr::prepass_bindings::previous_view_uniforms
#import bevy_pbr::view_transformations::{
    position_world_to_ndc, position_world_to_view, ndc_to_uv, view_z_to_depth_ndc,
    position_world_to_prev_ndc, position_world_to_prev_view, prev_view_z_to_depth_ndc
}
#import bevy_render::maths
#import bevy_render::view::View

// Information about each mesh instance needed to cull it on GPU.
//
// At the moment, this just consists of its axis-aligned bounding box (AABB).
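//
// As a hypothetical example, a unit cube centered at the origin would be
// encoded as `aabb_center = vec4(0.0, 0.0, 0.0, 0.0)` and
// `aabb_half_extents = vec4(0.5, 0.5, 0.5, 0.0)`, with the `w` components
// serving purely as padding.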
struct MeshCullingData {
    // The 3D center of the AABB in model space, padded with an extra unused
    // float value.
    aabb_center: vec4<f32>,
    // The 3D extents of the AABB in model space, divided by two, padded with
    // an extra unused float value.
    aabb_half_extents: vec4<f32>,
}

// One invocation of this compute shader: i.e. one mesh instance in a view.
struct PreprocessWorkItem {
    // The index of the `MeshInput` in the `current_input` buffer that we read
    // from.
    input_index: u32,
    // In direct mode, the index of the `Mesh` in `output` that we write to. In
    // indirect mode, the index of the `IndirectParameters` in
    // `indirect_parameters` that we write to.
    output_or_indirect_parameters_index: u32,
}

// The parameters for the indirect compute dispatch for the late mesh
// preprocessing phase.
struct LatePreprocessWorkItemIndirectParameters {
    // The number of workgroups we're going to dispatch.
    //
    // This value should always be equal to `ceil(work_item_count / 64)`.
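    //
    // The early phase maintains this invariant itself: whenever the work item
    // it pushes crosses a multiple of 64, it also bumps this counter (see
    // `main` below). Both counters are presumably zeroed before the early
    // phase runs each frame.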
    dispatch_x: atomic<u32>,
    // The number of workgroups in the Y direction; always 1.
    dispatch_y: u32,
    // The number of workgroups in the Z direction; always 1.
    dispatch_z: u32,
    // The precise number of work items.
    work_item_count: atomic<u32>,
    // Padding.
    //
    // This isn't the usual structure padding; it's needed because some hardware
    // requires indirect compute dispatch parameters to be aligned on 64-byte
    // boundaries.
    pad: vec4<u32>,
}

// These have to be in a structure because of Naga limitations on DX12.
struct PushConstants {
    // The offset into the `late_preprocess_work_item_indirect_parameters`
    // buffer.
    late_preprocess_work_item_indirect_offset: u32,
}

// The current frame's `MeshInput`.
@group(0) @binding(3) var<storage> current_input: array<MeshInput>;
// The `MeshInput` values from the previous frame.
@group(0) @binding(4) var<storage> previous_input: array<MeshInput>;
// Indices into the `MeshInput` buffer.
//
// There may be many indices that map to the same `MeshInput`.
@group(0) @binding(5) var<storage> work_items: array<PreprocessWorkItem>;
// The output array of `Mesh`es.
@group(0) @binding(6) var<storage, read_write> output: array<Mesh>;

#ifdef INDIRECT
// The array of indirect parameters for drawcalls.
@group(0) @binding(7) var<storage> indirect_parameters_cpu_metadata:
    array<IndirectParametersCpuMetadata>;

@group(0) @binding(8) var<storage, read_write> indirect_parameters_gpu_metadata:
    array<IndirectParametersGpuMetadata>;
#endif

#ifdef FRUSTUM_CULLING
// Data needed to cull the meshes.
//
// At the moment, this consists only of AABBs.
@group(0) @binding(9) var<storage> mesh_culling_data: array<MeshCullingData>;
#endif // FRUSTUM_CULLING

#ifdef OCCLUSION_CULLING
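// The depth pyramid (hierarchical Z-buffer) to test AABBs against. In the
// early phase this is presumably the pyramid built from the previous frame's
// depth buffer; in the late phase, the pyramid rebuilt from the depth laid
// down by the early phase's meshes this frame.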
@group(0) @binding(10) var depth_pyramid: texture_2d<f32>;

#ifdef EARLY_PHASE
@group(0) @binding(11) var<storage, read_write> late_preprocess_work_items:
    array<PreprocessWorkItem>;
#endif // EARLY_PHASE

@group(0) @binding(12) var<storage, read_write> late_preprocess_work_item_indirect_parameters:
    array<LatePreprocessWorkItemIndirectParameters>;

var<push_constant> push_constants: PushConstants;
#endif // OCCLUSION_CULLING

#ifdef FRUSTUM_CULLING
// Returns true if the view frustum intersects an oriented bounding box (OBB).
//
// `aabb_center.w` should be 1.0.
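//
// Note that only the first five frustum planes are tested: with Bevy's
// infinite reverse-Z projection there is presumably no far plane to cull
// against.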
fn view_frustum_intersects_obb(
    world_from_local: mat4x4<f32>,
    aabb_center: vec4<f32>,
    aabb_half_extents: vec3<f32>,
) -> bool {
    for (var i = 0; i < 5; i += 1) {
        // Calculate relative radius of the sphere associated with this plane.
        let plane_normal = view.frustum[i];
        let relative_radius = dot(
            abs(
                vec3(
                    dot(plane_normal.xyz, world_from_local[0].xyz),
                    dot(plane_normal.xyz, world_from_local[1].xyz),
                    dot(plane_normal.xyz, world_from_local[2].xyz),
                )
            ),
            aabb_half_extents
        );
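        // `relative_radius` is the half-length of the projection of the
        // transformed box onto the plane normal: r = sum_j |dot(n, M_j)| * h_j,
        // where M_j is the j-th column of `world_from_local` and h_j is the
        // j-th half-extent. The OBB-plane test thus reduces to a sphere-plane
        // test with this radius.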
        // Check the frustum plane.
        if (!maths::sphere_intersects_plane_half_space(
                plane_normal, aabb_center, relative_radius)) {
            return false;
        }
    }

    return true;
}
#endif

@compute
@workgroup_size(64)
fn main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {
    // Figure out our instance index. If this thread doesn't correspond to any
    // index, bail.
    let instance_index = global_invocation_id.x;

#ifdef LATE_PHASE
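    // The late phase's work item buffer was filled on the GPU by the early
    // phase and is presumably sized for the worst case, so `arrayLength`
    // would count unused slots; we bound-check against the
    // atomically-maintained count of valid entries instead.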
    if (instance_index >= atomicLoad(&late_preprocess_work_item_indirect_parameters[
            push_constants.late_preprocess_work_item_indirect_offset].work_item_count)) {
        return;
    }
#else // LATE_PHASE
    if (instance_index >= arrayLength(&work_items)) {
        return;
    }
#endif

    // Unpack the work item.
    let input_index = work_items[instance_index].input_index;
#ifdef INDIRECT
    let indirect_parameters_index = work_items[instance_index].output_or_indirect_parameters_index;

    // If we're the first mesh instance in this batch, write the index of our
    // `MeshInput` into the appropriate slot so that the indirect parameters
    // building shader can access it.
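    //
    // This test works because work items that belong to the same batch are
    // contiguous in the buffer, so exactly one invocation (the first instance
    // of each batch) sees a mismatch with its predecessor.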
#ifndef LATE_PHASE
    if (instance_index == 0u ||
            work_items[instance_index - 1].output_or_indirect_parameters_index !=
            indirect_parameters_index) {
        indirect_parameters_gpu_metadata[indirect_parameters_index].mesh_index = input_index;
    }
#endif // LATE_PHASE

#else // INDIRECT
    let mesh_output_index = work_items[instance_index].output_or_indirect_parameters_index;
#endif // INDIRECT

    // Unpack the input matrix.
    let world_from_local_affine_transpose = current_input[input_index].world_from_local;
    let world_from_local = maths::affine3_to_square(world_from_local_affine_transpose);

    // Frustum cull if necessary.
#ifdef FRUSTUM_CULLING
    if ((current_input[input_index].flags & MESH_FLAGS_NO_FRUSTUM_CULLING_BIT) == 0u) {
        let aabb_center = mesh_culling_data[input_index].aabb_center.xyz;
        let aabb_half_extents = mesh_culling_data[input_index].aabb_half_extents.xyz;

        // Do an OBB-based frustum cull.
        let model_center = world_from_local * vec4(aabb_center, 1.0);
        if (!view_frustum_intersects_obb(world_from_local, model_center, aabb_half_extents)) {
            return;
        }
    }
#endif

    // See whether the `MeshInputUniform` was updated on this frame. If it
    // wasn't, then we know the transforms of this mesh must be identical to
    // those on the previous frame, and therefore we don't need to access the
    // `previous_input_index` (in fact, we can't; that index is only valid for
    // one frame and would be stale by now).
    let timestamp = current_input[input_index].timestamp;
    let mesh_changed_this_frame = timestamp == view.frame_count;

    // Look up the previous model matrix, if it could have changed.
    let previous_input_index = current_input[input_index].previous_input_index;
    var previous_world_from_local_affine_transpose: mat3x4<f32>;
    if (mesh_changed_this_frame && previous_input_index != 0xffffffffu) {
        previous_world_from_local_affine_transpose =
            previous_input[previous_input_index].world_from_local;
    } else {
        previous_world_from_local_affine_transpose = world_from_local_affine_transpose;
    }
    let previous_world_from_local =
        maths::affine3_to_square(previous_world_from_local_affine_transpose);

    // Occlusion cull if necessary. This is done by calculating the screen-space
    // axis-aligned bounding box (AABB) of the mesh and testing it against the
    // appropriate level of the depth pyramid (a.k.a. hierarchical Z-buffer). If
    // no part of the AABB is in front of the corresponding pixel quad in the
    // hierarchical Z-buffer, then this mesh must be occluded, and we can skip
    // rendering it.
#ifdef OCCLUSION_CULLING
    let aabb_center = mesh_culling_data[input_index].aabb_center.xyz;
    let aabb_half_extents = mesh_culling_data[input_index].aabb_half_extents.xyz;

    // Initialize the AABB and the maximum depth.
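    //
    // `aabb` is packed as (min_u, min_v, max_u, max_v) in the UV space of the
    // depth pyramid. Initializing the minima to +inf and the maxima to -inf
    // guarantees that the first corner processed replaces them.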
    let infinity = bitcast<f32>(0x7f800000u);
    let neg_infinity = bitcast<f32>(0xff800000u);
    var aabb = vec4(infinity, infinity, neg_infinity, neg_infinity);
    var max_depth_view = neg_infinity;

    // Build up the AABB by taking each corner of this mesh's OBB, transforming
    // it, and updating the AABB and depth accordingly.
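    //
    // Bits 0, 1, and 2 of the loop counter select the sign of the X, Y, and Z
    // half-extents respectively, so the eight iterations enumerate all eight
    // corners; e.g. i = 5 (0b101) yields the (+x, -y, +z) corner.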
    for (var i = 0u; i < 8u; i += 1u) {
        let local_pos = aabb_center + select(
            vec3(-1.0),
            vec3(1.0),
            vec3((i & 1) != 0, (i & 2) != 0, (i & 4) != 0)
        ) * aabb_half_extents;

#ifdef EARLY_PHASE
        // If we're in the early phase, we're testing against the last frame's
        // depth buffer, so we need to use the previous frame's transform.
        let prev_world_pos = (previous_world_from_local * vec4(local_pos, 1.0)).xyz;
        let view_pos = position_world_to_prev_view(prev_world_pos);
        let ndc_pos = position_world_to_prev_ndc(prev_world_pos);
#else // EARLY_PHASE
        // Otherwise, if this is the late phase, we use the current frame's
        // transform.
        let world_pos = (world_from_local * vec4(local_pos, 1.0)).xyz;
        let view_pos = position_world_to_view(world_pos);
        let ndc_pos = position_world_to_ndc(world_pos);
#endif // EARLY_PHASE

        let uv_pos = ndc_to_uv(ndc_pos.xy);

        // Update the AABB and maximum view-space depth.
        aabb = vec4(min(aabb.xy, uv_pos), max(aabb.zw, uv_pos));
        max_depth_view = max(max_depth_view, view_pos.z);
    }

    // Clip to the near plane to avoid the NDC depth becoming negative.
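    //
    // For Bevy's reverse-Z projection, `clip_from_view[3][2]` presumably holds
    // the near-plane distance, so `-clip_from_view[3][2]` is the view-space Z
    // of the near plane (view space looks down -Z); the `min` below clamps the
    // mesh's nearest point to lie at or beyond that plane.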
#ifdef EARLY_PHASE
    max_depth_view = min(-previous_view_uniforms.clip_from_view[3][2], max_depth_view);
#else // EARLY_PHASE
    max_depth_view = min(-view.clip_from_view[3][2], max_depth_view);
#endif // EARLY_PHASE

    // Figure out the depth of the occluder, and compare it to our own depth.
    let aabb_pixel_size = occlusion_culling::get_aabb_size_in_pixels(aabb, depth_pyramid);
    let occluder_depth_ndc =
        occlusion_culling::get_occluder_depth(aabb, aabb_pixel_size, depth_pyramid);

#ifdef EARLY_PHASE
    let max_depth_ndc = prev_view_z_to_depth_ndc(max_depth_view);
#else // EARLY_PHASE
    let max_depth_ndc = view_z_to_depth_ndc(max_depth_view);
#endif

    // Are we culled out?
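    //
    // Under reverse Z, larger NDC depth means closer to the camera. If even
    // the nearest point of our AABB (`max_depth_ndc`) is farther away than the
    // most distant occluder depth over the AABB's footprint, no part of this
    // mesh can be visible.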
    if (max_depth_ndc < occluder_depth_ndc) {
#ifdef EARLY_PHASE
        // If this is the early phase, we need to make a note of this mesh so
        // that we examine it again in the late phase, handling the case in
        // which a mesh that was invisible last frame becomes visible this
        // frame.
        let output_work_item_index = atomicAdd(&late_preprocess_work_item_indirect_parameters[
            push_constants.late_preprocess_work_item_indirect_offset].work_item_count, 1u);
        if (output_work_item_index % 64u == 0u) {
            // Our workgroup size is 64, and the indirect parameters for the
            // late mesh preprocessing phase are counted in workgroups, so if
            // we're the first thread in this workgroup, bump the workgroup
            // count.
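            //
            // `atomicAdd` returns the value before the increment, so exactly
            // one thread observes each multiple of 64, and `dispatch_x` stays
            // equal to `ceil(work_item_count / 64)`.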
            atomicAdd(&late_preprocess_work_item_indirect_parameters[
                push_constants.late_preprocess_work_item_indirect_offset].dispatch_x, 1u);
        }

        // Enqueue a work item for the late mesh preprocessing phase.
        late_preprocess_work_items[output_work_item_index].input_index = input_index;
        late_preprocess_work_items[output_work_item_index].output_or_indirect_parameters_index =
            indirect_parameters_index;
#endif // EARLY_PHASE
        // This mesh is culled. Skip it.
        return;
    }
#endif // OCCLUSION_CULLING

    // Calculate the inverse transpose of the model matrix, which is what we
    // need to transform normals correctly.
    let local_from_world_transpose = transpose(maths::inverse_affine3(transpose(
        world_from_local_affine_transpose)));

    // Pack the inverse transpose.
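    //
    // Only nine floats of the 3x3 inverse transpose are significant, so we
    // store them in a `mat2x4` plus a single `f32` to keep the `Mesh`
    // structure small.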
    let local_from_world_transpose_a = mat2x4<f32>(
        vec4<f32>(local_from_world_transpose[0].xyz, local_from_world_transpose[1].x),
        vec4<f32>(local_from_world_transpose[1].yz, local_from_world_transpose[2].xy));
    let local_from_world_transpose_b = local_from_world_transpose[2].z;

    // Figure out the output index. In indirect mode, this involves bumping the
    // instance count in the indirect parameters metadata, which
    // `build_indirect_params.wgsl` will use to generate the actual indirect
    // parameters. Otherwise, this index was directly supplied to us.
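    //
    // By the time the late phase runs, the early instance count is final, so
    // late instances are appended directly after the early ones within this
    // batch's region of the output buffer.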
#ifdef INDIRECT
#ifdef LATE_PHASE
    let batch_output_index = atomicLoad(
        &indirect_parameters_gpu_metadata[indirect_parameters_index].early_instance_count
    ) + atomicAdd(
        &indirect_parameters_gpu_metadata[indirect_parameters_index].late_instance_count,
        1u
    );
#else // LATE_PHASE
    let batch_output_index = atomicAdd(
        &indirect_parameters_gpu_metadata[indirect_parameters_index].early_instance_count,
        1u
    );
#endif // LATE_PHASE

    let mesh_output_index =
        indirect_parameters_cpu_metadata[indirect_parameters_index].base_output_index +
        batch_output_index;
#endif // INDIRECT

    // Write the output.
    output[mesh_output_index].world_from_local = world_from_local_affine_transpose;
    output[mesh_output_index].previous_world_from_local =
        previous_world_from_local_affine_transpose;
    output[mesh_output_index].local_from_world_transpose_a = local_from_world_transpose_a;
    output[mesh_output_index].local_from_world_transpose_b = local_from_world_transpose_b;
    output[mesh_output_index].flags = current_input[input_index].flags;
    output[mesh_output_index].lightmap_uv_rect = current_input[input_index].lightmap_uv_rect;
    output[mesh_output_index].first_vertex_index = current_input[input_index].first_vertex_index;
    output[mesh_output_index].current_skin_index = current_input[input_index].current_skin_index;
    output[mesh_output_index].material_and_lightmap_bind_group_slot =
        current_input[input_index].material_and_lightmap_bind_group_slot;
    output[mesh_output_index].tag = current_input[input_index].tag;
}