Use bit ops instead of integer modulo and divide in shaders

This commit is contained in:
atlas dostal 2025-07-06 20:03:27 -04:00
parent 537adcc3f7
commit e29d0d573b
13 changed files with 43 additions and 36 deletions

View File

@ -42,8 +42,8 @@ fn downsample_depth_first(
@builtin(workgroup_id) workgroup_id: vec3u,
@builtin(local_invocation_index) local_invocation_index: u32,
) {
let sub_xy = remap_for_wave_reduction(local_invocation_index % 64u);
let x = sub_xy.x + 8u * ((local_invocation_index >> 6u) % 2u);
let sub_xy = remap_for_wave_reduction(local_invocation_index & 63u);
let x = sub_xy.x + 8u * ((local_invocation_index >> 6u) & 1u);
let y = sub_xy.y + 8u * (local_invocation_index >> 7u);
downsample_mips_0_and_1(x, y, workgroup_id.xy, local_invocation_index);
@ -54,8 +54,8 @@ fn downsample_depth_first(
@compute
@workgroup_size(256, 1, 1)
fn downsample_depth_second(@builtin(local_invocation_index) local_invocation_index: u32) {
let sub_xy = remap_for_wave_reduction(local_invocation_index % 64u);
let x = sub_xy.x + 8u * ((local_invocation_index >> 6u) % 2u);
let sub_xy = remap_for_wave_reduction(local_invocation_index & 63u);
let x = sub_xy.x + 8u * ((local_invocation_index >> 6u) & 1u);
let y = sub_xy.y + 8u * (local_invocation_index >> 7u);
downsample_mips_6_and_7(x, y);
@ -99,8 +99,8 @@ fn downsample_mips_0_and_1(x: u32, y: u32, workgroup_id: vec2u, local_invocation
intermediate_memory[x * 2u + 1u][y * 2u + 1u],
));
pix = (workgroup_id * 16u) + vec2(
x + (i % 2u) * 8u,
y + (i / 2u) * 8u,
x + (i & 1u) * 8u,
y + (i >> 1u) * 8u,
);
textureStore(mip_2, pix, vec4(v[i]));
}
@ -142,7 +142,7 @@ fn downsample_mip_2(x: u32, y: u32, workgroup_id: vec2u, local_invocation_index:
intermediate_memory[x * 2u + 1u][y * 2u + 1u],
));
textureStore(mip_3, (workgroup_id * 8u) + vec2(x, y), vec4(v));
intermediate_memory[x * 2u + y % 2u][y * 2u] = v;
intermediate_memory[x * 2u + (y & 1u)][y * 2u] = v;
}
}
@ -241,7 +241,7 @@ fn downsample_mip_8(x: u32, y: u32, local_invocation_index: u32) {
intermediate_memory[x * 2u + 1u][y * 2u + 1u],
));
textureStore(mip_9, vec2(x, y), vec4(v));
intermediate_memory[x * 2u + y % 2u][y * 2u] = v;
intermediate_memory[x * 2u + (y & 1u)][y * 2u] = v;
}
}

View File

@ -204,8 +204,8 @@ var<push_constant> constants: Constants;
// TODO: Load only twice, instead of 3x in cases where you load 3 indices per thread?
fn get_meshlet_vertex_id(index_id: u32) -> u32 {
let packed_index = meshlet_indices[index_id / 4u];
let bit_offset = (index_id % 4u) * 8u;
let packed_index = meshlet_indices[index_id >> 2u];
let bit_offset = (index_id & 3u) * 8u;
return extractBits(packed_index, bit_offset, 8u);
}
@ -219,7 +219,7 @@ fn get_meshlet_vertex_position(meshlet: ptr<function, Meshlet>, vertex_id: u32)
// Read each vertex channel from the bitstream
var vertex_position_packed = vec3(0u);
for (var i = 0u; i < 3u; i++) {
let lower_word_index = start_bit / 32u;
let lower_word_index = start_bit >> 5u;
let lower_word_bit_offset = start_bit & 31u;
var next_32_bits = meshlet_vertex_positions[lower_word_index] >> lower_word_bit_offset;
if lower_word_bit_offset + bits_per_channel[i] > 32u {
@ -256,8 +256,8 @@ fn get_meshlet_vertex_position(meshlet: ptr<function, Meshlet>, vertex_id: u32)
// TODO: Load only twice, instead of 3x in cases where you load 3 indices per thread?
fn get_meshlet_vertex_id(index_id: u32) -> u32 {
let packed_index = meshlet_indices[index_id / 4u];
let bit_offset = (index_id % 4u) * 8u;
let packed_index = meshlet_indices[index_id >> 2u];
let bit_offset = (index_id & 3u) * 8u;
return extractBits(packed_index, bit_offset, 8u);
}
@ -271,7 +271,7 @@ fn get_meshlet_vertex_position(meshlet: ptr<function, Meshlet>, vertex_id: u32)
// Read each vertex channel from the bitstream
var vertex_position_packed = vec3(0u);
for (var i = 0u; i < 3u; i++) {
let lower_word_index = start_bit / 32u;
let lower_word_index = start_bit >> 5u;
let lower_word_bit_offset = start_bit & 31u;
var next_32_bits = meshlet_vertex_positions[lower_word_index] >> lower_word_bit_offset;
if lower_word_bit_offset + bits_per_channel[i] > 32u {

View File

@ -8,8 +8,8 @@
@vertex
fn vertex(@builtin(vertex_index) vertex_input: u32) -> @builtin(position) vec4<f32> {
let vertex_index = vertex_input % 3u;
let material_id = vertex_input / 3u;
let vertex_index = vertex_input - material_id * 3u;
let material_depth = f32(material_id) / 65535.0;
let uv = vec2<f32>(vec2(vertex_index >> 1u, vertex_index & 1u)) * 2.0;
return vec4(uv_to_ndc(uv), material_depth, 1.0);

View File

@ -129,9 +129,9 @@ fn get_clusterable_object_id(index: u32) -> u32 {
// The index is correct but in clusterable_object_index_lists we pack 4 u8s into a u32
// This means the index into clusterable_object_index_lists is index / 4
let indices = bindings::clusterable_object_index_lists.data[index >> 4u][(index >> 2u) &
((1u << 2u) - 1u)];
3u];
// And index % 4 gives the sub-index of the u8 within the u32 so we shift by 8 * sub-index
return (indices >> (8u * (index & ((1u << 2u) - 1u)))) & ((1u << 8u) - 1u);
return (indices >> (8u * (index & 3u))) & 255u;
#endif
}
@ -151,7 +151,7 @@ fn cluster_debug_visualization(
var z_slice: u32 = view_z_to_z_slice(view_z, is_orthographic);
// A hack to make the colors alternate a bit more
if (z_slice & 1u) == 1u {
z_slice = z_slice + bindings::lights.cluster_dimensions.z / 2u;
z_slice = z_slice + (bindings::lights.cluster_dimensions.z >> 1u);
}
let slice_color_hsv = vec3(
f32(z_slice) / f32(bindings::lights.cluster_dimensions.z + 1u) * PI_2,

View File

@ -303,7 +303,7 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {
// this frame.
let output_work_item_index = atomicAdd(&late_preprocess_work_item_indirect_parameters[
push_constants.late_preprocess_work_item_indirect_offset].work_item_count, 1u);
if (output_work_item_index % 64u == 0u) {
if ((output_work_item_index & 63u) == 0u) {
// Our workgroup size is 64, and the indirect parameters for the
// late mesh preprocessing phase are counted in workgroups, so if
// we're the first thread in this workgroup, bump the workgroup

View File

@ -28,11 +28,11 @@ fn component_texture_coord(vertex_index: u32, component_offset: u32) -> vec2<u32
}
fn weight_at(weight_index: u32) -> f32 {
let i = weight_index;
return morph_weights.weights[i / 4u][i % 4u];
return morph_weights.weights[i >> 2u][i & 3u];
}
fn prev_weight_at(weight_index: u32) -> f32 {
let i = weight_index;
return prev_morph_weights.weights[i / 4u][i % 4u];
return prev_morph_weights.weights[i >> 2u][i & 3u];
}
fn morph_pixel(vertex: u32, component: u32, weight: u32) -> f32 {
let coord = component_texture_coord(vertex, component);

View File

@ -85,7 +85,7 @@ fn visibility_range_dither(frag_coord: vec4<f32>, dither: i32) {
}
// Otherwise, check the dither pattern.
let coords = vec2<u32>(floor(frag_coord.xy)) % 4u;
let coords = vec2<u32>(floor(frag_coord.xy)) & vec2<u32>(3u);
let threshold = i32((DITHER_THRESHOLD_MAP[coords.y] >> (coords.x * 8)) & 0xff);
if ((dither >= 0 && dither + threshold >= 16) || (dither < 0 && 1 + dither + threshold <= 0)) {
discard;

View File

@ -112,10 +112,10 @@ fn fetch_transmissive_background(offset_position: vec2<f32>, frag_coord: vec3<f3
let pixel_checkboard = (
#ifdef TEMPORAL_JITTER
// 0 or 1 on even/odd pixels, alternates every frame
(i32(frag_coord.x) + i32(frag_coord.y) + i32(view_bindings::globals.frame_count)) % 2
(i32(frag_coord.x) + i32(frag_coord.y) + i32(view_bindings::globals.frame_count)) & 1
#else
// 0 or 1 on even/odd pixels
(i32(frag_coord.x) + i32(frag_coord.y)) % 2
(i32(frag_coord.x) + i32(frag_coord.y)) & 1
#endif
);

View File

@ -70,7 +70,7 @@ fn octahedral_decode_signed(v: vec2<f32>) -> vec3<f32> {
// https://blog.demofox.org/2022/01/01/interleaved-gradient-noise-a-different-kind-of-low-discrepancy-sequence
fn interleaved_gradient_noise(pixel_coordinates: vec2<f32>, frame: u32) -> f32 {
let xy = pixel_coordinates + 5.588238 * f32(frame % 64u);
let xy = pixel_coordinates + 5.588238 * f32(frame & 63u);
return fract(52.9829189 * fract(0.06711056 * xy.x + 0.00583715 * xy.y));
}

View File

@ -65,38 +65,38 @@ fn preprocess_depth(@builtin(global_invocation_id) global_id: vec3<u32>, @builti
workgroupBarrier();
// MIP 2 - Weighted average of MIP 1's depth values (per invocation, 4x4 invocations per workgroup)
if all(local_id.xy % vec2<u32>(2u) == vec2<u32>(0u)) {
if all((local_id.xy & vec2<u32>(1u)) == vec2<u32>(0u)) {
let depth0 = previous_mip_depth[local_id.x + 0u][local_id.y + 0u];
let depth1 = previous_mip_depth[local_id.x + 1u][local_id.y + 0u];
let depth2 = previous_mip_depth[local_id.x + 0u][local_id.y + 1u];
let depth3 = previous_mip_depth[local_id.x + 1u][local_id.y + 1u];
let depth_mip2 = weighted_average(depth0, depth1, depth2, depth3);
textureStore(preprocessed_depth_mip2, base_coordinates / 2i, vec4<f32>(depth_mip2, 0.0, 0.0, 0.0));
textureStore(preprocessed_depth_mip2, base_coordinates >> vec2<u32>(1u), vec4<f32>(depth_mip2, 0.0, 0.0, 0.0));
previous_mip_depth[local_id.x][local_id.y] = depth_mip2;
}
workgroupBarrier();
// MIP 3 - Weighted average of MIP 2's depth values (per invocation, 2x2 invocations per workgroup)
if all(local_id.xy % vec2<u32>(4u) == vec2<u32>(0u)) {
if all((local_id.xy & vec2<u32>(3u)) == vec2<u32>(0u)) {
let depth0 = previous_mip_depth[local_id.x + 0u][local_id.y + 0u];
let depth1 = previous_mip_depth[local_id.x + 2u][local_id.y + 0u];
let depth2 = previous_mip_depth[local_id.x + 0u][local_id.y + 2u];
let depth3 = previous_mip_depth[local_id.x + 2u][local_id.y + 2u];
let depth_mip3 = weighted_average(depth0, depth1, depth2, depth3);
textureStore(preprocessed_depth_mip3, base_coordinates / 4i, vec4<f32>(depth_mip3, 0.0, 0.0, 0.0));
textureStore(preprocessed_depth_mip3, base_coordinates >> vec2<u32>(2u), vec4<f32>(depth_mip3, 0.0, 0.0, 0.0));
previous_mip_depth[local_id.x][local_id.y] = depth_mip3;
}
workgroupBarrier();
// MIP 4 - Weighted average of MIP 3's depth values (per invocation, 1 invocation per workgroup)
if all(local_id.xy % vec2<u32>(8u) == vec2<u32>(0u)) {
if all((local_id.xy & vec2<u32>(7u)) == vec2<u32>(0u)) {
let depth0 = previous_mip_depth[local_id.x + 0u][local_id.y + 0u];
let depth1 = previous_mip_depth[local_id.x + 4u][local_id.y + 0u];
let depth2 = previous_mip_depth[local_id.x + 0u][local_id.y + 4u];
let depth3 = previous_mip_depth[local_id.x + 4u][local_id.y + 4u];
let depth_mip4 = weighted_average(depth0, depth1, depth2, depth3);
textureStore(preprocessed_depth_mip4, base_coordinates / 8i, vec4<f32>(depth_mip4, 0.0, 0.0, 0.0));
textureStore(preprocessed_depth_mip4, base_coordinates >> vec2<u32>(3u), vec4<f32>(depth_mip4, 0.0, 0.0, 0.0));
}
}

View File

@ -30,10 +30,10 @@
@group(1) @binding(2) var<uniform> view: View;
fn load_noise(pixel_coordinates: vec2<i32>) -> vec2<f32> {
var index = textureLoad(hilbert_index_lut, pixel_coordinates % 64, 0).r;
var index = textureLoad(hilbert_index_lut, pixel_coordinates & vec2<i32>(63), 0).r;
#ifdef TEMPORAL_JITTER
index += 288u * (globals.frame_count % 64u);
index += 288u * (globals.frame_count & 63u);
#endif
// R2 sequence - http://extremelearning.com.au/unreasonable-effectiveness-of-quasirandom-sequences

View File

@ -103,6 +103,13 @@ fn henyey_greenstein(neg_LdotV: f32) -> f32 {
return FRAC_4_PI * (1.0 - g * g) / (denom * sqrt(denom));
}
// Wraps `index` into [0, 3) with a compare-and-subtract instead of an
// integer modulo (`index % 3`), which is comparatively expensive in shaders.
// NOTE(review): only correct for `index` in [0, 5]; the callers below pass
// `plane_index + 1` / `plane_index + 2` with `plane_index` in [0, 2], which
// satisfies that precondition.
fn simple_wrap_3(index: i32) -> i32 {
if (index >= 3) {
return index - 3;
}
return index;
}
@fragment
fn fragment(@builtin(position) position: vec4<f32>) -> @location(0) vec4<f32> {
// Unpack the `volumetric_fog` settings.
@ -140,8 +147,8 @@ fn fragment(@builtin(position) position: vec4<f32>) -> @location(0) vec4<f32> {
var end_depth_view = 0.0;
for (var plane_index = 0; plane_index < 3; plane_index += 1) {
let plane = volumetric_fog.far_planes[plane_index];
let other_plane_a = volumetric_fog.far_planes[(plane_index + 1) % 3];
let other_plane_b = volumetric_fog.far_planes[(plane_index + 2) % 3];
let other_plane_a = volumetric_fog.far_planes[simple_wrap_3(plane_index + 1)];
let other_plane_b = volumetric_fog.far_planes[simple_wrap_3(plane_index + 2)];
// Calculate the intersection of the ray and the plane. The ray must
// intersect in front of us (t > 0).

View File

@ -32,7 +32,7 @@ fn vertex(vertex: Vertex) -> VertexOutput {
out.position = mesh_functions::mesh2d_position_world_to_clip(world_position);
out.uv = vertex.uv;
out.tile_index = vertex.vertex_index / 4u;
out.tile_index = vertex.vertex_index >> 2u;
return out;
}