Use bit ops instead of integer modulo and divide in shaders

This commit is contained in:
atlas dostal 2025-07-06 20:03:27 -04:00
parent 537adcc3f7
commit e29d0d573b
13 changed files with 43 additions and 36 deletions

View File

@ -42,8 +42,8 @@ fn downsample_depth_first(
@builtin(workgroup_id) workgroup_id: vec3u,
@builtin(local_invocation_index) local_invocation_index: u32,
) {
let sub_xy = remap_for_wave_reduction(local_invocation_index % 64u);
let x = sub_xy.x + 8u * ((local_invocation_index >> 6u) % 2u);
let sub_xy = remap_for_wave_reduction(local_invocation_index & 63u);
let x = sub_xy.x + 8u * ((local_invocation_index >> 6u) & 1u);
let y = sub_xy.y + 8u * (local_invocation_index >> 7u);
downsample_mips_0_and_1(x, y, workgroup_id.xy, local_invocation_index);
@ -54,8 +54,8 @@ fn downsample_depth_first(
@compute
@workgroup_size(256, 1, 1)
fn downsample_depth_second(@builtin(local_invocation_index) local_invocation_index: u32) {
let sub_xy = remap_for_wave_reduction(local_invocation_index % 64u);
let x = sub_xy.x + 8u * ((local_invocation_index >> 6u) % 2u);
let sub_xy = remap_for_wave_reduction(local_invocation_index & 63u);
let x = sub_xy.x + 8u * ((local_invocation_index >> 6u) & 1u);
let y = sub_xy.y + 8u * (local_invocation_index >> 7u);
downsample_mips_6_and_7(x, y);
@ -99,8 +99,8 @@ fn downsample_mips_0_and_1(x: u32, y: u32, workgroup_id: vec2u, local_invocation
intermediate_memory[x * 2u + 1u][y * 2u + 1u],
));
pix = (workgroup_id * 16u) + vec2(
x + (i % 2u) * 8u,
y + (i / 2u) * 8u,
x + (i & 1u) * 8u,
y + (i >> 1u) * 8u,
);
textureStore(mip_2, pix, vec4(v[i]));
}
@ -142,7 +142,7 @@ fn downsample_mip_2(x: u32, y: u32, workgroup_id: vec2u, local_invocation_index:
intermediate_memory[x * 2u + 1u][y * 2u + 1u],
));
textureStore(mip_3, (workgroup_id * 8u) + vec2(x, y), vec4(v));
intermediate_memory[x * 2u + y % 2u][y * 2u] = v;
intermediate_memory[x * 2u + (y & 1u)][y * 2u] = v;
}
}
@ -241,7 +241,7 @@ fn downsample_mip_8(x: u32, y: u32, local_invocation_index: u32) {
intermediate_memory[x * 2u + 1u][y * 2u + 1u],
));
textureStore(mip_9, vec2(x, y), vec4(v));
intermediate_memory[x * 2u + y % 2u][y * 2u] = v;
intermediate_memory[x * 2u + (y & 1u)][y * 2u] = v;
}
}

View File

@ -204,8 +204,8 @@ var<push_constant> constants: Constants;
// TODO: Load only twice, instead of 3x in cases where you load 3 indices per thread?
fn get_meshlet_vertex_id(index_id: u32) -> u32 {
let packed_index = meshlet_indices[index_id / 4u];
let bit_offset = (index_id % 4u) * 8u;
let packed_index = meshlet_indices[index_id >> 2u];
let bit_offset = (index_id & 3u) * 8u;
return extractBits(packed_index, bit_offset, 8u);
}
@ -219,7 +219,7 @@ fn get_meshlet_vertex_position(meshlet: ptr<function, Meshlet>, vertex_id: u32)
// Read each vertex channel from the bitstream
var vertex_position_packed = vec3(0u);
for (var i = 0u; i < 3u; i++) {
let lower_word_index = start_bit / 32u;
let lower_word_index = start_bit >> 5u;
let lower_word_bit_offset = start_bit & 31u;
var next_32_bits = meshlet_vertex_positions[lower_word_index] >> lower_word_bit_offset;
if lower_word_bit_offset + bits_per_channel[i] > 32u {
@ -256,8 +256,8 @@ fn get_meshlet_vertex_position(meshlet: ptr<function, Meshlet>, vertex_id: u32)
// TODO: Load only twice, instead of 3x in cases where you load 3 indices per thread?
fn get_meshlet_vertex_id(index_id: u32) -> u32 {
let packed_index = meshlet_indices[index_id / 4u];
let bit_offset = (index_id % 4u) * 8u;
let packed_index = meshlet_indices[index_id >> 2u];
let bit_offset = (index_id & 3u) * 8u;
return extractBits(packed_index, bit_offset, 8u);
}
@ -271,7 +271,7 @@ fn get_meshlet_vertex_position(meshlet: ptr<function, Meshlet>, vertex_id: u32)
// Read each vertex channel from the bitstream
var vertex_position_packed = vec3(0u);
for (var i = 0u; i < 3u; i++) {
let lower_word_index = start_bit / 32u;
let lower_word_index = start_bit >> 5u;
let lower_word_bit_offset = start_bit & 31u;
var next_32_bits = meshlet_vertex_positions[lower_word_index] >> lower_word_bit_offset;
if lower_word_bit_offset + bits_per_channel[i] > 32u {

View File

@ -8,8 +8,8 @@
@vertex
fn vertex(@builtin(vertex_index) vertex_input: u32) -> @builtin(position) vec4<f32> {
let vertex_index = vertex_input % 3u;
let material_id = vertex_input / 3u;
let vertex_index = vertex_input - material_id * 3u;
let material_depth = f32(material_id) / 65535.0;
let uv = vec2<f32>(vec2(vertex_index >> 1u, vertex_index & 1u)) * 2.0;
return vec4(uv_to_ndc(uv), material_depth, 1.0);

View File

@ -129,9 +129,9 @@ fn get_clusterable_object_id(index: u32) -> u32 {
// The index is correct but in clusterable_object_index_lists we pack 4 u8s into a u32
// This means the index into clusterable_object_index_lists is index / 4
let indices = bindings::clusterable_object_index_lists.data[index >> 4u][(index >> 2u) &
((1u << 2u) - 1u)];
3u];
// And index % 4 gives the sub-index of the u8 within the u32 so we shift by 8 * sub-index
return (indices >> (8u * (index & ((1u << 2u) - 1u)))) & ((1u << 8u) - 1u);
return (indices >> (8u * (index & 3u))) & 255u;
#endif
}
@ -151,7 +151,7 @@ fn cluster_debug_visualization(
var z_slice: u32 = view_z_to_z_slice(view_z, is_orthographic);
// A hack to make the colors alternate a bit more
if (z_slice & 1u) == 1u {
z_slice = z_slice + bindings::lights.cluster_dimensions.z / 2u;
z_slice = z_slice + (bindings::lights.cluster_dimensions.z >> 1u);
}
let slice_color_hsv = vec3(
f32(z_slice) / f32(bindings::lights.cluster_dimensions.z + 1u) * PI_2,

View File

@ -303,7 +303,7 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {
// this frame.
let output_work_item_index = atomicAdd(&late_preprocess_work_item_indirect_parameters[
push_constants.late_preprocess_work_item_indirect_offset].work_item_count, 1u);
if (output_work_item_index % 64u == 0u) {
if ((output_work_item_index & 63u) == 0u) {
// Our workgroup size is 64, and the indirect parameters for the
// late mesh preprocessing phase are counted in workgroups, so if
// we're the first thread in this workgroup, bump the workgroup

View File

@ -28,11 +28,11 @@ fn component_texture_coord(vertex_index: u32, component_offset: u32) -> vec2<u32
}
fn weight_at(weight_index: u32) -> f32 {
let i = weight_index;
return morph_weights.weights[i / 4u][i % 4u];
return morph_weights.weights[i >> 2u][i & 3u];
}
fn prev_weight_at(weight_index: u32) -> f32 {
let i = weight_index;
return prev_morph_weights.weights[i / 4u][i % 4u];
return prev_morph_weights.weights[i >> 2u][i & 3u];
}
fn morph_pixel(vertex: u32, component: u32, weight: u32) -> f32 {
let coord = component_texture_coord(vertex, component);

View File

@ -85,7 +85,7 @@ fn visibility_range_dither(frag_coord: vec4<f32>, dither: i32) {
}
// Otherwise, check the dither pattern.
let coords = vec2<u32>(floor(frag_coord.xy)) % 4u;
let coords = vec2<u32>(floor(frag_coord.xy)) & vec2<u32>(3u);
let threshold = i32((DITHER_THRESHOLD_MAP[coords.y] >> (coords.x * 8)) & 0xff);
if ((dither >= 0 && dither + threshold >= 16) || (dither < 0 && 1 + dither + threshold <= 0)) {
discard;

View File

@ -112,10 +112,10 @@ fn fetch_transmissive_background(offset_position: vec2<f32>, frag_coord: vec3<f3
let pixel_checkboard = (
#ifdef TEMPORAL_JITTER
// 0 or 1 on even/odd pixels, alternates every frame
(i32(frag_coord.x) + i32(frag_coord.y) + i32(view_bindings::globals.frame_count)) % 2
(i32(frag_coord.x) + i32(frag_coord.y) + i32(view_bindings::globals.frame_count)) & 1
#else
// 0 or 1 on even/odd pixels
(i32(frag_coord.x) + i32(frag_coord.y)) % 2
(i32(frag_coord.x) + i32(frag_coord.y)) & 1
#endif
);

View File

@ -70,7 +70,7 @@ fn octahedral_decode_signed(v: vec2<f32>) -> vec3<f32> {
// https://blog.demofox.org/2022/01/01/interleaved-gradient-noise-a-different-kind-of-low-discrepancy-sequence
fn interleaved_gradient_noise(pixel_coordinates: vec2<f32>, frame: u32) -> f32 {
let xy = pixel_coordinates + 5.588238 * f32(frame % 64u);
let xy = pixel_coordinates + 5.588238 * f32(frame & 63u);
return fract(52.9829189 * fract(0.06711056 * xy.x + 0.00583715 * xy.y));
}

View File

@ -65,38 +65,38 @@ fn preprocess_depth(@builtin(global_invocation_id) global_id: vec3<u32>, @builti
workgroupBarrier();
// MIP 2 - Weighted average of MIP 1's depth values (per invocation, 4x4 invocations per workgroup)
if all(local_id.xy % vec2<u32>(2u) == vec2<u32>(0u)) {
if all((local_id.xy & vec2<u32>(1u)) == vec2<u32>(0u)) {
let depth0 = previous_mip_depth[local_id.x + 0u][local_id.y + 0u];
let depth1 = previous_mip_depth[local_id.x + 1u][local_id.y + 0u];
let depth2 = previous_mip_depth[local_id.x + 0u][local_id.y + 1u];
let depth3 = previous_mip_depth[local_id.x + 1u][local_id.y + 1u];
let depth_mip2 = weighted_average(depth0, depth1, depth2, depth3);
textureStore(preprocessed_depth_mip2, base_coordinates / 2i, vec4<f32>(depth_mip2, 0.0, 0.0, 0.0));
textureStore(preprocessed_depth_mip2, base_coordinates >> vec2<u32>(1u), vec4<f32>(depth_mip2, 0.0, 0.0, 0.0));
previous_mip_depth[local_id.x][local_id.y] = depth_mip2;
}
workgroupBarrier();
// MIP 3 - Weighted average of MIP 2's depth values (per invocation, 2x2 invocations per workgroup)
if all(local_id.xy % vec2<u32>(4u) == vec2<u32>(0u)) {
if all((local_id.xy & vec2<u32>(3u)) == vec2<u32>(0u)) {
let depth0 = previous_mip_depth[local_id.x + 0u][local_id.y + 0u];
let depth1 = previous_mip_depth[local_id.x + 2u][local_id.y + 0u];
let depth2 = previous_mip_depth[local_id.x + 0u][local_id.y + 2u];
let depth3 = previous_mip_depth[local_id.x + 2u][local_id.y + 2u];
let depth_mip3 = weighted_average(depth0, depth1, depth2, depth3);
textureStore(preprocessed_depth_mip3, base_coordinates / 4i, vec4<f32>(depth_mip3, 0.0, 0.0, 0.0));
textureStore(preprocessed_depth_mip3, base_coordinates >> vec2<u32>(2u), vec4<f32>(depth_mip3, 0.0, 0.0, 0.0));
previous_mip_depth[local_id.x][local_id.y] = depth_mip3;
}
workgroupBarrier();
// MIP 4 - Weighted average of MIP 3's depth values (per invocation, 1 invocation per workgroup)
if all(local_id.xy % vec2<u32>(8u) == vec2<u32>(0u)) {
if all((local_id.xy & vec2<u32>(7u)) == vec2<u32>(0u)) {
let depth0 = previous_mip_depth[local_id.x + 0u][local_id.y + 0u];
let depth1 = previous_mip_depth[local_id.x + 4u][local_id.y + 0u];
let depth2 = previous_mip_depth[local_id.x + 0u][local_id.y + 4u];
let depth3 = previous_mip_depth[local_id.x + 4u][local_id.y + 4u];
let depth_mip4 = weighted_average(depth0, depth1, depth2, depth3);
textureStore(preprocessed_depth_mip4, base_coordinates / 8i, vec4<f32>(depth_mip4, 0.0, 0.0, 0.0));
textureStore(preprocessed_depth_mip4, base_coordinates >> vec2<u32>(3u), vec4<f32>(depth_mip4, 0.0, 0.0, 0.0));
}
}

View File

@ -30,10 +30,10 @@
@group(1) @binding(2) var<uniform> view: View;
fn load_noise(pixel_coordinates: vec2<i32>) -> vec2<f32> {
var index = textureLoad(hilbert_index_lut, pixel_coordinates % 64, 0).r;
var index = textureLoad(hilbert_index_lut, pixel_coordinates & vec2<i32>(63), 0).r;
#ifdef TEMPORAL_JITTER
index += 288u * (globals.frame_count % 64u);
index += 288u * (globals.frame_count & 63u);
#endif
// R2 sequence - http://extremelearning.com.au/unreasonable-effectiveness-of-quasirandom-sequences

View File

@ -103,6 +103,13 @@ fn henyey_greenstein(neg_LdotV: f32) -> f32 {
return FRAC_4_PI * (1.0 - g * g) / (denom * sqrt(denom));
}
// Wraps `index` into [0, 3) with a compare-and-subtract instead of an
// integer modulo (`index % 3`), which is comparatively expensive in shaders.
// NOTE(review): only correct for `index` in [0, 5]; the callers below pass
// `plane_index + 1` / `plane_index + 2` with `plane_index` in [0, 2], which
// satisfies that precondition.
fn simple_wrap_3(index: i32) -> i32 {
if (index >= 3) {
return index - 3;
}
return index;
}
@fragment
fn fragment(@builtin(position) position: vec4<f32>) -> @location(0) vec4<f32> {
// Unpack the `volumetric_fog` settings.
@ -140,8 +147,8 @@ fn fragment(@builtin(position) position: vec4<f32>) -> @location(0) vec4<f32> {
var end_depth_view = 0.0;
for (var plane_index = 0; plane_index < 3; plane_index += 1) {
let plane = volumetric_fog.far_planes[plane_index];
let other_plane_a = volumetric_fog.far_planes[(plane_index + 1) % 3];
let other_plane_b = volumetric_fog.far_planes[(plane_index + 2) % 3];
let other_plane_a = volumetric_fog.far_planes[simple_wrap_3(plane_index + 1)];
let other_plane_b = volumetric_fog.far_planes[simple_wrap_3(plane_index + 2)];
// Calculate the intersection of the ray and the plane. The ray must
// intersect in front of us (t > 0).

View File

@ -32,7 +32,7 @@ fn vertex(vertex: Vertex) -> VertexOutput {
out.position = mesh_functions::mesh2d_position_world_to_clip(world_position);
out.uv = vertex.uv;
out.tile_index = vertex.vertex_index / 4u;
out.tile_index = vertex.vertex_index >> 2u;
return out;
}