From e29d0d573bd5cf1bf8068735a0d7b3a969de9145 Mon Sep 17 00:00:00 2001
From: atlas dostal <rodol@rivalrebels.com>
Date: Sun, 6 Jul 2025 20:03:27 -0400
Subject: [PATCH] Use bit ops instead of integer modulo and divide in shaders

---
 .../mip_generation/downsample_depth.wgsl         | 16 ++++++++--------
 .../bevy_pbr/src/meshlet/meshlet_bindings.wgsl   | 12 ++++++------
 .../src/meshlet/meshlet_mesh_material.wgsl       |  2 +-
 .../bevy_pbr/src/render/clustered_forward.wgsl   |  6 +++---
 crates/bevy_pbr/src/render/mesh_preprocess.wgsl  |  2 +-
 crates/bevy_pbr/src/render/morph.wgsl            |  4 ++--
 crates/bevy_pbr/src/render/pbr_functions.wgsl    |  2 +-
 crates/bevy_pbr/src/render/pbr_transmission.wgsl |  4 ++--
 crates/bevy_pbr/src/render/utils.wgsl            |  2 +-
 crates/bevy_pbr/src/ssao/preprocess_depth.wgsl   | 12 ++++++------
 crates/bevy_pbr/src/ssao/ssao.wgsl               |  4 ++--
 .../src/volumetric_fog/volumetric_fog.wgsl       | 11 +++++++++--
 .../tilemap_chunk/tilemap_chunk_material.wgsl    |  2 +-
 13 files changed, 43 insertions(+), 36 deletions(-)

diff --git a/crates/bevy_core_pipeline/src/experimental/mip_generation/downsample_depth.wgsl b/crates/bevy_core_pipeline/src/experimental/mip_generation/downsample_depth.wgsl
index 12a4d2b178bff..79104f4f93ab0 100644
--- a/crates/bevy_core_pipeline/src/experimental/mip_generation/downsample_depth.wgsl
+++ b/crates/bevy_core_pipeline/src/experimental/mip_generation/downsample_depth.wgsl
@@ -42,8 +42,8 @@ fn downsample_depth_first(
     @builtin(workgroup_id) workgroup_id: vec3u,
     @builtin(local_invocation_index) local_invocation_index: u32,
 ) {
-    let sub_xy = remap_for_wave_reduction(local_invocation_index % 64u);
-    let x = sub_xy.x + 8u * ((local_invocation_index >> 6u) % 2u);
+    let sub_xy = remap_for_wave_reduction(local_invocation_index & 63u);
+    let x = sub_xy.x + 8u * ((local_invocation_index >> 6u) & 1u);
     let y = sub_xy.y + 8u * (local_invocation_index >> 7u);
 
     downsample_mips_0_and_1(x, y, workgroup_id.xy, local_invocation_index);
@@ -54,8 +54,8 @@ fn downsample_depth_first(
 @compute
 @workgroup_size(256, 1, 1)
 fn downsample_depth_second(@builtin(local_invocation_index) local_invocation_index: u32) {
-    let sub_xy = remap_for_wave_reduction(local_invocation_index % 64u);
-    let x = sub_xy.x + 8u * ((local_invocation_index >> 6u) % 2u);
+    let sub_xy = remap_for_wave_reduction(local_invocation_index & 63u);
+    let x = sub_xy.x + 8u * ((local_invocation_index >> 6u) & 1u);
     let y = sub_xy.y + 8u * (local_invocation_index >> 7u);
 
     downsample_mips_6_and_7(x, y);
@@ -99,8 +99,8 @@ fn downsample_mips_0_and_1(x: u32, y: u32, workgroup_id: vec2u, local_invocation
                 intermediate_memory[x * 2u + 1u][y * 2u + 1u],
             ));
             pix = (workgroup_id * 16u) + vec2(
-                x + (i % 2u) * 8u,
-                y + (i / 2u) * 8u,
+                x + (i & 1u) * 8u,
+                y + (i >> 1u) * 8u,
             );
             textureStore(mip_2, pix, vec4(v[i]));
         }
@@ -142,7 +142,7 @@ fn downsample_mip_2(x: u32, y: u32, workgroup_id: vec2u, local_invocation_index:
             intermediate_memory[x * 2u + 1u][y * 2u + 1u],
         ));
         textureStore(mip_3, (workgroup_id * 8u) + vec2(x, y), vec4(v));
-        intermediate_memory[x * 2u + y % 2u][y * 2u] = v;
+        intermediate_memory[x * 2u + (y & 1u)][y * 2u] = v;
     }
 }
 
@@ -241,7 +241,7 @@ fn downsample_mip_8(x: u32, y: u32, local_invocation_index: u32) {
             intermediate_memory[x * 2u + 1u][y * 2u + 1u],
         ));
         textureStore(mip_9, vec2(x, y), vec4(v));
-        intermediate_memory[x * 2u + y % 2u][y * 2u] = v;
+        intermediate_memory[x * 2u + (y & 1u)][y * 2u] = v;
     }
 }
 
diff --git a/crates/bevy_pbr/src/meshlet/meshlet_bindings.wgsl b/crates/bevy_pbr/src/meshlet/meshlet_bindings.wgsl
index 4533b2bd7fa7e..e91314dd11fce 100644
--- a/crates/bevy_pbr/src/meshlet/meshlet_bindings.wgsl
+++ b/crates/bevy_pbr/src/meshlet/meshlet_bindings.wgsl
@@ -204,8 +204,8 @@ var<push_constant> constants: Constants;
 
 // TODO: Load only twice, instead of 3x in cases where you load 3 indices per thread?
 fn get_meshlet_vertex_id(index_id: u32) -> u32 {
-    let packed_index = meshlet_indices[index_id / 4u];
-    let bit_offset = (index_id % 4u) * 8u;
+    let packed_index = meshlet_indices[index_id >> 2u];
+    let bit_offset = (index_id & 3u) * 8u;
     return extractBits(packed_index, bit_offset, 8u);
 }
 
@@ -219,7 +219,7 @@ fn get_meshlet_vertex_position(meshlet: ptr<function, Meshlet>, vertex_id: u32)
     // Read each vertex channel from the bitstream
     var vertex_position_packed = vec3(0u);
     for (var i = 0u; i < 3u; i++) {
-        let lower_word_index = start_bit / 32u;
+        let lower_word_index = start_bit >> 5u;
         let lower_word_bit_offset = start_bit & 31u;
         var next_32_bits = meshlet_vertex_positions[lower_word_index] >> lower_word_bit_offset;
         if lower_word_bit_offset + bits_per_channel[i] > 32u {
@@ -256,8 +256,8 @@ fn get_meshlet_vertex_position(meshlet: ptr<function, Meshlet>, vertex_id: u32)
 
 // TODO: Load only twice, instead of 3x in cases where you load 3 indices per thread?
 fn get_meshlet_vertex_id(index_id: u32) -> u32 {
-    let packed_index = meshlet_indices[index_id / 4u];
-    let bit_offset = (index_id % 4u) * 8u;
+    let packed_index = meshlet_indices[index_id >> 2u];
+    let bit_offset = (index_id & 3u) * 8u;
     return extractBits(packed_index, bit_offset, 8u);
 }
 
@@ -271,7 +271,7 @@ fn get_meshlet_vertex_position(meshlet: ptr<function, Meshlet>, vertex_id: u32)
     // Read each vertex channel from the bitstream
     var vertex_position_packed = vec3(0u);
     for (var i = 0u; i < 3u; i++) {
-        let lower_word_index = start_bit / 32u;
+        let lower_word_index = start_bit >> 5u;
         let lower_word_bit_offset = start_bit & 31u;
         var next_32_bits = meshlet_vertex_positions[lower_word_index] >> lower_word_bit_offset;
         if lower_word_bit_offset + bits_per_channel[i] > 32u {
diff --git a/crates/bevy_pbr/src/meshlet/meshlet_mesh_material.wgsl b/crates/bevy_pbr/src/meshlet/meshlet_mesh_material.wgsl
index 4b114cbfc8fab..e574b64c717f4 100644
--- a/crates/bevy_pbr/src/meshlet/meshlet_mesh_material.wgsl
+++ b/crates/bevy_pbr/src/meshlet/meshlet_mesh_material.wgsl
@@ -8,8 +8,8 @@
 
 @vertex
 fn vertex(@builtin(vertex_index) vertex_input: u32) -> @builtin(position) vec4<f32> {
-    let vertex_index = vertex_input % 3u;
     let material_id = vertex_input / 3u;
+    let vertex_index = vertex_input - material_id * 3u;
     let material_depth = f32(material_id) / 65535.0;
     let uv = vec2<f32>(vec2(vertex_index >> 1u, vertex_index & 1u)) * 2.0;
     return vec4(uv_to_ndc(uv), material_depth, 1.0);
diff --git a/crates/bevy_pbr/src/render/clustered_forward.wgsl b/crates/bevy_pbr/src/render/clustered_forward.wgsl
index aa3fb4f199b1f..15a8050b1f10a 100644
--- a/crates/bevy_pbr/src/render/clustered_forward.wgsl
+++ b/crates/bevy_pbr/src/render/clustered_forward.wgsl
@@ -129,9 +129,9 @@ fn get_clusterable_object_id(index: u32) -> u32 {
     // The index is correct but in clusterable_object_index_lists we pack 4 u8s into a u32
     // This means the index into clusterable_object_index_lists is index / 4
     let indices = bindings::clusterable_object_index_lists.data[index >> 4u][(index >> 2u) &
-        ((1u << 2u) - 1u)];
+        3u];
     // And index % 4 gives the sub-index of the u8 within the u32 so we shift by 8 * sub-index
-    return (indices >> (8u * (index & ((1u << 2u) - 1u)))) & ((1u << 8u) - 1u);
+    return (indices >> (8u * (index & 3u))) & 255u;
 #endif
 }
 
@@ -151,7 +151,7 @@ fn cluster_debug_visualization(
     var z_slice: u32 = view_z_to_z_slice(view_z, is_orthographic);
     // A hack to make the colors alternate a bit more
     if (z_slice & 1u) == 1u {
-        z_slice = z_slice + bindings::lights.cluster_dimensions.z / 2u;
+        z_slice = z_slice + bindings::lights.cluster_dimensions.z >> 1u;
     }
     let slice_color_hsv = vec3(
         f32(z_slice) / f32(bindings::lights.cluster_dimensions.z + 1u) * PI_2,
diff --git a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl
index 543b328aaafd5..eafd7a058c921 100644
--- a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl
+++ b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl
@@ -303,7 +303,7 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {
         // this frame.
         let output_work_item_index = atomicAdd(&late_preprocess_work_item_indirect_parameters[
             push_constants.late_preprocess_work_item_indirect_offset].work_item_count, 1u);
-        if (output_work_item_index % 64u == 0u) {
+        if ((output_work_item_index & 63u) == 0u) {
             // Our workgroup size is 64, and the indirect parameters for the
             // late mesh preprocessing phase are counted in workgroups, so if
             // we're the first thread in this workgroup, bump the workgroup
diff --git a/crates/bevy_pbr/src/render/morph.wgsl b/crates/bevy_pbr/src/render/morph.wgsl
index 6689d68cc6415..bdb735946b901 100644
--- a/crates/bevy_pbr/src/render/morph.wgsl
+++ b/crates/bevy_pbr/src/render/morph.wgsl
@@ -28,11 +28,11 @@ fn component_texture_coord(vertex_index: u32, component_offset: u32) -> vec2<u32
 }
 fn weight_at(weight_index: u32) -> f32 {
     let i = weight_index;
-    return morph_weights.weights[i / 4u][i % 4u];
+    return morph_weights.weights[i >> 2u][i & 3u];
 }
 fn prev_weight_at(weight_index: u32) -> f32 {
     let i = weight_index;
-    return prev_morph_weights.weights[i / 4u][i % 4u];
+    return prev_morph_weights.weights[i >> 2u][i & 3u];
 }
 fn morph_pixel(vertex: u32, component: u32, weight: u32) -> f32 {
     let coord = component_texture_coord(vertex, component);
diff --git a/crates/bevy_pbr/src/render/pbr_functions.wgsl b/crates/bevy_pbr/src/render/pbr_functions.wgsl
index 84f7b956616c4..b8ad64722e4b9 100644
--- a/crates/bevy_pbr/src/render/pbr_functions.wgsl
+++ b/crates/bevy_pbr/src/render/pbr_functions.wgsl
@@ -85,7 +85,7 @@ fn visibility_range_dither(frag_coord: vec4<f32>, dither: i32) {
     }
 
     // Otherwise, check the dither pattern.
-    let coords = vec2<u32>(floor(frag_coord.xy)) % 4u;
+    let coords = vec2<u32>(floor(frag_coord.xy)) & 3u;
     let threshold = i32((DITHER_THRESHOLD_MAP[coords.y] >> (coords.x * 8)) & 0xff);
     if ((dither >= 0 && dither + threshold >= 16) || (dither < 0 && 1 + dither + threshold <= 0)) {
         discard;
diff --git a/crates/bevy_pbr/src/render/pbr_transmission.wgsl b/crates/bevy_pbr/src/render/pbr_transmission.wgsl
index 720a42bca9631..e7d262c59f993 100644
--- a/crates/bevy_pbr/src/render/pbr_transmission.wgsl
+++ b/crates/bevy_pbr/src/render/pbr_transmission.wgsl
@@ -112,10 +112,10 @@ fn fetch_transmissive_background(offset_position: vec2<f32>, frag_coord: vec3<f3
     let pixel_checkboard = (
 #ifdef TEMPORAL_JITTER
         // 0 or 1 on even/odd pixels, alternates every frame
-        (i32(frag_coord.x) + i32(frag_coord.y) + i32(view_bindings::globals.frame_count)) % 2
+        (i32(frag_coord.x) + i32(frag_coord.y) + i32(view_bindings::globals.frame_count)) & 1
 #else
         // 0 or 1 on even/odd pixels
-        (i32(frag_coord.x) + i32(frag_coord.y)) % 2
+        (i32(frag_coord.x) + i32(frag_coord.y)) & 1
 #endif
     );
 
diff --git a/crates/bevy_pbr/src/render/utils.wgsl b/crates/bevy_pbr/src/render/utils.wgsl
index c887e3005e707..5885ccb0417fc 100644
--- a/crates/bevy_pbr/src/render/utils.wgsl
+++ b/crates/bevy_pbr/src/render/utils.wgsl
@@ -70,7 +70,7 @@ fn octahedral_decode_signed(v: vec2<f32>) -> vec3<f32> {
 
 // https://blog.demofox.org/2022/01/01/interleaved-gradient-noise-a-different-kind-of-low-discrepancy-sequence
 fn interleaved_gradient_noise(pixel_coordinates: vec2<f32>, frame: u32) -> f32 {
-    let xy = pixel_coordinates + 5.588238 * f32(frame % 64u);
+    let xy = pixel_coordinates + 5.588238 * f32(frame & 63u);
     return fract(52.9829189 * fract(0.06711056 * xy.x + 0.00583715 * xy.y));
 }
 
diff --git a/crates/bevy_pbr/src/ssao/preprocess_depth.wgsl b/crates/bevy_pbr/src/ssao/preprocess_depth.wgsl
index a386b09d9c23c..11b7d5ca49b3a 100644
--- a/crates/bevy_pbr/src/ssao/preprocess_depth.wgsl
+++ b/crates/bevy_pbr/src/ssao/preprocess_depth.wgsl
@@ -65,38 +65,38 @@ fn preprocess_depth(@builtin(global_invocation_id) global_id: vec3<u32>, @builti
     workgroupBarrier();
 
     // MIP 2 - Weighted average of MIP 1's depth values (per invocation, 4x4 invocations per workgroup)
-    if all(local_id.xy % vec2<u32>(2u) == vec2<u32>(0u)) {
+    if all((local_id.xy & vec2<u32>(1u)) == vec2<u32>(0u)) {
         let depth0 = previous_mip_depth[local_id.x + 0u][local_id.y + 0u];
         let depth1 = previous_mip_depth[local_id.x + 1u][local_id.y + 0u];
         let depth2 = previous_mip_depth[local_id.x + 0u][local_id.y + 1u];
         let depth3 = previous_mip_depth[local_id.x + 1u][local_id.y + 1u];
         let depth_mip2 = weighted_average(depth0, depth1, depth2, depth3);
-        textureStore(preprocessed_depth_mip2, base_coordinates / 2i, vec4<f32>(depth_mip2, 0.0, 0.0, 0.0));
+        textureStore(preprocessed_depth_mip2, base_coordinates >> vec2<u32>(1u), vec4<f32>(depth_mip2, 0.0, 0.0, 0.0));
         previous_mip_depth[local_id.x][local_id.y] = depth_mip2;
     }
 
     workgroupBarrier();
 
     // MIP 3 - Weighted average of MIP 2's depth values (per invocation, 2x2 invocations per workgroup)
-    if all(local_id.xy % vec2<u32>(4u) == vec2<u32>(0u)) {
+    if all((local_id.xy & vec2<u32>(3u)) == vec2<u32>(0u)) {
         let depth0 = previous_mip_depth[local_id.x + 0u][local_id.y + 0u];
         let depth1 = previous_mip_depth[local_id.x + 2u][local_id.y + 0u];
         let depth2 = previous_mip_depth[local_id.x + 0u][local_id.y + 2u];
         let depth3 = previous_mip_depth[local_id.x + 2u][local_id.y + 2u];
         let depth_mip3 = weighted_average(depth0, depth1, depth2, depth3);
-        textureStore(preprocessed_depth_mip3, base_coordinates / 4i, vec4<f32>(depth_mip3, 0.0, 0.0, 0.0));
+        textureStore(preprocessed_depth_mip3, base_coordinates >> vec2<u32>(2u), vec4<f32>(depth_mip3, 0.0, 0.0, 0.0));
         previous_mip_depth[local_id.x][local_id.y] = depth_mip3;
     }
 
     workgroupBarrier();
 
     // MIP 4 - Weighted average of MIP 3's depth values (per invocation, 1 invocation per workgroup)
-    if all(local_id.xy % vec2<u32>(8u) == vec2<u32>(0u)) {
+    if all((local_id.xy & vec2<u32>(7u)) == vec2<u32>(0u)) {
         let depth0 = previous_mip_depth[local_id.x + 0u][local_id.y + 0u];
         let depth1 = previous_mip_depth[local_id.x + 4u][local_id.y + 0u];
         let depth2 = previous_mip_depth[local_id.x + 0u][local_id.y + 4u];
         let depth3 = previous_mip_depth[local_id.x + 4u][local_id.y + 4u];
         let depth_mip4 = weighted_average(depth0, depth1, depth2, depth3);
-        textureStore(preprocessed_depth_mip4, base_coordinates / 8i, vec4<f32>(depth_mip4, 0.0, 0.0, 0.0));
+        textureStore(preprocessed_depth_mip4, base_coordinates >> vec2<u32>(3u), vec4<f32>(depth_mip4, 0.0, 0.0, 0.0));
     }
 }
diff --git a/crates/bevy_pbr/src/ssao/ssao.wgsl b/crates/bevy_pbr/src/ssao/ssao.wgsl
index ac64d5653f7a4..47d08412f4f95 100644
--- a/crates/bevy_pbr/src/ssao/ssao.wgsl
+++ b/crates/bevy_pbr/src/ssao/ssao.wgsl
@@ -30,10 +30,10 @@
 @group(1) @binding(2) var<uniform> view: View;
 
 fn load_noise(pixel_coordinates: vec2<i32>) -> vec2<f32> {
-    var index = textureLoad(hilbert_index_lut, pixel_coordinates % 64, 0).r;
+    var index = textureLoad(hilbert_index_lut, pixel_coordinates & vec2<i32>(63), 0).r;
 
 #ifdef TEMPORAL_JITTER
-    index += 288u * (globals.frame_count % 64u);
+    index += 288u * (globals.frame_count & 63u);
 #endif
 
     // R2 sequence - http://extremelearning.com.au/unreasonable-effectiveness-of-quasirandom-sequences
diff --git a/crates/bevy_pbr/src/volumetric_fog/volumetric_fog.wgsl b/crates/bevy_pbr/src/volumetric_fog/volumetric_fog.wgsl
index 43e3fc9278b81..5f797d2f357ba 100644
--- a/crates/bevy_pbr/src/volumetric_fog/volumetric_fog.wgsl
+++ b/crates/bevy_pbr/src/volumetric_fog/volumetric_fog.wgsl
@@ -103,6 +103,13 @@ fn henyey_greenstein(neg_LdotV: f32) -> f32 {
     return FRAC_4_PI * (1.0 - g * g) / (denom * sqrt(denom));
 }
 
+fn simple_wrap_3(index: i32) -> i32 {
+    if (index >= 3) {
+        return index - 3;
+    }
+    return index;
+}
+
 @fragment
 fn fragment(@builtin(position) position: vec4<f32>) -> @location(0) vec4<f32> {
     // Unpack the `volumetric_fog` settings.
@@ -140,8 +147,8 @@ fn fragment(@builtin(position) position: vec4<f32>) -> @location(0) vec4<f32> {
     var end_depth_view = 0.0;
     for (var plane_index = 0; plane_index < 3; plane_index += 1) {
         let plane = volumetric_fog.far_planes[plane_index];
-        let other_plane_a = volumetric_fog.far_planes[(plane_index + 1) % 3];
-        let other_plane_b = volumetric_fog.far_planes[(plane_index + 2) % 3];
+        let other_plane_a = volumetric_fog.far_planes[simple_wrap_3(plane_index + 1)];
+        let other_plane_b = volumetric_fog.far_planes[simple_wrap_3(plane_index + 2)];
 
         // Calculate the intersection of the ray and the plane. The ray must
         // intersect in front of us (t > 0).
diff --git a/crates/bevy_sprite/src/tilemap_chunk/tilemap_chunk_material.wgsl b/crates/bevy_sprite/src/tilemap_chunk/tilemap_chunk_material.wgsl
index 7424995e22954..ed45c1ddd0156 100644
--- a/crates/bevy_sprite/src/tilemap_chunk/tilemap_chunk_material.wgsl
+++ b/crates/bevy_sprite/src/tilemap_chunk/tilemap_chunk_material.wgsl
@@ -32,7 +32,7 @@ fn vertex(vertex: Vertex) -> VertexOutput {
 
     out.position = mesh_functions::mesh2d_position_world_to_clip(world_position);
     out.uv = vertex.uv;
-    out.tile_index = vertex.vertex_index / 4u;
+    out.tile_index = vertex.vertex_index >> 2u;
 
     return out;
 }