Use storage buffers for clustered forward point lights (#3989)

# Objective

- Make use of storage buffers, where they are available, for clustered forward bindings to support far more point lights in a scene
- Fixes #3605
- Based on top of #4079

This branch on an M1 Max can keep 60fps with about 2150 point lights of radius 1m in the Sponza scene where I've been testing. The bottleneck is mostly assigning lights to clusters which grows faster than linearly (I think 1000 lights was about 1.5ms and 5000 was 7.5ms). I have seen papers and presentations leveraging compute shaders that can get this up to over 1 million. That said, I think any further optimisations should probably be done in a separate PR.

## Solution

- Add `RenderDevice` to the `Material` and `SpecializedMaterial` trait `::key()` functions to allow setting flags on the keys depending on feature/limit availability
- Make `GpuPointLights` and `ViewClusterBuffers` into enums containing `UniformVec` and `StorageBuffer` variants. Implement the necessary API on them to make usage the same for both cases, and the only difference is at initialisation time.
- Add appropriate shader defs in the shader code to handle the two cases

## Context on some decisions / open questions

- I'm using `max_storage_buffers_per_shader_stage >= 3` as a check to see if storage buffers are supported. I was thinking about diving into 'binding resource management' but it feels like we don't have enough use cases to understand the problem yet, and it is mostly a separate concern to this PR, so I think it should be handled separately.
- Should `ViewClusterBuffers` and `ViewClusterBindings` be merged, duplicating the count variables into the enum variants?

Co-authored-by: Carter Anderson <mcanders1@gmail.com>
2022-04-07 16:16:35 +00:00 · 2022-04-07 16:16:35 +00:00 · c5963b4fd5
commit c5963b4fd5
parent 579928e8e0
16 changed files with 622 additions and 132 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -612,7 +612,10 @@ min_sdk_version = 16
 target_sdk_version = 29
 # Stress Tests
 [[example]]
 name = "many_lights"
 path = "examples/stress_tests/many_lights.rs"
 [[example]]
 name = "transform_hierarchy"
 path = "examples/stress_tests/transform_hierarchy.rs"
--- a/crates/bevy_pbr/src/lib.rs
+++ b/crates/bevy_pbr/src/lib.rs
@ -150,12 +150,10 @@ impl Plugin for PbrPlugin {
            )
            .add_system_to_stage(
                RenderStage::Prepare,
-                // this is added as an exclusive system because it contributes new views. it must run (and have Commands applied)
+                // NOTE: This needs to run after prepare_lights. As prepare_lights is an exclusive system,
-                // _before_ the `prepare_views()` system is run. ideally this becomes a normal system when "stageless" features come out
+                // just adding it to the non-exclusive systems in the Prepare stage means it runs after
-                render::prepare_clusters
+                // prepare_lights.
-                    .exclusive_system()
+                render::prepare_clusters.label(RenderLightSystems::PrepareClusters),
                    .label(RenderLightSystems::PrepareClusters)
                    .after(RenderLightSystems::PrepareLights),
            )
            .add_system_to_stage(
                RenderStage::Queue,
--- a/crates/bevy_pbr/src/light.rs
+++ b/crates/bevy_pbr/src/light.rs
@ -9,6 +9,8 @@ use bevy_render::{
    color::Color,
    prelude::Image,
    primitives::{Aabb, CubemapFrusta, Frustum, Sphere},
    render_resource::BufferBindingType,
    renderer::RenderDevice,
    view::{ComputedVisibility, RenderLayers, Visibility, VisibleEntities},
 };
 use bevy_transform::components::GlobalTransform;
@ -17,7 +19,8 @@ use bevy_window::Windows;
 use crate::{
    calculate_cluster_factors, CubeMapFace, CubemapVisibleEntities, ViewClusterBindings,
-    CUBE_MAP_FACES, MAX_POINT_LIGHTS, POINT_LIGHT_NEAR_Z,
+    CLUSTERED_FORWARD_STORAGE_BUFFER_COUNT, CUBE_MAP_FACES, MAX_UNIFORM_BUFFER_POINT_LIGHTS,
    POINT_LIGHT_NEAR_Z,
 };
 /// A light that emits light in all directions from a central point.
@ -709,6 +712,7 @@ pub(crate) fn assign_lights_to_clusters(
    lights_query: Query<(Entity, &GlobalTransform, &PointLight, &Visibility)>,
    mut lights: Local<Vec<PointLightAssignmentData>>,
    mut max_point_lights_warning_emitted: Local<bool>,
    render_device: Res<RenderDevice>,
 ) {
    global_lights.entities.clear();
    lights.clear();
@ -727,7 +731,13 @@ pub(crate) fn assign_lights_to_clusters(
            ),
    );
-    if lights.len() > MAX_POINT_LIGHTS {
+    let clustered_forward_buffer_binding_type =
        render_device.get_supported_read_only_binding_type(CLUSTERED_FORWARD_STORAGE_BUFFER_COUNT);
    let supports_storage_buffers = matches!(
        clustered_forward_buffer_binding_type,
        BufferBindingType::Storage { .. }
    );
    if lights.len() > MAX_UNIFORM_BUFFER_POINT_LIGHTS && !supports_storage_buffers {
        lights.sort_by(|light_1, light_2| {
            point_light_order(
                (&light_1.entity, &light_1.shadows_enabled),
@ -743,7 +753,7 @@ pub(crate) fn assign_lights_to_clusters(
        let mut lights_in_view_count = 0;
        lights.retain(|light| {
            // take one extra light to check if we should emit the warning
-            if lights_in_view_count == MAX_POINT_LIGHTS + 1 {
+            if lights_in_view_count == MAX_UNIFORM_BUFFER_POINT_LIGHTS + 1 {
                false
            } else {
                let light_sphere = Sphere {
@ -763,12 +773,15 @@ pub(crate) fn assign_lights_to_clusters(
            }
        });
-        if lights.len() > MAX_POINT_LIGHTS && !*max_point_lights_warning_emitted {
+        if lights.len() > MAX_UNIFORM_BUFFER_POINT_LIGHTS && !*max_point_lights_warning_emitted {
-            warn!("MAX_POINT_LIGHTS ({}) exceeded", MAX_POINT_LIGHTS);
+            warn!(
                "MAX_UNIFORM_BUFFER_POINT_LIGHTS ({}) exceeded",
                MAX_UNIFORM_BUFFER_POINT_LIGHTS
            );
            *max_point_lights_warning_emitted = true;
        }
-        lights.truncate(MAX_POINT_LIGHTS);
+        lights.truncate(MAX_UNIFORM_BUFFER_POINT_LIGHTS);
    }
    for (view_entity, camera_transform, camera, frustum, config, clusters, mut visible_lights) in
--- a/crates/bevy_pbr/src/material.rs
+++ b/crates/bevy_pbr/src/material.rs
@ -39,7 +39,7 @@ use std::marker::PhantomData;
 /// way to render [`Mesh`] entities with custom shader logic. For materials that can specialize their [`RenderPipelineDescriptor`]
 /// based on specific material values, see [`SpecializedMaterial`]. [`Material`] automatically implements [`SpecializedMaterial`]
 /// and can be used anywhere that type is used (such as [`MaterialPlugin`]).
-pub trait Material: Asset + RenderAsset {
+pub trait Material: Asset + RenderAsset + Sized {
    /// Returns this material's [`BindGroup`]. This should match the layout returned by [`Material::bind_group_layout`].
    fn bind_group(material: &<Self as RenderAsset>::PreparedAsset) -> &BindGroup;
@ -78,6 +78,7 @@ pub trait Material: Asset + RenderAsset {
    #[allow(unused_variables)]
    #[inline]
    fn specialize(
        pipeline: &MaterialPipeline<Self>,
        descriptor: &mut RenderPipelineDescriptor,
        layout: &MeshVertexBufferLayout,
    ) -> Result<(), SpecializedMeshPipelineError> {
@ -93,11 +94,12 @@ impl<M: Material> SpecializedMaterial for M {
    #[inline]
    fn specialize(
        pipeline: &MaterialPipeline<Self>,
        descriptor: &mut RenderPipelineDescriptor,
        _key: Self::Key,
        layout: &MeshVertexBufferLayout,
    ) -> Result<(), SpecializedMeshPipelineError> {
-        <M as Material>::specialize(descriptor, layout)
+        <M as Material>::specialize(pipeline, descriptor, layout)
    }
    #[inline]
@ -137,7 +139,7 @@ impl<M: Material> SpecializedMaterial for M {
 /// way to render [`Mesh`] entities with custom shader logic. [`SpecializedMaterials`](SpecializedMaterial) use their [`SpecializedMaterial::Key`]
 /// to customize their [`RenderPipelineDescriptor`] based on specific material values. The slightly simpler [`Material`] trait
 /// should be used for materials that do not need specialization. [`Material`] types automatically implement [`SpecializedMaterial`].
-pub trait SpecializedMaterial: Asset + RenderAsset {
+pub trait SpecializedMaterial: Asset + RenderAsset + Sized {
    /// The key used to specialize this material's [`RenderPipelineDescriptor`].
    type Key: PartialEq + Eq + Hash + Clone + Send + Sync;
@ -148,6 +150,7 @@ pub trait SpecializedMaterial: Asset + RenderAsset {
    /// Specializes the given `descriptor` according to the given `key`.
    fn specialize(
        pipeline: &MaterialPipeline<Self>,
        descriptor: &mut RenderPipelineDescriptor,
        key: Self::Key,
        layout: &MeshVertexBufferLayout,
@ -251,7 +254,7 @@ impl<M: SpecializedMaterial> SpecializedMeshPipeline for MaterialPipeline<M> {
        let descriptor_layout = descriptor.layout.as_mut().unwrap();
        descriptor_layout.insert(1, self.material_layout.clone());
-        M::specialize(&mut descriptor, key.material_key, layout)?;
+        M::specialize(self, &mut descriptor, key.material_key, layout)?;
        Ok(descriptor)
    }
 }
--- a/crates/bevy_pbr/src/pbr_material.rs
+++ b/crates/bevy_pbr/src/pbr_material.rs
@ -378,6 +378,7 @@ impl SpecializedMaterial for StandardMaterial {
    }
    fn specialize(
        _pipeline: &MaterialPipeline<Self>,
        descriptor: &mut RenderPipelineDescriptor,
        key: Self::Key,
        _layout: &MeshVertexBufferLayout,
--- a/crates/bevy_pbr/src/render/light.rs
+++ b/crates/bevy_pbr/src/render/light.rs
@ -10,7 +10,7 @@ use bevy_ecs::{
    prelude::*,
    system::{lifetimeless::*, SystemParamItem},
 };
-use bevy_math::{const_vec3, Mat4, UVec3, UVec4, Vec2, Vec3, Vec4, Vec4Swizzles};
+use bevy_math::{const_vec3, Mat4, UVec2, UVec3, UVec4, Vec2, Vec3, Vec4, Vec4Swizzles};
 use bevy_render::{
    camera::{Camera, CameraProjection},
    color::Color,
@ -22,7 +22,7 @@ use bevy_render::{
        EntityRenderCommand, PhaseItem, RenderCommandResult, RenderPhase, SetItemPipeline,
        TrackedRenderPass,
    },
-    render_resource::{std140::AsStd140, *},
+    render_resource::{std140::AsStd140, std430::AsStd430, *},
    renderer::{RenderContext, RenderDevice, RenderQueue},
    texture::*,
    view::{
@ -81,7 +81,7 @@ pub struct ExtractedDirectionalLight {
 pub type ExtractedDirectionalLightShadowMap = DirectionalLightShadowMap;
 #[repr(C)]
-#[derive(Copy, Clone, AsStd140, Default, Debug)]
+#[derive(Copy, Clone, AsStd140, AsStd430, Default, Debug)]
 pub struct GpuPointLight {
    // The lower-right 2x2 values of the projection matrix 22 23 32 33
    projection_lr: Vec4,
@ -92,9 +92,84 @@ pub struct GpuPointLight {
    shadow_normal_bias: f32,
 }
-#[derive(AsStd140)]
+pub enum GpuPointLights {
-pub struct GpuPointLights {
+    Uniform {
-    data: [GpuPointLight; MAX_POINT_LIGHTS],
+        buffer: UniformVec<[GpuPointLight; MAX_UNIFORM_BUFFER_POINT_LIGHTS]>,
    },
    Storage {
        buffer: StorageBuffer<GpuPointLight>,
    },
 }
 // Shared API over the two `GpuPointLights` variants so callers can clear, fill,
 // upload and bind the point light data without caring whether it is backed by a
 // uniform buffer or a storage buffer.
 impl GpuPointLights {
    /// Creates the variant matching `buffer_binding_type`: `Storage` for any
    /// storage binding type, `Uniform` otherwise.
    fn new(buffer_binding_type: BufferBindingType) -> Self {
        match buffer_binding_type {
            BufferBindingType::Storage { .. } => Self::storage(),
            BufferBindingType::Uniform => Self::uniform(),
        }
    }
    /// Creates the `Uniform` variant with a default-constructed `UniformVec`.
    fn uniform() -> Self {
        Self::Uniform {
            buffer: UniformVec::default(),
        }
    }
    /// Creates the `Storage` variant with a default-constructed `StorageBuffer`.
    fn storage() -> Self {
        Self::Storage {
            buffer: StorageBuffer::default(),
        }
    }
    /// Clears the underlying buffer of whichever variant is active.
    fn clear(&mut self) {
        match self {
            GpuPointLights::Uniform { buffer } => buffer.clear(),
            GpuPointLights::Storage { buffer } => buffer.clear(),
        }
    }
    /// Adds `lights` to the underlying buffer.
    ///
    /// * `Uniform`: the lights are packed into one fixed-size array of
    ///   `MAX_UNIFORM_BUFFER_POINT_LIGHTS` entries, which is pushed as a single
    ///   element. Missing entries are filled with `GpuPointLight::default()`;
    ///   any lights beyond the limit are dropped by the `take`.
    /// * `Storage`: the whole vector is handed to the buffer's `append`.
    fn push(&mut self, mut lights: Vec<GpuPointLight>) {
        match self {
            GpuPointLights::Uniform { buffer } => {
                // NOTE: This iterator construction allows moving and padding with default
                // values and is like this to avoid unnecessary cloning.
                let gpu_point_lights = lights
                    .drain(..)
                    .chain(std::iter::repeat_with(GpuPointLight::default))
                    .take(MAX_UNIFORM_BUFFER_POINT_LIGHTS)
                    .collect::<Vec<_>>();
                // The chained `repeat_with` never ends, so `take` always yields exactly
                // MAX_UNIFORM_BUFFER_POINT_LIGHTS items and the Vec -> array conversion
                // cannot fail.
                buffer.push(gpu_point_lights.try_into().unwrap());
            }
            GpuPointLights::Storage { buffer } => {
                buffer.append(&mut lights);
            }
        }
    }
    /// Forwards to the active buffer's `write_buffer` with the given device and queue.
    fn write_buffer(&mut self, render_device: &RenderDevice, render_queue: &RenderQueue) {
        match self {
            GpuPointLights::Uniform { buffer } => buffer.write_buffer(render_device, render_queue),
            GpuPointLights::Storage { buffer } => buffer.write_buffer(render_device, render_queue),
        }
    }
    /// Returns the active buffer's binding resource, or `None` if the buffer
    /// reports none.
    pub fn binding(&self) -> Option<BindingResource> {
        match self {
            GpuPointLights::Uniform { buffer } => buffer.binding(),
            GpuPointLights::Storage { buffer } => buffer.binding(),
        }
    }
    /// Returns the length reported by the active buffer.
    ///
    /// NOTE(review): the two variants do not obviously count the same thing. For
    /// `Storage` this is the number of `GpuPointLight` values; for `Uniform` it is
    /// `UniformVec::len()`, and since `push` adds one whole array per call this is
    /// presumably the number of arrays rather than the number of lights - confirm
    /// that callers only rely on zero / non-zero.
    pub fn len(&self) -> usize {
        match self {
            GpuPointLights::Uniform { buffer } => buffer.len(),
            GpuPointLights::Storage { buffer } => buffer.values().len(),
        }
    }
    /// Returns `true` when `len()` is zero.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }
 }
 // NOTE: These must match the bit flags in bevy_pbr2/src/render/pbr.frag!
@ -144,7 +219,7 @@ pub struct GpuLights {
 }
 // NOTE: this must be kept in sync with the same constants in pbr.frag
-pub const MAX_POINT_LIGHTS: usize = 256;
+pub const MAX_UNIFORM_BUFFER_POINT_LIGHTS: usize = 256;
 // FIXME: How should we handle shadows for clustered forward? Limiting to maximum 10
 // point light shadow maps for now
 #[cfg(feature = "webgl")]
@ -346,13 +421,13 @@ pub fn extract_clusters(mut commands: Commands, views: Query<(Entity, &Clusters)
    }
 }
 #[allow(clippy::too_many_arguments)]
 pub fn extract_lights(
    mut commands: Commands,
    ambient_light: Res<AmbientLight>,
    point_light_shadow_map: Res<PointLightShadowMap>,
    directional_light_shadow_map: Res<DirectionalLightShadowMap>,
    global_point_lights: Res<GlobalVisiblePointLights>,
    // visible_point_lights: Query<&VisiblePointLights>,
    mut point_lights: Query<(&PointLight, &mut CubemapVisibleEntities, &GlobalTransform)>,
    mut directional_lights: Query<(
        Entity,
@ -361,6 +436,7 @@ pub fn extract_lights(
        &GlobalTransform,
        &Visibility,
    )>,
    mut previous_point_lights_len: Local<usize>,
 ) {
    commands.insert_resource(ExtractedAmbientLight {
        color: ambient_light.color,
@ -379,32 +455,38 @@ pub fn extract_lights(
    // https://catlikecoding.com/unity/tutorials/custom-srp/point-and-spot-shadows/
    let point_light_texel_size = 2.0 / point_light_shadow_map.size as f32;
    let mut point_lights_values = Vec::with_capacity(*previous_point_lights_len);
    for entity in global_point_lights.iter().copied() {
        if let Ok((point_light, cubemap_visible_entities, transform)) = point_lights.get_mut(entity)
        {
            let render_cubemap_visible_entities =
                std::mem::take(cubemap_visible_entities.into_inner());
-            commands.get_or_spawn(entity).insert_bundle((
+            point_lights_values.push((
-                ExtractedPointLight {
+                entity,
-                    color: point_light.color,
+                (
-                    // NOTE: Map from luminous power in lumens to luminous intensity in lumens per steradian
+                    ExtractedPointLight {
-                    // for a point light. See https://google.github.io/filament/Filament.html#mjx-eqn-pointLightLuminousPower
+                        color: point_light.color,
-                    // for details.
+                        // NOTE: Map from luminous power in lumens to luminous intensity in lumens per steradian
-                    intensity: point_light.intensity / (4.0 * std::f32::consts::PI),
+                        // for a point light. See https://google.github.io/filament/Filament.html#mjx-eqn-pointLightLuminousPower
-                    range: point_light.range,
+                        // for details.
-                    radius: point_light.radius,
+                        intensity: point_light.intensity / (4.0 * std::f32::consts::PI),
-                    transform: *transform,
+                        range: point_light.range,
-                    shadows_enabled: point_light.shadows_enabled,
+                        radius: point_light.radius,
-                    shadow_depth_bias: point_light.shadow_depth_bias,
+                        transform: *transform,
-                    // The factor of SQRT_2 is for the worst-case diagonal offset
+                        shadows_enabled: point_light.shadows_enabled,
-                    shadow_normal_bias: point_light.shadow_normal_bias
+                        shadow_depth_bias: point_light.shadow_depth_bias,
-                        * point_light_texel_size
+                        // The factor of SQRT_2 is for the worst-case diagonal offset
-                        * std::f32::consts::SQRT_2,
+                        shadow_normal_bias: point_light.shadow_normal_bias
-                },
+                            * point_light_texel_size
-                render_cubemap_visible_entities,
+                            * std::f32::consts::SQRT_2,
                    },
                    render_cubemap_visible_entities,
                ),
            ));
        }
    }
    *previous_point_lights_len = point_lights_values.len();
    commands.insert_or_spawn_batch(point_lights_values);
    for (entity, directional_light, visible_entities, transform, visibility) in
        directional_lights.iter_mut()
@ -528,12 +610,34 @@ pub struct ViewLightsUniformOffset {
    pub offset: u32,
 }
-#[derive(Default)]
+// NOTE: Clustered-forward rendering requires 3 storage buffer bindings so check that
 // at least that many are supported using this constant and
 // RenderDevice::get_supported_read_only_binding_type()
 pub const CLUSTERED_FORWARD_STORAGE_BUFFER_COUNT: u32 = 3;
 pub struct GlobalLightMeta {
-    pub gpu_point_lights: UniformVec<GpuPointLights>,
+    pub gpu_point_lights: GpuPointLights,
    pub entity_to_index: HashMap<Entity, usize>,
 }
 // `GlobalLightMeta` can no longer derive `Default` because the kind of buffer it
 // holds depends on the render device, so it is built from the `World` instead.
 impl FromWorld for GlobalLightMeta {
    /// Reads the `RenderDevice` resource, asks it which read-only buffer binding
    /// type is supported for `CLUSTERED_FORWARD_STORAGE_BUFFER_COUNT` bindings, and
    /// constructs the meta with that binding type.
    fn from_world(world: &mut World) -> Self {
        Self::new(
            world
                .resource::<RenderDevice>()
                .get_supported_read_only_binding_type(CLUSTERED_FORWARD_STORAGE_BUFFER_COUNT),
        )
    }
 }
 impl GlobalLightMeta {
    /// Creates a `GlobalLightMeta` whose point light buffer is the
    /// `GpuPointLights` variant selected by `buffer_binding_type`, with a
    /// default-constructed `entity_to_index` map.
    pub fn new(buffer_binding_type: BufferBindingType) -> Self {
        Self {
            gpu_point_lights: GpuPointLights::new(buffer_binding_type),
            entity_to_index: HashMap::default(),
        }
    }
 }
 #[derive(Default)]
 pub struct LightMeta {
    pub view_gpu_lights: DynamicUniformVec<GpuLights>,
@ -615,14 +719,14 @@ pub fn prepare_lights(
            .reserve(point_lights.len());
    }
-    let mut gpu_point_lights = [GpuPointLight::default(); MAX_POINT_LIGHTS];
+    let mut gpu_point_lights = Vec::new();
    for (index, &(entity, light)) in point_lights.iter().enumerate() {
        let mut flags = PointLightFlags::NONE;
        // Lights are sorted, shadow enabled lights are first
        if light.shadows_enabled && index < MAX_POINT_LIGHT_SHADOW_MAPS {
            flags |= PointLightFlags::SHADOWS_ENABLED;
        }
-        gpu_point_lights[index] = GpuPointLight {
+        gpu_point_lights.push(GpuPointLight {
            projection_lr: Vec4::new(
                cube_face_projection.z_axis.z,
                cube_face_projection.z_axis.w,
@ -639,12 +743,10 @@ pub fn prepare_lights(
            flags: flags.bits,
            shadow_depth_bias: light.shadow_depth_bias,
            shadow_normal_bias: light.shadow_normal_bias,
-        };
+        });
        global_light_meta.entity_to_index.insert(entity, index);
    }
-    global_light_meta.gpu_point_lights.push(GpuPointLights {
+    global_light_meta.gpu_point_lights.push(gpu_point_lights);
        data: gpu_point_lights,
    });
    global_light_meta
        .gpu_point_lights
        .write_buffer(&render_device, &render_queue);
@ -906,7 +1008,7 @@ pub fn prepare_lights(
 }
 // this must match CLUSTER_COUNT_SIZE in pbr.wgsl
-// and must be large enough to contain MAX_POINT_LIGHTS
+// and must be large enough to contain MAX_UNIFORM_BUFFER_POINT_LIGHTS
 const CLUSTER_COUNT_SIZE: u32 = 13;
 const CLUSTER_OFFSET_MASK: u32 = (1 << (32 - CLUSTER_COUNT_SIZE)) - 1;
@ -931,14 +1033,47 @@ fn pack_offset_and_count(offset: usize, count: usize) -> u32 {
        | (count as u32 & CLUSTER_COUNT_MASK)
 }
-#[derive(Component, Default)]
+enum ViewClusterBuffers {
    Uniform {
        // NOTE: UVec4 is because all arrays in Std140 layout have 16-byte alignment
        cluster_light_index_lists: UniformVec<[UVec4; ViewClusterBindings::MAX_UNIFORM_ITEMS]>,
        // NOTE: UVec4 is because all arrays in Std140 layout have 16-byte alignment
        cluster_offsets_and_counts: UniformVec<[UVec4; ViewClusterBindings::MAX_UNIFORM_ITEMS]>,
    },
    Storage {
        cluster_light_index_lists: StorageBuffer<u32>,
        cluster_offsets_and_counts: StorageBuffer<UVec2>,
    },
 }
 // Constructors for the per-view cluster buffers; the variant is chosen once, at
 // creation time, from the supported buffer binding type.
 impl ViewClusterBuffers {
    /// Creates the variant matching `buffer_binding_type`: `Storage` for any
    /// storage binding type, `Uniform` otherwise.
    fn new(buffer_binding_type: BufferBindingType) -> Self {
        match buffer_binding_type {
            BufferBindingType::Storage { .. } => Self::storage(),
            BufferBindingType::Uniform => Self::uniform(),
        }
    }
    /// Creates the `Uniform` variant with both buffers default-constructed.
    fn uniform() -> Self {
        ViewClusterBuffers::Uniform {
            cluster_light_index_lists: UniformVec::default(),
            cluster_offsets_and_counts: UniformVec::default(),
        }
    }
    /// Creates the `Storage` variant with both buffers default-constructed.
    fn storage() -> Self {
        ViewClusterBuffers::Storage {
            cluster_light_index_lists: StorageBuffer::default(),
            cluster_offsets_and_counts: StorageBuffer::default(),
        }
    }
 }
 #[derive(Component)]
 pub struct ViewClusterBindings {
    n_indices: usize,
    // NOTE: UVec4 is because all arrays in Std140 layout have 16-byte alignment
    pub cluster_light_index_lists: UniformVec<[UVec4; Self::MAX_UNIFORM_ITEMS]>,
    n_offsets: usize,
-    // NOTE: UVec4 is because all arrays in Std140 layout have 16-byte alignment
+    buffers: ViewClusterBuffers,
    pub cluster_offsets_and_counts: UniformVec<[UVec4; Self::MAX_UNIFORM_ITEMS]>,
 }
 impl ViewClusterBindings {
@ -946,25 +1081,59 @@ impl ViewClusterBindings {
    const MAX_UNIFORM_ITEMS: usize = Self::MAX_OFFSETS / 4;
    pub const MAX_INDICES: usize = 16384;
    /// Creates bindings with zeroed index / offset counters and cluster buffers
    /// of the variant selected by `buffer_binding_type`.
    pub fn new(buffer_binding_type: BufferBindingType) -> Self {
        Self {
            n_indices: 0,
            n_offsets: 0,
            buffers: ViewClusterBuffers::new(buffer_binding_type),
        }
    }
    pub fn reserve_and_clear(&mut self) {
-        self.cluster_light_index_lists.clear();
+        match &mut self.buffers {
-        self.cluster_light_index_lists
+            ViewClusterBuffers::Uniform {
-            .push([UVec4::ZERO; Self::MAX_UNIFORM_ITEMS]);
+                cluster_light_index_lists,
-        self.cluster_offsets_and_counts.clear();
+                cluster_offsets_and_counts,
-        self.cluster_offsets_and_counts
+            } => {
-            .push([UVec4::ZERO; Self::MAX_UNIFORM_ITEMS]);
+                cluster_light_index_lists.clear();
                cluster_light_index_lists.push([UVec4::ZERO; Self::MAX_UNIFORM_ITEMS]);
                cluster_offsets_and_counts.clear();
                cluster_offsets_and_counts.push([UVec4::ZERO; Self::MAX_UNIFORM_ITEMS]);
            }
            ViewClusterBuffers::Storage {
                cluster_light_index_lists,
                cluster_offsets_and_counts,
                ..
            } => {
                cluster_light_index_lists.clear();
                cluster_offsets_and_counts.clear();
            }
        }
    }
    pub fn push_offset_and_count(&mut self, offset: usize, count: usize) {
-        let array_index = self.n_offsets >> 2; // >> 2 is equivalent to / 4
+        match &mut self.buffers {
-        if array_index >= Self::MAX_UNIFORM_ITEMS {
+            ViewClusterBuffers::Uniform {
-            warn!("cluster offset and count out of bounds!");
+                cluster_offsets_and_counts,
-            return;
+                ..
-        }
+            } => {
-        let component = self.n_offsets & ((1 << 2) - 1);
+                let array_index = self.n_offsets >> 2; // >> 2 is equivalent to / 4
-        let packed = pack_offset_and_count(offset, count);
+                if array_index >= Self::MAX_UNIFORM_ITEMS {
                    warn!("cluster offset and count out of bounds!");
                    return;
                }
                let component = self.n_offsets & ((1 << 2) - 1);
                let packed = pack_offset_and_count(offset, count);
-        self.cluster_offsets_and_counts.get_mut(0)[array_index][component] = packed;
+                cluster_offsets_and_counts.get_mut(0)[array_index][component] = packed;
            }
            ViewClusterBuffers::Storage {
                cluster_offsets_and_counts,
                ..
            } => {
                cluster_offsets_and_counts.push(UVec2::new(offset as u32, count as u32));
            }
        }
        self.n_offsets += 1;
    }
@ -974,22 +1143,81 @@ impl ViewClusterBindings {
    }
    pub fn push_index(&mut self, index: usize) {
-        let array_index = self.n_indices >> 4; // >> 4 is equivalent to / 16
+        match &mut self.buffers {
-        let component = (self.n_indices >> 2) & ((1 << 2) - 1);
+            ViewClusterBuffers::Uniform {
-        let sub_index = self.n_indices & ((1 << 2) - 1);
+                cluster_light_index_lists,
-        let index = index as u32 & POINT_LIGHT_INDEX_MASK;
+                ..
            } => {
                let array_index = self.n_indices >> 4; // >> 4 is equivalent to / 16
                let component = (self.n_indices >> 2) & ((1 << 2) - 1);
                let sub_index = self.n_indices & ((1 << 2) - 1);
                let index = index as u32 & POINT_LIGHT_INDEX_MASK;
-        self.cluster_light_index_lists.get_mut(0)[array_index][component] |=
+                cluster_light_index_lists.get_mut(0)[array_index][component] |=
-            index << (8 * sub_index);
+                    index << (8 * sub_index);
            }
            ViewClusterBuffers::Storage {
                cluster_light_index_lists,
                ..
            } => {
                cluster_light_index_lists.push(index as u32);
            }
        }
        self.n_indices += 1;
    }
    /// Calls `write_buffer` on both cluster buffers (light index lists first,
    /// then offsets and counts) using the given device and queue.
    ///
    /// The two arms make the same calls; they are separate because the buffer
    /// types differ between the `Uniform` and `Storage` variants.
    pub fn write_buffers(&mut self, render_device: &RenderDevice, render_queue: &RenderQueue) {
        match &mut self.buffers {
            ViewClusterBuffers::Uniform {
                cluster_light_index_lists,
                cluster_offsets_and_counts,
            } => {
                cluster_light_index_lists.write_buffer(render_device, render_queue);
                cluster_offsets_and_counts.write_buffer(render_device, render_queue);
            }
            ViewClusterBuffers::Storage {
                cluster_light_index_lists,
                cluster_offsets_and_counts,
            } => {
                cluster_light_index_lists.write_buffer(render_device, render_queue);
                cluster_offsets_and_counts.write_buffer(render_device, render_queue);
            }
        }
    }
    /// Returns the binding resource of the cluster light index lists buffer for
    /// the active variant, or `None` if that buffer reports none.
    pub fn light_index_lists_binding(&self) -> Option<BindingResource> {
        match &self.buffers {
            ViewClusterBuffers::Uniform {
                cluster_light_index_lists,
                ..
            } => cluster_light_index_lists.binding(),
            ViewClusterBuffers::Storage {
                cluster_light_index_lists,
                ..
            } => cluster_light_index_lists.binding(),
        }
    }
    /// Returns the binding resource of the cluster offsets-and-counts buffer for
    /// the active variant, or `None` if that buffer reports none.
    pub fn offsets_and_counts_binding(&self) -> Option<BindingResource> {
        match &self.buffers {
            ViewClusterBuffers::Uniform {
                cluster_offsets_and_counts,
                ..
            } => cluster_offsets_and_counts.binding(),
            ViewClusterBuffers::Storage {
                cluster_offsets_and_counts,
                ..
            } => cluster_offsets_and_counts.binding(),
        }
    }
 }
 pub fn prepare_clusters(
    mut commands: Commands,
    render_device: Res<RenderDevice>,
    render_queue: Res<RenderQueue>,
    mesh_pipeline: Res<MeshPipeline>,
    global_light_meta: Res<GlobalLightMeta>,
    views: Query<
        (
@ -1000,8 +1228,14 @@ pub fn prepare_clusters(
        With<RenderPhase<Transparent3d>>,
    >,
 ) {
    let render_device = render_device.into_inner();
    let supports_storage_buffers = matches!(
        mesh_pipeline.clustered_forward_buffer_binding_type,
        BufferBindingType::Storage { .. }
    );
    for (entity, cluster_config, extracted_clusters) in views.iter() {
-        let mut view_clusters_bindings = ViewClusterBindings::default();
+        let mut view_clusters_bindings =
            ViewClusterBindings::new(mesh_pipeline.clustered_forward_buffer_binding_type);
        view_clusters_bindings.reserve_and_clear();
        let mut indices_full = false;
@ -1021,6 +1255,7 @@ pub fn prepare_clusters(
                            {
                                if view_clusters_bindings.n_indices()
                                    >= ViewClusterBindings::MAX_INDICES
                                    && !supports_storage_buffers
                                {
                                    warn!("Cluster light index lists is full! The PointLights in the view are affecting too many clusters.");
                                    indices_full = true;
@ -1036,12 +1271,7 @@ pub fn prepare_clusters(
            }
        }
-        view_clusters_bindings
+        view_clusters_bindings.write_buffers(render_device, &render_queue);
            .cluster_light_index_lists
            .write_buffer(&render_device, &render_queue);
        view_clusters_bindings
            .cluster_offsets_and_counts
            .write_buffer(&render_device, &render_queue);
        commands.get_or_spawn(entity).insert(view_clusters_bindings);
    }
--- a/crates/bevy_pbr/src/render/mesh.rs
+++ b/crates/bevy_pbr/src/render/mesh.rs
@ -1,6 +1,7 @@
 use crate::{
    GlobalLightMeta, GpuLights, LightMeta, NotShadowCaster, NotShadowReceiver, ShadowPipeline,
    ViewClusterBindings, ViewLightsUniformOffset, ViewShadowBindings,
    CLUSTERED_FORWARD_STORAGE_BUFFER_COUNT,
 };
 use bevy_app::Plugin;
 use bevy_asset::{load_internal_asset, Assets, Handle, HandleUntyped};
@ -258,11 +259,18 @@ pub struct MeshPipeline {
    pub skinned_mesh_layout: BindGroupLayout,
    // This dummy white texture is to be used in place of optional StandardMaterial textures
    pub dummy_white_gpu_image: GpuImage,
    pub clustered_forward_buffer_binding_type: BufferBindingType,
 }
 impl FromWorld for MeshPipeline {
    fn from_world(world: &mut World) -> Self {
        let render_device = world.resource::<RenderDevice>();
        let clustered_forward_buffer_binding_type = render_device
            .get_supported_read_only_binding_type(CLUSTERED_FORWARD_STORAGE_BUFFER_COUNT);
        let cluster_min_binding_size = match clustered_forward_buffer_binding_type {
            BufferBindingType::Storage { .. } => None,
            BufferBindingType::Uniform => BufferSize::new(16384),
        };
        let view_layout = render_device.create_bind_group_layout(&BindGroupLayoutDescriptor {
            entries: &[
                // View
@ -334,11 +342,12 @@ impl FromWorld for MeshPipeline {
                    binding: 6,
                    visibility: ShaderStages::FRAGMENT,
                    ty: BindingType::Buffer {
-                        ty: BufferBindingType::Uniform,
+                        ty: clustered_forward_buffer_binding_type,
                        has_dynamic_offset: false,
-                        // NOTE: Static size for uniform buffers. GpuPointLight has a padded
+                        // NOTE (when no storage buffers): Static size for uniform buffers.
-                        // size of 64 bytes, so 16384 / 64 = 256 point lights max
+                        // GpuPointLight has a padded size of 64 bytes, so 16384 / 64 = 256
-                        min_binding_size: BufferSize::new(16384),
+                        // point lights max
                        min_binding_size: cluster_min_binding_size,
                    },
                    count: None,
                },
@ -347,10 +356,11 @@ impl FromWorld for MeshPipeline {
                    binding: 7,
                    visibility: ShaderStages::FRAGMENT,
                    ty: BindingType::Buffer {
-                        ty: BufferBindingType::Uniform,
+                        ty: clustered_forward_buffer_binding_type,
                        has_dynamic_offset: false,
-                        // NOTE: With 256 point lights max, indices need 8 bits so use u8
+                        // NOTE (when no storage buffers): With 256 point lights max, indices
-                        min_binding_size: BufferSize::new(16384),
+                        // need 8 bits so use u8
                        min_binding_size: cluster_min_binding_size,
                    },
                    count: None,
                },
@ -359,13 +369,14 @@ impl FromWorld for MeshPipeline {
                    binding: 8,
                    visibility: ShaderStages::FRAGMENT,
                    ty: BindingType::Buffer {
-                        ty: BufferBindingType::Uniform,
+                        ty: clustered_forward_buffer_binding_type,
                        has_dynamic_offset: false,
-                        // NOTE: The offset needs to address 16384 indices, which needs 14 bits.
+                        // NOTE (when no storage buffers): The offset needs to address 16384
-                        // The count can be at most all 256 lights so 8 bits.
+                        // indices, which needs 14 bits. The count can be at most all 256 lights
-                        // Pack the offset into the upper 24 bits and the count into the
+                        // so 8 bits.
-                        // lower 8 bits.
+                        // NOTE: Pack the offset into the upper 19 bits and the count into the
-                        min_binding_size: BufferSize::new(16384),
+                        // lower 13 bits.
                        min_binding_size: cluster_min_binding_size,
                    },
                    count: None,
                },
@ -457,6 +468,7 @@ impl FromWorld for MeshPipeline {
            view_layout,
            mesh_layout,
            skinned_mesh_layout,
            clustered_forward_buffer_binding_type,
            dummy_white_gpu_image,
        }
    }
@ -548,6 +560,18 @@ impl SpecializedMeshPipeline for MeshPipeline {
            vertex_attributes.push(Mesh::ATTRIBUTE_TANGENT.at_shader_location(3));
        }
        // TODO: consider exposing this in shaders in a more generally useful way, such as:
        // # if AVAILABLE_STORAGE_BUFFER_BINDINGS >= 3
        // /* use storage buffers here */
        // # else
        // /* use uniforms here */
        // # endif
        if !matches!(
            self.clustered_forward_buffer_binding_type,
            BufferBindingType::Storage { .. }
        ) {
            shader_defs.push(String::from("NO_STORAGE_BUFFERS_SUPPORT"));
        }
        let mut bind_group_layout = vec![self.view_layout.clone()];
        if layout.contains(Mesh::ATTRIBUTE_JOINT_INDEX)
            && layout.contains(Mesh::ATTRIBUTE_JOINT_WEIGHT)
@ -770,17 +794,11 @@ pub fn queue_mesh_view_bind_groups(
                    },
                    BindGroupEntry {
                        binding: 7,
-                        resource: view_cluster_bindings
+                        resource: view_cluster_bindings.light_index_lists_binding().unwrap(),
                            .cluster_light_index_lists
                            .binding()
                            .unwrap(),
                    },
                    BindGroupEntry {
                        binding: 8,
-                        resource: view_cluster_bindings
+                        resource: view_cluster_bindings.offsets_and_counts_binding().unwrap(),
                            .cluster_offsets_and_counts
                            .binding()
                            .unwrap(),
                    },
                ],
                label: Some("mesh_view_bind_group"),
--- a/crates/bevy_pbr/src/render/mesh_view_bind_group.wgsl
+++ b/crates/bevy_pbr/src/render/mesh_view_bind_group.wgsl
@ -57,20 +57,30 @@ struct Lights {
    n_directional_lights: u32;
 };
 #ifdef NO_STORAGE_BUFFERS_SUPPORT
 struct PointLights {
    // NOTE: Used when storage buffers are not supported. Uniform buffers need a statically-sized
    // array, so this path is limited to 256 point lights
    data: array<PointLight, 256u>;
 };
 struct ClusterLightIndexLists {
    // each u32 contains 4 u8 indices into the PointLights array
    data: array<vec4<u32>, 1024u>;
 };
 struct ClusterOffsetsAndCounts {
    // each u32 contains a 19-bit index into ClusterLightIndexLists in the high 19 bits
    // and a 13-bit count of the number of lights in the low 13 bits
    // (the split is defined by CLUSTER_COUNT_SIZE = 13 in pbr.wgsl)
    data: array<vec4<u32>, 1024u>;
 };
 #else
 struct PointLights {
    data: array<PointLight>;
 };
 struct ClusterLightIndexLists {
    // storage buffer path: each u32 is a single, unpacked index into the PointLights array
    data: array<u32>;
 };
 struct ClusterOffsetsAndCounts {
    // storage buffer path: one unpacked vec2<u32> per cluster, where component 0 is the offset
    // into ClusterLightIndexLists and component 1 is the number of lights in the cluster
    data: array<vec2<u32>>;
 };
 #endif
 [[group(0), binding(0)]]
 var<uniform> view: View;
@ -94,9 +104,19 @@ var directional_shadow_textures: texture_depth_2d_array;
 #endif
 [[group(0), binding(5)]]
 var directional_shadow_textures_sampler: sampler_comparison;
 #ifdef NO_STORAGE_BUFFERS_SUPPORT
 [[group(0), binding(6)]]
 var<uniform> point_lights: PointLights;
 [[group(0), binding(7)]]
 var<uniform> cluster_light_index_lists: ClusterLightIndexLists;
 [[group(0), binding(8)]]
 var<uniform> cluster_offsets_and_counts: ClusterOffsetsAndCounts;
 #else
 [[group(0), binding(6)]]
 var<storage> point_lights: PointLights;
 [[group(0), binding(7)]]
 var<storage> cluster_light_index_lists: ClusterLightIndexLists;
 [[group(0), binding(8)]]
 var<storage> cluster_offsets_and_counts: ClusterOffsetsAndCounts;
 #endif
--- a/crates/bevy_pbr/src/render/pbr.wgsl
+++ b/crates/bevy_pbr/src/render/pbr.wgsl
@ -264,29 +264,32 @@ fn fragment_cluster_index(frag_coord: vec2<f32>, view_z: f32, is_orthographic: b
    );
 }
 struct ClusterOffsetAndCount {
    offset: u32;
    count: u32;
 };
 // this must match CLUSTER_COUNT_SIZE in light.rs
 let CLUSTER_COUNT_SIZE = 13u;
-fn unpack_offset_and_count(cluster_index: u32) -> ClusterOffsetAndCount {
+fn unpack_offset_and_count(cluster_index: u32) -> vec2<u32> {
 #ifdef NO_STORAGE_BUFFERS_SUPPORT
    let offset_and_count = cluster_offsets_and_counts.data[cluster_index >> 2u][cluster_index & ((1u << 2u) - 1u)];
-    var output: ClusterOffsetAndCount;
+    return vec2<u32>(
-    // The offset is stored in the upper 24 bits
+        // The offset is stored in the upper 32 - CLUSTER_COUNT_SIZE = 19 bits
-    output.offset = (offset_and_count >> CLUSTER_COUNT_SIZE) & ((1u << 32u - CLUSTER_COUNT_SIZE) - 1u);
+        (offset_and_count >> CLUSTER_COUNT_SIZE) & ((1u << 32u - CLUSTER_COUNT_SIZE) - 1u),
-    // The count is stored in the lower 8 bits
+        // The count is stored in the lower CLUSTER_COUNT_SIZE = 13 bits
-    output.count = offset_and_count & ((1u << CLUSTER_COUNT_SIZE) - 1u);
+        offset_and_count & ((1u << CLUSTER_COUNT_SIZE) - 1u)
-    return output;
+    );
 #else
    return cluster_offsets_and_counts.data[cluster_index];
 #endif
 }
 // Returns the index into point_lights.data stored at position `index` of the cluster light
 // index lists, hiding the difference between the packed uniform layout and the storage layout.
 fn get_light_id(index: u32) -> u32 {
 #ifdef NO_STORAGE_BUFFERS_SUPPORT
    // The index is correct but in cluster_light_index_lists we pack 4 u8s into a u32
    // and 4 u32s into a vec4<u32>, so every array element holds 16 light indices.
    // This means index / 16 selects the vec4<u32> and (index / 4) % 4 selects the u32 within it
    let indices = cluster_light_index_lists.data[index >> 4u][(index >> 2u) & ((1u << 2u) - 1u)];
    // And index % 4 gives the sub-index of the u8 within the u32 so we shift by 8 * sub-index
    return (indices >> (8u * (index & ((1u << 2u) - 1u)))) & ((1u << 8u) - 1u);
 #else
    // With storage buffers every light index is stored unpacked in its own u32
    return cluster_light_index_lists.data[index];
 #endif
 }
 fn point_light(
@ -583,7 +586,7 @@ fn fragment(in: FragmentInput) -> [[location(0)]] vec4<f32> {
        ), in.world_position);
        let cluster_index = fragment_cluster_index(in.frag_coord.xy, view_z, is_orthographic);
        let offset_and_count = unpack_offset_and_count(cluster_index);
-        for (var i: u32 = offset_and_count.offset; i < offset_and_count.offset + offset_and_count.count; i = i + 1u) {
+        for (var i: u32 = offset_and_count[0]; i < offset_and_count[0] + offset_and_count[1]; i = i + 1u) {
            let light_id = get_light_id(i);
            let light = point_lights.data[light_id];
            var shadow: f32 = 1.0;
@ -637,9 +640,9 @@ fn fragment(in: FragmentInput) -> [[location(0)]] vec4<f32> {
        let cluster_overlay_alpha = 0.1;
        let max_light_complexity_per_cluster = 64.0;
        output_color.r = (1.0 - cluster_overlay_alpha) * output_color.r
-            + cluster_overlay_alpha * smoothStep(0.0, max_light_complexity_per_cluster, f32(offset_and_count.count));
+            + cluster_overlay_alpha * smoothStep(0.0, max_light_complexity_per_cluster, f32(offset_and_count[1]));
        output_color.g = (1.0 - cluster_overlay_alpha) * output_color.g
-            + cluster_overlay_alpha * (1.0 - smoothStep(0.0, max_light_complexity_per_cluster, f32(offset_and_count.count)));
+            + cluster_overlay_alpha * (1.0 - smoothStep(0.0, max_light_complexity_per_cluster, f32(offset_and_count[1])));
 #endif // CLUSTERED_FORWARD_DEBUG_CLUSTER_LIGHT_COMPLEXITY
 #ifdef CLUSTERED_FORWARD_DEBUG_CLUSTER_COHERENCY
        // NOTE: Visualizes the cluster to which the fragment belongs
--- a/crates/bevy_render/src/render_resource/storage_buffer.rs
+++ b/crates/bevy_render/src/render_resource/storage_buffer.rs
@ -1,13 +1,10 @@
-use std::num::NonZeroU64;
+use super::Buffer;
-
+use crate::renderer::{RenderDevice, RenderQueue};
 use bevy_crevice::std430::{self, AsStd430, Std430};
 use bevy_utils::tracing::warn;
 use std::num::NonZeroU64;
 use wgpu::{BindingResource, BufferBinding, BufferDescriptor, BufferUsages};
 use crate::renderer::{RenderDevice, RenderQueue};
 use super::Buffer;
 /// A helper for a storage buffer binding with a body, or a variable-sized array, or both.
 pub struct StorageBuffer<T: AsStd430, U: AsStd430 = ()> {
    body: U,
@ -126,4 +123,19 @@ impl<T: AsStd430, U: AsStd430> StorageBuffer<T, U> {
    pub fn values_mut(&mut self) -> &mut [T] {
        &mut self.values
    }
    /// Removes every element from the variable-sized array of values.
    ///
    /// The body is left unchanged; only the in-memory values are touched here.
    #[inline]
    pub fn clear(&mut self) {
        self.values.clear();
    }
    /// Appends a single `value` to the end of the variable-sized array of values.
    #[inline]
    pub fn push(&mut self, value: T) {
        self.values.push(value);
    }
    /// Moves all elements out of `values` onto the end of the variable-sized array of values,
    /// leaving `values` empty.
    #[inline]
    pub fn append(&mut self, values: &mut Vec<T>) {
        self.values.append(values);
    }
 }
--- a/crates/bevy_render/src/renderer/render_device.rs
+++ b/crates/bevy_render/src/renderer/render_device.rs
@ -4,7 +4,7 @@ use crate::render_resource::{
 };
 use futures_lite::future;
 use std::sync::Arc;
-use wgpu::util::DeviceExt;
+use wgpu::{util::DeviceExt, BufferBindingType};
 use super::RenderQueue;
@ -184,4 +184,15 @@ impl RenderDevice {
        let padded_bytes_per_row_padding = (align - row_bytes % align) % align;
        row_bytes + padded_bytes_per_row_padding
    }
    /// Chooses the buffer binding type to use for read-only data, given how many storage
    /// buffers a single shader stage would need.
    ///
    /// Returns a read-only [`BufferBindingType::Storage`] when the device limit
    /// `max_storage_buffers_per_shader_stage` is at least `buffers_per_shader_stage`, and
    /// [`BufferBindingType::Uniform`] otherwise.
    pub fn get_supported_read_only_binding_type(
        &self,
        buffers_per_shader_stage: u32,
    ) -> BufferBindingType {
        let available_storage_buffers = self.limits().max_storage_buffers_per_shader_stage;
        // Not enough storage buffer slots for the caller: fall back to uniform buffers
        if available_storage_buffers < buffers_per_shader_stage {
            return BufferBindingType::Uniform;
        }
        BufferBindingType::Storage { read_only: true }
    }
 }
--- a/crates/bevy_sprite/src/mesh2d/material.rs
+++ b/crates/bevy_sprite/src/mesh2d/material.rs
@ -86,7 +86,11 @@ impl<M: Material2d> SpecializedMaterial2d for M {
    type Key = ();
    #[inline]
-    fn key(_material: &<Self as RenderAsset>::PreparedAsset) -> Self::Key {}
+    fn key(
        _render_device: &RenderDevice,
        _material: &<Self as RenderAsset>::PreparedAsset,
    ) -> Self::Key {
    }
    #[inline]
    fn specialize(
@ -136,7 +140,10 @@ pub trait SpecializedMaterial2d: Asset + RenderAsset {
    /// Extract the [`SpecializedMaterial2d::Key`] for the "prepared" version of this material. This key will be
    /// passed in to the [`SpecializedMaterial2d::specialize`] function when compiling the [`RenderPipeline`](bevy_render::render_resource::RenderPipeline)
    /// for a given entity's material.
-    fn key(material: &<Self as RenderAsset>::PreparedAsset) -> Self::Key;
+    fn key(
        render_device: &RenderDevice,
        material: &<Self as RenderAsset>::PreparedAsset,
    ) -> Self::Key;
    /// Specializes the given `descriptor` according to the given `key`.
    fn specialize(
@ -292,6 +299,7 @@ pub fn queue_material2d_meshes<M: SpecializedMaterial2d>(
    material2d_pipeline: Res<Material2dPipeline<M>>,
    mut pipelines: ResMut<SpecializedMeshPipelines<Material2dPipeline<M>>>,
    mut pipeline_cache: ResMut<PipelineCache>,
    render_device: Res<RenderDevice>,
    msaa: Res<Msaa>,
    render_meshes: Res<RenderAssets<Mesh>>,
    render_materials: Res<RenderAssets<M>>,
@ -301,6 +309,7 @@ pub fn queue_material2d_meshes<M: SpecializedMaterial2d>(
    if material2d_meshes.is_empty() {
        return;
    }
    let render_device = render_device.into_inner();
    for (visible_entities, mut transparent_phase) in views.iter_mut() {
        let draw_transparent_pbr = transparent_draw_functions
            .read()
@ -318,7 +327,7 @@ pub fn queue_material2d_meshes<M: SpecializedMaterial2d>(
                        let mesh_key = msaa_key
                            | Mesh2dPipelineKey::from_primitive_topology(mesh.primitive_topology);
-                        let material_key = M::key(material2d);
+                        let material_key = M::key(render_device, material2d);
                        let pipeline_id = pipelines.specialize(
                            &mut pipeline_cache,
                            &material2d_pipeline,
--- a/examples/README.md
+++ b/examples/README.md
@ -443,4 +443,5 @@ cargo run --release --example <example name>
 Example | File | Description
 --- | --- | ---
 `many_lights` | [`stress_tests/many_lights.rs`](./stress_tests/many_lights.rs) | Simple benchmark to test rendering many point lights. Run with `WGPU_SETTINGS_PRIO=webgl2` to restrict to uniform buffers and max 256 lights.
 `transform_hierarchy` | [`stress_tests/transform_hierarchy.rs`](./stress_tests/transform_hierarchy.rs) | Various test cases for hierarchy and transform propagation performance
--- a/examples/shader/custom_vertex_attribute.rs
+++ b/examples/shader/custom_vertex_attribute.rs
@ -137,6 +137,7 @@ impl Material for CustomMaterial {
    }
    fn specialize(
        _pipeline: &MaterialPipeline<Self>,
        descriptor: &mut RenderPipelineDescriptor,
        layout: &MeshVertexBufferLayout,
    ) -> Result<(), SpecializedMeshPipelineError> {
--- a/examples/shader/shader_material_glsl.rs
+++ b/examples/shader/shader_material_glsl.rs
@ -97,6 +97,7 @@ impl SpecializedMaterial for CustomMaterial {
    fn key(_: &<CustomMaterial as RenderAsset>::PreparedAsset) -> Self::Key {}
    fn specialize(
        _pipeline: &MaterialPipeline<Self>,
        descriptor: &mut RenderPipelineDescriptor,
        _: Self::Key,
        _layout: &MeshVertexBufferLayout,
--- a/examples/stress_tests/many_lights.rs
+++ b/examples/stress_tests/many_lights.rs
@ -0,0 +1,166 @@
 use bevy::{
    diagnostic::{FrameTimeDiagnosticsPlugin, LogDiagnosticsPlugin},
    math::{DVec2, DVec3},
    pbr::{ExtractedPointLight, GlobalLightMeta},
    prelude::*,
    render::{RenderApp, RenderStage},
 };
 // Entry point: configures the window, registers the diagnostics plugins and the systems of
 // this stress test, then runs the app.
 fn main() {
    let window = WindowDescriptor {
        width: 1024.0,
        height: 768.0,
        title: "many_lights".to_string(),
        present_mode: bevy::window::PresentMode::Immediate,
        ..default()
    };
    let mut app = App::new();
    app.insert_resource(window)
        .add_plugins(DefaultPlugins)
        // Frame time diagnostics are collected and written to the log
        .add_plugin(FrameTimeDiagnosticsPlugin::default())
        .add_plugin(LogDiagnosticsPlugin::default())
        .add_startup_system(setup)
        .add_system(move_camera)
        .add_system(print_light_count)
        .add_plugin(LogVisibleLights);
    app.run();
 }
 // Startup system that builds the scene: a large sphere scaled by -1 (presumably so that its
 // surface is visible from the inside), `N_LIGHTS` point lights distributed evenly over that
 // sphere, a camera, and a single cube used as a visual reference.
 fn setup(
    mut commands: Commands,
    mut meshes: ResMut<Assets<Mesh>>,
    mut materials: ResMut<Assets<StandardMaterial>>,
 ) {
    const LIGHT_RADIUS: f32 = 0.3;
    const LIGHT_INTENSITY: f32 = 5.0;
    const RADIUS: f32 = 50.0;
    const N_LIGHTS: usize = 100_000;
    commands.spawn_bundle(PbrBundle {
        mesh: meshes.add(Mesh::from(shape::Icosphere {
            radius: RADIUS,
            subdivisions: 9,
        })),
        material: materials.add(StandardMaterial::from(Color::WHITE)),
        transform: Transform::from_scale(Vec3::splat(-1.0)),
        ..default()
    });
    let mesh = meshes.add(Mesh::from(shape::Cube { size: 1.0 }));
    let material = materials.add(StandardMaterial {
        base_color: Color::PINK,
        ..default()
    });
    // NOTE: This pattern is good for testing performance of culling as it provides roughly
    // the same number of visible lights regardless of the viewing angle.
    // NOTE: f64 is used to avoid precision issues that produce visual artifacts in the distribution
    let golden_ratio = 0.5f64 * (1.0f64 + 5.0f64.sqrt());
    // Lossless f32 -> f64 widening, done once instead of on every loop iteration
    let sphere_radius = f64::from(RADIUS);
    for i in 0..N_LIGHTS {
        let spherical_polar_theta_phi = fibonacci_spiral_on_sphere(golden_ratio, i, N_LIGHTS);
        let unit_sphere_p = spherical_polar_to_cartesian(spherical_polar_theta_phi);
        commands.spawn_bundle(PointLightBundle {
            point_light: PointLight {
                range: LIGHT_RADIUS,
                intensity: LIGHT_INTENSITY,
                ..default()
            },
            transform: Transform::from_translation((sphere_radius * unit_sphere_p).as_vec3()),
            ..default()
        });
    }
    // camera
    commands.spawn_bundle(PerspectiveCameraBundle::default());
    // add a single cube at distance RADIUS along +Y
    // it serves as a reference point during rotation
    commands.spawn_bundle(PbrBundle {
        mesh,
        material,
        transform: Transform {
            // RADIUS is already an f32, so no cast is needed here
            translation: Vec3::new(0.0, RADIUS, 0.0),
            scale: Vec3::splat(5.0),
            ..default()
        },
        ..default()
    });
 }
 // NOTE: This epsilon value is apparently optimal for optimizing for the average
 // nearest-neighbor distance. See:
 // http://extremelearning.com.au/how-to-evenly-distribute-points-on-a-sphere-more-effectively-than-the-canonical-fibonacci-lattice/
 // for details.
 const EPSILON: f64 = 0.36;
 // Returns the spherical polar angles of point `i` out of `n` on a Fibonacci spiral, offset by
 // EPSILON. The x component is the angle around the sphere (theta) and the y component is the
 // angle from the pole (phi), matching what spherical_polar_to_cartesian expects.
 fn fibonacci_spiral_on_sphere(golden_ratio: f64, i: usize, n: usize) -> DVec2 {
    let point_index = i as f64;
    let point_count = n as f64;
    let theta = 2.0 * std::f64::consts::PI * (point_index / golden_ratio);
    let cos_phi = 1.0 - 2.0 * (point_index + EPSILON) / (point_count - 1.0 + 2.0 * EPSILON);
    DVec2::new(theta, cos_phi.acos())
 }
 // Converts spherical polar angles (p.x = theta, p.y = phi) into a point on the unit sphere.
 fn spherical_polar_to_cartesian(p: DVec2) -> DVec3 {
    let theta = p.x;
    let phi = p.y;
    let (theta_sin, theta_cos) = theta.sin_cos();
    let (phi_sin, phi_cos) = phi.sin_cos();
    let x = theta_cos * phi_sin;
    let y = theta_sin * phi_sin;
    let z = phi_cos;
    DVec3::new(x, y, z)
 }
 // System that turns the camera a little every frame: first a rotation about Z, then one
 // about X, both by the same frame-time-scaled angle
 fn move_camera(time: Res<Time>, mut camera_query: Query<&mut Transform, With<Camera>>) {
    let mut camera_transform = camera_query.single_mut();
    let about_z = Quat::from_rotation_z(time.delta_seconds() * 0.15);
    camera_transform.rotate(about_z);
    let about_x = Quat::from_rotation_x(time.delta_seconds() * 0.15);
    camera_transform.rotate(about_x);
 }
 // System for printing the number of point lights in the world every time the timer finishes
 fn print_light_count(time: Res<Time>, mut timer: Local<PrintingTimer>, lights: Query<&PointLight>) {
    timer.0.tick(time.delta());
    if !timer.0.just_finished() {
        return;
    }
    let light_count = lights.iter().len();
    info!("Lights: {}", light_count);
 }
 // Plugin that adds the systems for logging light counts to the render sub-app
 struct LogVisibleLights;
 impl Plugin for LogVisibleLights {
    // Registers `extract_time` in the Extract stage and `print_visible_light_count` in the
    // Prepare stage of the render sub-app. Does nothing if there is no render sub-app.
    fn build(&self, app: &mut App) {
        if let Ok(render_app) = app.get_sub_app_mut(RenderApp) {
            render_app
                .add_system_to_stage(RenderStage::Extract, extract_time)
                .add_system_to_stage(RenderStage::Prepare, print_visible_light_count);
        }
    }
 }
 // System for printing, every time the timer finishes, the number of extracted point lights
 // and the number of lights that have an index in `GlobalLightMeta`
 fn print_visible_light_count(
    time: Res<Time>,
    mut timer: Local<PrintingTimer>,
    visible: Query<&ExtractedPointLight>,
    global_light_meta: Res<GlobalLightMeta>,
 ) {
    timer.0.tick(time.delta());
    if !timer.0.just_finished() {
        return;
    }
    let visible_count = visible.iter().len();
    let rendered_count = global_light_meta.entity_to_index.len();
    info!(
        "Visible Lights: {}, Rendered Lights: {}",
        visible_count, rendered_count
    );
 }
 fn extract_time(mut commands: Commands, time: Res<Time>) {
    commands.insert_resource(time.into_inner().clone());
 }
 // Newtype around `Timer` so the printing systems can keep one as `Local` state
 struct PrintingTimer(Timer);
 impl Default for PrintingTimer {
    // Builds a timer of 1.0 seconds; the `true` argument is presumably the repeating flag
    fn default() -> Self {
        let one_second_timer = Timer::from_seconds(1.0, true);
        PrintingTimer(one_second_timer)
    }
 }