Use instancing for sprites (#9597)

# Objective

- Supersedes #8872
- Improve sprite rendering performance after the regression in #9236 

## Solution

- Use an instance-rate vertex buffer to store per-instance data.
- Store color, UV offset and scale, and a transform per instance.
- Convert Sprite rect, custom_size, anchor, and flip_x/_y to an affine
3x4 matrix and store the transpose of that in the per-instance data.
This is similar to how MeshUniform uses transpose affine matrices.
- Use a special index buffer that has batches of 6 indices referencing 4
vertices. The lower 2 bits indicate the x and y of a quad such that the
corners are:
  ```
  10    11

  00    01
  ```
UVs are implicit but get modified by UV offset and scale. The remaining
upper bits contain the instance index.

## Benchmarks

I will compare versus `main` before #9236 because the results should be
as good as or faster than that. Running `bevymark -- 10000 16` on an M1
Max with `main` at `e8b38925` in yellow, this PR in red:

![Screenshot 2023-08-27 at 18 44
10](https://github.com/bevyengine/bevy/assets/302146/bdc5c929-d547-44bb-b519-20dce676a316)

Looking at the median frame times, that's a 37% reduction from before.

---

## Changelog

- Changed: Improved sprite rendering performance by leveraging an
instance-rate vertex buffer.

---------

Co-authored-by: Giacomo Stevanato <giaco.stevanato@gmail.com>
This commit is contained in:
Robert Swain 2023-09-02 20:03:19 +02:00 committed by GitHub
parent 40c6b3b91e
commit 4fdea02087
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 280 additions and 231 deletions

View File

@ -4,26 +4,7 @@
#import bevy_pbr::mesh_bindings mesh
#import bevy_pbr::mesh_types MESH_FLAGS_SIGN_DETERMINANT_MODEL_3X3_BIT
#import bevy_render::instance_index get_instance_index
fn affine_to_square(affine: mat3x4<f32>) -> mat4x4<f32> {
return transpose(mat4x4<f32>(
affine[0],
affine[1],
affine[2],
vec4<f32>(0.0, 0.0, 0.0, 1.0),
));
}
fn mat2x4_f32_to_mat3x3_unpack(
a: mat2x4<f32>,
b: f32,
) -> mat3x3<f32> {
return mat3x3<f32>(
a[0].xyz,
vec3<f32>(a[0].w, a[1].xy),
vec3<f32>(a[1].zw, b),
);
}
#import bevy_render::maths affine_to_square, mat2x4_f32_to_mat3x3_unpack
fn get_model_matrix(instance_index: u32) -> mat4x4<f32> {
return affine_to_square(mesh[get_instance_index(instance_index)].model);

View File

@ -2,7 +2,7 @@
struct Mesh {
// Affine 4x3 matrices transposed to 3x4
// Use bevy_pbr::mesh_functions::affine_to_square to unpack
// Use bevy_render::maths::affine_to_square to unpack
model: mat3x4<f32>,
previous_model: mat3x4<f32>,
// 3x3 matrix packed in mat2x4 and f32 as:

View File

@ -234,6 +234,8 @@ pub struct RenderApp;
pub const INSTANCE_INDEX_SHADER_HANDLE: HandleUntyped =
HandleUntyped::weak_from_u64(Shader::TYPE_UUID, 10313207077636615845);
pub const MATHS_SHADER_HANDLE: HandleUntyped =
HandleUntyped::weak_from_u64(Shader::TYPE_UUID, 10665356303104593376);
impl Plugin for RenderPlugin {
/// Initializes the renderer, sets up the [`RenderSet`](RenderSet) and creates the rendering sub-app.
@ -391,6 +393,7 @@ impl Plugin for RenderPlugin {
"BASE_INSTANCE_WORKAROUND".into()
]
);
load_internal_asset!(app, MATHS_SHADER_HANDLE, "maths.wgsl", Shader::from_wgsl);
if let Some(future_renderer_resources) =
app.world.remove_resource::<FutureRendererResources>()
{

View File

@ -0,0 +1,21 @@
#define_import_path bevy_render::maths
// Expands an affine transform stored as a transposed 3x4 matrix into a full 4x4 matrix.
// The three vec4 columns of `affine` plus (0, 0, 0, 1) are assembled into a mat4x4, which is
// then transposed: each input column becomes a row of the result and the last row is
// (0, 0, 0, 1).
fn affine_to_square(affine: mat3x4<f32>) -> mat4x4<f32> {
return transpose(mat4x4<f32>(
affine[0],
affine[1],
affine[2],
vec4<f32>(0.0, 0.0, 0.0, 1.0),
));
}
// Rebuilds a 3x3 matrix from nine floats packed as a mat2x4 (`a`, eight values) plus one
// trailing f32 (`b`). The values are consumed in order, three per output column.
fn mat2x4_f32_to_mat3x3_unpack(
a: mat2x4<f32>,
b: f32,
) -> mat3x3<f32> {
return mat3x3<f32>(
// Column 0: first three components of a[0].
a[0].xyz,
// Column 1: last component of a[0] followed by the first two of a[1].
vec3<f32>(a[0].w, a[1].xy),
// Column 2: last two components of a[1] followed by `b`.
vec3<f32>(a[1].zw, b),
);
}

View File

@ -144,6 +144,14 @@ impl<T: Pod> BufferVec<T> {
pub fn clear(&mut self) {
self.values.clear();
}
/// Returns a shared reference to the `Vec` of values held by this `BufferVec`.
pub fn values(&self) -> &Vec<T> {
&self.values
}
/// Returns a mutable reference to the `Vec` of values held by this `BufferVec`,
/// allowing the caller to modify its contents directly.
pub fn values_mut(&mut self) -> &mut Vec<T> {
&mut self.values
}
}
impl<T: Pod> Extend<T> for BufferVec<T> {

View File

@ -14,7 +14,7 @@ use bevy_ecs::{
storage::SparseSet,
system::{lifetimeless::*, SystemParamItem, SystemState},
};
use bevy_math::{Rect, Vec2};
use bevy_math::{Affine3A, Quat, Rect, Vec2, Vec4};
use bevy_render::{
color::Color,
render_asset::RenderAssets,
@ -201,26 +201,7 @@ impl SpecializedRenderPipeline for SpritePipeline {
type Key = SpritePipelineKey;
fn specialize(&self, key: Self::Key) -> RenderPipelineDescriptor {
let mut formats = vec![
// position
VertexFormat::Float32x3,
// uv
VertexFormat::Float32x2,
];
if key.contains(SpritePipelineKey::COLORED) {
// color
formats.push(VertexFormat::Float32x4);
}
let vertex_layout =
VertexBufferLayout::from_vertex_formats(VertexStepMode::Vertex, formats);
let mut shader_defs = Vec::new();
if key.contains(SpritePipelineKey::COLORED) {
shader_defs.push("COLORED".into());
}
if key.contains(SpritePipelineKey::TONEMAP_IN_SHADER) {
shader_defs.push("TONEMAP_IN_SHADER".into());
@ -256,12 +237,49 @@ impl SpecializedRenderPipeline for SpritePipeline {
false => TextureFormat::bevy_default(),
};
let instance_rate_vertex_buffer_layout = VertexBufferLayout {
array_stride: 80,
step_mode: VertexStepMode::Instance,
attributes: vec![
// @location(0) i_model_transpose_col0: vec4<f32>,
VertexAttribute {
format: VertexFormat::Float32x4,
offset: 0,
shader_location: 0,
},
// @location(1) i_model_transpose_col1: vec4<f32>,
VertexAttribute {
format: VertexFormat::Float32x4,
offset: 16,
shader_location: 1,
},
// @location(2) i_model_transpose_col2: vec4<f32>,
VertexAttribute {
format: VertexFormat::Float32x4,
offset: 32,
shader_location: 2,
},
// @location(3) i_color: vec4<f32>,
VertexAttribute {
format: VertexFormat::Float32x4,
offset: 48,
shader_location: 3,
},
// @location(4) i_uv_offset_scale: vec4<f32>,
VertexAttribute {
format: VertexFormat::Float32x4,
offset: 64,
shader_location: 4,
},
],
};
RenderPipelineDescriptor {
vertex: VertexState {
shader: SPRITE_SHADER_HANDLE.typed::<Shader>(),
entry_point: "vertex".into(),
shader_defs: shader_defs.clone(),
buffers: vec![vertex_layout],
buffers: vec![instance_rate_vertex_buffer_layout],
},
fragment: Some(FragmentState {
shader: SPRITE_SHADER_HANDLE.typed::<Shader>(),
@ -365,6 +383,8 @@ pub fn extract_sprites(
)>,
>,
) {
extracted_sprites.sprites.clear();
for (entity, view_visibility, sprite, transform, handle) in sprite_query.iter() {
if !view_visibility.get() {
continue;
@ -425,57 +445,50 @@ pub fn extract_sprites(
#[repr(C)]
#[derive(Copy, Clone, Pod, Zeroable)]
struct SpriteVertex {
pub position: [f32; 3],
pub uv: [f32; 2],
struct SpriteInstance {
// Affine 4x3 transposed to 3x4
pub i_model_transpose: [Vec4; 3],
pub i_color: [f32; 4],
pub i_uv_offset_scale: [f32; 4],
}
#[repr(C)]
#[derive(Copy, Clone, Pod, Zeroable)]
struct ColoredSpriteVertex {
pub position: [f32; 3],
pub uv: [f32; 2],
pub color: [f32; 4],
impl SpriteInstance {
/// Builds the per-instance data from a model transform, a color and a UV offset/scale.
///
/// This is an inherent constructor, not an implementation of the `From` trait.
///
/// The 3x3 part of `transform` is transposed and each of its axes is extended with the
/// matching translation component, producing the three `Vec4`s stored in
/// `i_model_transpose`. `color` is converted with `as_linear_rgba_f32` and
/// `uv_offset_scale` is stored as a plain `[f32; 4]`.
#[inline]
fn from(transform: &Affine3A, color: &Color, uv_offset_scale: &Vec4) -> Self {
// Transposing first means each axis below carries one x/y/z component per basis vector,
// to which the translation component of the same coordinate is appended.
let transpose_model_3x3 = transform.matrix3.transpose();
Self {
i_model_transpose: [
transpose_model_3x3.x_axis.extend(transform.translation.x),
transpose_model_3x3.y_axis.extend(transform.translation.y),
transpose_model_3x3.z_axis.extend(transform.translation.z),
],
i_color: color.as_linear_rgba_f32(),
i_uv_offset_scale: uv_offset_scale.to_array(),
}
}
}
#[derive(Resource)]
pub struct SpriteMeta {
vertices: BufferVec<SpriteVertex>,
colored_vertices: BufferVec<ColoredSpriteVertex>,
view_bind_group: Option<BindGroup>,
sprite_index_buffer: BufferVec<u32>,
sprite_instance_buffer: BufferVec<SpriteInstance>,
}
impl Default for SpriteMeta {
fn default() -> Self {
Self {
vertices: BufferVec::new(BufferUsages::VERTEX),
colored_vertices: BufferVec::new(BufferUsages::VERTEX),
view_bind_group: None,
sprite_index_buffer: BufferVec::<u32>::new(BufferUsages::INDEX),
sprite_instance_buffer: BufferVec::<SpriteInstance>::new(BufferUsages::VERTEX),
}
}
}
const QUAD_INDICES: [usize; 6] = [0, 2, 3, 0, 1, 2];
const QUAD_VERTEX_POSITIONS: [Vec2; 4] = [
Vec2::new(-0.5, -0.5),
Vec2::new(0.5, -0.5),
Vec2::new(0.5, 0.5),
Vec2::new(-0.5, 0.5),
];
const QUAD_UVS: [Vec2; 4] = [
Vec2::new(0., 1.),
Vec2::new(1., 1.),
Vec2::new(1., 0.),
Vec2::new(0., 0.),
];
#[derive(Component)]
#[derive(Component, PartialEq, Eq, Clone)]
pub struct SpriteBatch {
range: Range<u32>,
image_handle_id: HandleId,
colored: bool,
range: Range<u32>,
}
#[derive(Resource, Default)]
@ -591,7 +604,7 @@ pub fn prepare_sprites(
sprite_pipeline: Res<SpritePipeline>,
mut image_bind_groups: ResMut<ImageBindGroups>,
gpu_images: Res<RenderAssets<Image>>,
mut extracted_sprites: ResMut<ExtractedSprites>,
extracted_sprites: Res<ExtractedSprites>,
mut phases: Query<&mut RenderPhase<Transparent2d>>,
events: Res<SpriteAssetEvents>,
) {
@ -607,11 +620,9 @@ pub fn prepare_sprites(
if let Some(view_binding) = view_uniforms.uniforms.binding() {
let mut batches: Vec<(Entity, SpriteBatch)> = Vec::with_capacity(*previous_len);
let sprite_meta = &mut sprite_meta;
// Clear the vertex buffers
sprite_meta.vertices.clear();
sprite_meta.colored_vertices.clear();
// Clear the sprite instances
sprite_meta.sprite_instance_buffer.clear();
sprite_meta.view_bind_group = Some(render_device.create_bind_group(&BindGroupDescriptor {
entries: &[BindGroupEntry {
@ -622,9 +633,8 @@ pub fn prepare_sprites(
layout: &sprite_pipeline.view_layout,
}));
// Vertex buffer indices
// Index buffer indices
let mut index = 0;
let mut colored_index = 0;
let image_bind_groups = &mut *image_bind_groups;
@ -632,144 +642,150 @@ pub fn prepare_sprites(
let mut batch_item_index = 0;
let mut batch_image_size = Vec2::ZERO;
let mut batch_image_handle = HandleId::Id(Uuid::nil(), u64::MAX);
let mut batch_colored = false;
// Iterate through the phase items and detect when successive sprites that can be batched.
// Spawn an entity with a `SpriteBatch` component for each possible batch.
// Compatible items share the same entity.
for item_index in 0..transparent_phase.items.len() {
let item = &mut transparent_phase.items[item_index];
if let Some(extracted_sprite) = extracted_sprites.sprites.get(item.entity) {
// Take a reference to an existing compatible batch if one exists
let mut existing_batch = batches.last_mut().filter(|_| {
batch_image_handle == extracted_sprite.image_handle_id
&& batch_colored == (extracted_sprite.color != Color::WHITE)
});
if existing_batch.is_none() {
if let Some(gpu_image) =
gpu_images.get(&Handle::weak(extracted_sprite.image_handle_id))
{
batch_item_index = item_index;
batch_image_size = Vec2::new(gpu_image.size.x, gpu_image.size.y);
batch_image_handle = extracted_sprite.image_handle_id;
batch_colored = extracted_sprite.color != Color::WHITE;
let new_batch = SpriteBatch {
range: if batch_colored {
colored_index..colored_index
} else {
index..index
},
colored: batch_colored,
image_handle_id: batch_image_handle,
};
batches.push((item.entity, new_batch));
image_bind_groups
.values
.entry(Handle::weak(batch_image_handle))
.or_insert_with(|| {
render_device.create_bind_group(&BindGroupDescriptor {
entries: &[
BindGroupEntry {
binding: 0,
resource: BindingResource::TextureView(
&gpu_image.texture_view,
),
},
BindGroupEntry {
binding: 1,
resource: BindingResource::Sampler(
&gpu_image.sampler,
),
},
],
label: Some("sprite_material_bind_group"),
layout: &sprite_pipeline.material_layout,
})
});
existing_batch = batches.last_mut();
} else {
continue;
}
}
// Calculate vertex data for this item
let mut uvs = QUAD_UVS;
if extracted_sprite.flip_x {
uvs = [uvs[1], uvs[0], uvs[3], uvs[2]];
}
if extracted_sprite.flip_y {
uvs = [uvs[3], uvs[2], uvs[1], uvs[0]];
}
// By default, the size of the quad is the size of the texture
let mut quad_size = batch_image_size;
// If a rect is specified, adjust UVs and the size of the quad
if let Some(rect) = extracted_sprite.rect {
let rect_size = rect.size();
for uv in &mut uvs {
*uv = (rect.min + *uv * rect_size) / batch_image_size;
}
quad_size = rect_size;
}
// Override the size if a custom one is specified
if let Some(custom_size) = extracted_sprite.custom_size {
quad_size = custom_size;
}
// Apply size and global transform
let positions = QUAD_VERTEX_POSITIONS.map(|quad_pos| {
extracted_sprite
.transform
.transform_point(
((quad_pos - extracted_sprite.anchor) * quad_size).extend(0.),
)
.into()
});
// Store the vertex data and add the item to the render phase
if batch_colored {
let vertex_color = extracted_sprite.color.as_linear_rgba_f32();
for i in QUAD_INDICES {
sprite_meta.colored_vertices.push(ColoredSpriteVertex {
position: positions[i],
uv: uvs[i].into(),
color: vertex_color,
});
}
colored_index += QUAD_INDICES.len() as u32;
existing_batch.unwrap().1.range.end = colored_index;
} else {
for i in QUAD_INDICES {
sprite_meta.vertices.push(SpriteVertex {
position: positions[i],
uv: uvs[i].into(),
});
}
index += QUAD_INDICES.len() as u32;
existing_batch.unwrap().1.range.end = index;
}
transparent_phase.items[batch_item_index].batch_size += 1;
} else {
let item = &transparent_phase.items[item_index];
let Some(extracted_sprite) = extracted_sprites.sprites.get(item.entity) else {
// If there is a phase item that is not a sprite, then we must start a new
// batch to draw the other phase item(s) and to respect draw order. This can be
// done by invalidating the batch_image_handle
batch_image_handle = HandleId::Id(Uuid::nil(), u64::MAX);
continue;
};
let batch_image_changed = batch_image_handle != extracted_sprite.image_handle_id;
if batch_image_changed {
let Some(gpu_image) =
gpu_images.get(&Handle::weak(extracted_sprite.image_handle_id))
else {
continue;
};
batch_image_size = Vec2::new(gpu_image.size.x, gpu_image.size.y);
batch_image_handle = extracted_sprite.image_handle_id;
image_bind_groups
.values
.entry(Handle::weak(batch_image_handle))
.or_insert_with(|| {
render_device.create_bind_group(&BindGroupDescriptor {
entries: &[
BindGroupEntry {
binding: 0,
resource: BindingResource::TextureView(
&gpu_image.texture_view,
),
},
BindGroupEntry {
binding: 1,
resource: BindingResource::Sampler(&gpu_image.sampler),
},
],
label: Some("sprite_material_bind_group"),
layout: &sprite_pipeline.material_layout,
})
});
}
// By default, the size of the quad is the size of the texture
let mut quad_size = batch_image_size;
// Calculate vertex data for this item
let mut uv_offset_scale: Vec4;
// If a rect is specified, adjust UVs and the size of the quad
if let Some(rect) = extracted_sprite.rect {
let rect_size = rect.size();
uv_offset_scale = Vec4::new(
rect.min.x / batch_image_size.x,
rect.max.y / batch_image_size.y,
rect_size.x / batch_image_size.x,
-rect_size.y / batch_image_size.y,
);
quad_size = rect_size;
} else {
uv_offset_scale = Vec4::new(0.0, 1.0, 1.0, -1.0);
}
if extracted_sprite.flip_x {
uv_offset_scale.x += uv_offset_scale.z;
uv_offset_scale.z *= -1.0;
}
if extracted_sprite.flip_y {
uv_offset_scale.y += uv_offset_scale.w;
uv_offset_scale.w *= -1.0;
}
// Override the size if a custom one is specified
if let Some(custom_size) = extracted_sprite.custom_size {
quad_size = custom_size;
}
let transform = extracted_sprite.transform.affine()
* Affine3A::from_scale_rotation_translation(
quad_size.extend(1.0),
Quat::IDENTITY,
(quad_size * (-extracted_sprite.anchor - Vec2::splat(0.5))).extend(0.0),
);
// Store the vertex data and add the item to the render phase
sprite_meta
.sprite_instance_buffer
.push(SpriteInstance::from(
&transform,
&extracted_sprite.color,
&uv_offset_scale,
));
if batch_image_changed {
batch_item_index = item_index;
batches.push((
item.entity,
SpriteBatch {
image_handle_id: batch_image_handle,
range: index..index,
},
));
}
transparent_phase.items[batch_item_index].batch_size += 1;
batches.last_mut().unwrap().1.range.end += 1;
index += 1;
}
}
sprite_meta
.vertices
.write_buffer(&render_device, &render_queue);
sprite_meta
.colored_vertices
.sprite_instance_buffer
.write_buffer(&render_device, &render_queue);
if sprite_meta.sprite_index_buffer.len() != 6 {
sprite_meta.sprite_index_buffer.clear();
// NOTE: This code is creating 6 indices pointing to 4 vertices.
// The vertices form the corners of a quad based on their two least significant bits.
// 10 11
//
// 00 01
// The sprite shader can then use the two least significant bits as the vertex index.
// The rest of the properties to transform the vertex positions and UVs (which are
// implicit) are baked into the instance transform, and UV offset and scale.
// See bevy_sprite/src/render/sprite.wgsl for the details.
sprite_meta.sprite_index_buffer.push(2);
sprite_meta.sprite_index_buffer.push(0);
sprite_meta.sprite_index_buffer.push(1);
sprite_meta.sprite_index_buffer.push(1);
sprite_meta.sprite_index_buffer.push(3);
sprite_meta.sprite_index_buffer.push(2);
sprite_meta
.sprite_index_buffer
.write_buffer(&render_device, &render_queue);
}
*previous_len = batches.len();
commands.insert_or_spawn_batch(batches);
}
extracted_sprites.sprites.clear();
}
pub type DrawSprite = (
@ -841,12 +857,20 @@ impl<P: PhaseItem> RenderCommand<P> for DrawSpriteBatch {
pass: &mut TrackedRenderPass<'w>,
) -> RenderCommandResult {
let sprite_meta = sprite_meta.into_inner();
if batch.colored {
pass.set_vertex_buffer(0, sprite_meta.colored_vertices.buffer().unwrap().slice(..));
} else {
pass.set_vertex_buffer(0, sprite_meta.vertices.buffer().unwrap().slice(..));
}
pass.draw(batch.range.clone(), 0..1);
pass.set_index_buffer(
sprite_meta.sprite_index_buffer.buffer().unwrap().slice(..),
0,
IndexFormat::Uint32,
);
pass.set_vertex_buffer(
0,
sprite_meta
.sprite_instance_buffer
.buffer()
.unwrap()
.slice(..),
);
pass.draw_indexed(0..6, 0, batch.range.clone());
RenderCommandResult::Success
}
}

View File

@ -2,33 +2,48 @@
#import bevy_core_pipeline::tonemapping
#endif
#import bevy_render::maths affine_to_square
#import bevy_render::view View
@group(0) @binding(0)
var<uniform> view: View;
// Inputs to the sprite vertex shader: the built-in vertex index plus per-instance attributes.
struct VertexInput {
// The vertex shader reads the two least significant bits of this index as the x and y
// coordinates of the quad corner.
@builtin(vertex_index) index: u32,
// NOTE: Instance-rate vertex buffer members prefixed with i_
// NOTE: i_model_transpose_colN are the 3 columns of a 3x4 matrix that is the transpose of the
// affine 4x3 model matrix.
@location(0) i_model_transpose_col0: vec4<f32>,
@location(1) i_model_transpose_col1: vec4<f32>,
@location(2) i_model_transpose_col2: vec4<f32>,
// Passed through to the fragment stage, where it is multiplied with the sampled texture color.
@location(3) i_color: vec4<f32>,
// xy is the UV offset and zw is the UV scale applied to the quad corner to produce the UV.
@location(4) i_uv_offset_scale: vec4<f32>,
}
struct VertexOutput {
@builtin(position) clip_position: vec4<f32>,
@location(0) uv: vec2<f32>,
#ifdef COLORED
@location(1) color: vec4<f32>,
#endif
@builtin(position) position: vec4<f32>,
@location(1) @interpolate(flat) color: vec4<f32>,
};
@vertex
fn vertex(
@location(0) vertex_position: vec3<f32>,
@location(1) vertex_uv: vec2<f32>,
#ifdef COLORED
@location(2) vertex_color: vec4<f32>,
#endif
) -> VertexOutput {
fn vertex(in: VertexInput) -> VertexOutput {
var out: VertexOutput;
out.uv = vertex_uv;
out.position = view.view_proj * vec4<f32>(vertex_position, 1.0);
#ifdef COLORED
out.color = vertex_color;
#endif
let vertex_position = vec3<f32>(
f32(in.index & 0x1u),
f32((in.index & 0x2u) >> 1u),
0.0
);
out.clip_position = view.view_proj * affine_to_square(mat3x4<f32>(
in.i_model_transpose_col0,
in.i_model_transpose_col1,
in.i_model_transpose_col2,
)) * vec4<f32>(vertex_position, 1.0);
out.uv = vec2<f32>(vertex_position.xy) * in.i_uv_offset_scale.zw + in.i_uv_offset_scale.xy;
out.color = in.i_color;
return out;
}
@ -39,10 +54,7 @@ var sprite_sampler: sampler;
@fragment
fn fragment(in: VertexOutput) -> @location(0) vec4<f32> {
var color = textureSample(sprite_texture, sprite_sampler, in.uv);
#ifdef COLORED
color = in.color * color;
#endif
var color = in.color * textureSample(sprite_texture, sprite_sampler, in.uv);
#ifdef TONEMAP_IN_SHADER
color = bevy_core_pipeline::tonemapping::tone_mapping(color, view.color_grading);