bevy/crates/bevy_render/src/render_resource/resource_macros.rs
Patrick Walton 4dadebd9c4
Improve performance by binning together opaque items instead of sorting them. (#12453)
Today, we sort all entities added to all phases, even the phases that
don't strictly need sorting, such as the opaque and shadow phases. This
results in a performance loss because our `PhaseItem`s are rather large
in memory, so sorting is slow. Additionally, determining the boundaries
of batches is an O(n) process.

This commit instead makes Bevy place phase items into *bins*
keyed by *bin keys*, which have the invariant that everything in the
same bin is potentially batchable. This makes determining batch
boundaries O(1), because everything in the same bin can be batched.
Instead of sorting each entity, we now sort only the bin keys. This
drops the sorting time to near-zero on workloads with few bins like
`many_cubes --no-frustum-culling`. Memory usage is improved too, with
batch boundaries and dynamic indices now implicit instead of explicit.
The improved memory usage results in a significant win even on
unbatchable workloads like `many_cubes --no-frustum-culling
--vary-per-instance`, presumably due to cache effects.
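
As a rough sketch of the scheme (with hypothetical key fields; the real bins and keys live in `bevy_render`'s render phase code), binning looks like this:

```rust
use std::collections::HashMap;

// Hypothetical stand-ins for illustration; Bevy's actual bin keys are
// per-phase types built from things like pipeline, mesh, and material ids.
type Entity = u64;

#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
struct BinKey {
    pipeline: u32,
    mesh: u32,
}

fn main() {
    // Queuing: drop each entity into the bin for its key; no per-item sort.
    let mut bins: HashMap<BinKey, Vec<Entity>> = HashMap::new();
    for (entity, key) in [
        (1, BinKey { pipeline: 0, mesh: 7 }),
        (2, BinKey { pipeline: 0, mesh: 7 }),
        (3, BinKey { pipeline: 1, mesh: 2 }),
    ] {
        bins.entry(key).or_default().push(entity);
    }

    // Sort only the bin keys, not the items: with few bins this is
    // near-free no matter how many entities were queued.
    let mut keys: Vec<BinKey> = bins.keys().copied().collect();
    keys.sort_unstable();

    // Batch boundaries are implicit: everything in one bin is batchable
    // by construction, so each bin simply becomes one batch.
    for key in &keys {
        println!("batch {key:?}: entities {:?}", bins[key]);
    }
}
```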

Not all phases can be binned; some, such as transparent and transmissive
phases, must still be sorted. To handle this, this commit splits
`PhaseItem` into `BinnedPhaseItem` and `SortedPhaseItem`. Most of the
logic that today deals with `PhaseItem`s has been moved to
`SortedPhaseItem`. `BinnedPhaseItem` has the new logic.
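
In simplified, illustrative form (these are not the exact Bevy trait definitions, which also carry draw functions, batch ranges, and so on), the split looks roughly like:

```rust
#![allow(dead_code)]

use std::hash::Hash;

/// Sketch of the binned flavor: items sharing a bin key are batchable,
/// so the phase orders keys rather than comparing items.
trait BinnedItemSketch {
    type BinKey: Clone + Eq + Ord + Hash;
}

/// Sketch of the sorted flavor: each item yields a sort key and the whole
/// phase is sorted item by item, as transparency requires.
trait SortedItemSketch {
    type SortKey: Ord;
    fn sort_key(&self) -> Self::SortKey;
}

fn main() {}
```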

Frame time results (in ms/frame) are as follows:

| Benchmark               | `binning` | `main`  | Speedup |
| ----------------------- | --------- | ------- | ------- |
| `many_cubes -nfc -vpi`  | 232.179   | 312.123 | 34.43%  |
| `many_cubes -nfc`       | 25.874    | 30.117  | 16.40%  |
| `many_foxes`            | 3.276     | 3.515   | 7.30%   |

(`-nfc` is short for `--no-frustum-culling`; `-vpi` is short for
`--vary-per-instance`.)

---

## Changelog

### Changed

* Render phases have been split into binned and sorted phases. Binned
phases, such as the common opaque phase, achieve improved CPU
performance by avoiding the sorting step.

## Migration Guide

- `PhaseItem` has been split into `BinnedPhaseItem` and
`SortedPhaseItem`. If your code has custom `PhaseItem`s, you will need
to migrate them to one of these two types. `SortedPhaseItem` requires
the fewest code changes, but you may want to pick `BinnedPhaseItem` if
your phase doesn't require sorting, as that enables higher performance (see
the sketch below).
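
As a hypothetical example of the binned route (written against a simplified stand-in trait, not the exact `BinnedPhaseItem` signature; check `bevy_render::render_phase` for the real requirements), the fields a custom opaque item used to sort by become its bin key:

```rust
#![allow(dead_code)]

use std::hash::Hash;

// Simplified stand-in for the binned flavor.
trait BinnedItemSketch {
    type BinKey: Clone + Eq + Ord + Hash;
}

// Hypothetical custom opaque item: (pipeline, mesh) used to be its sort
// key; as a bin key, items sharing it are batched with no sorting at all.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
struct MyOpaqueBinKey {
    pipeline_id: u32,
    mesh_id: u32,
}

struct MyOpaqueItem;

impl BinnedItemSketch for MyOpaqueItem {
    type BinKey = MyOpaqueBinKey;
}

fn main() {}
```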

## Tracy graphs

`many_cubes --no-frustum-culling`, `main` branch:
<img width="1064" alt="Screenshot 2024-03-12 180037"
src="https://github.com/bevyengine/bevy/assets/157897/e1180ce8-8e89-46d2-85e3-f59f72109a55">

`many_cubes --no-frustum-culling`, this branch:
<img width="1064" alt="Screenshot 2024-03-12 180011"
src="https://github.com/bevyengine/bevy/assets/157897/0899f036-6075-44c5-a972-44d95895f46c">

You can see that `batch_and_prepare_binned_render_phase` is a much
smaller fraction of the time. Zooming in on that function, with yellow
being this branch and red being `main`, we see:

<img width="1064" alt="Screenshot 2024-03-12 175832"
src="https://github.com/bevyengine/bevy/assets/157897/0dfc8d3f-49f4-496e-8825-a66e64d356d0">

The binning happens in `queue_material_meshes`. Again with yellow being
this branch and red being `main`:
<img width="1064" alt="Screenshot 2024-03-12 175755"
src="https://github.com/bevyengine/bevy/assets/157897/b9b20dc1-11c8-400c-a6cc-1c2e09c1bb96">

We can see that there is a small regression in `queue_material_meshes`
performance, but it's not nearly enough to outweigh the large gains in
`batch_and_prepare_binned_render_phase`.

---------

Co-authored-by: James Liu <contact@jamessliu.com>
2024-03-30 02:55:02 +00:00


// structs containing wgpu types take a long time to compile. this is particularly bad for generic
// structs containing wgpu structs. we avoid that in debug builds (and for cargo check and rust analyzer)
// by type-erasing with the `render_resource_wrapper` macro. The resulting type behaves like Arc<$wgpu_type>,
// but avoids explicitly storing an Arc<$wgpu_type> member.
// analysis from https://github.com/bevyengine/bevy/pull/5950#issuecomment-1243473071 indicates this is
// due to `evaluate_obligations`. we should check if this can be removed after a fix lands for
// https://github.com/rust-lang/rust/issues/99188 (and after other `evaluate_obligations`-related changes).
#[cfg(debug_assertions)]
#[macro_export]
macro_rules! render_resource_wrapper {
    ($wrapper_type:ident, $wgpu_type:ty) => {
        #[cfg(not(all(target_arch = "wasm32", target_feature = "atomics")))]
        #[derive(Debug)]
        // SAFETY: while self is live, self.0 comes from `into_raw` of an Arc<$wgpu_type> with a strong ref.
        pub struct $wrapper_type(*const ());

        #[cfg(all(target_arch = "wasm32", target_feature = "atomics"))]
        #[derive(Debug)]
        pub struct $wrapper_type(send_wrapper::SendWrapper<*const ()>);

        impl $wrapper_type {
            pub fn new(value: $wgpu_type) -> Self {
                let arc = std::sync::Arc::new(value);
                let value_ptr = std::sync::Arc::into_raw(arc);
                let unit_ptr = value_ptr.cast::<()>();

                #[cfg(not(all(target_arch = "wasm32", target_feature = "atomics")))]
                return Self(unit_ptr);
                #[cfg(all(target_arch = "wasm32", target_feature = "atomics"))]
                return Self(send_wrapper::SendWrapper::new(unit_ptr));
            }

            pub fn try_unwrap(self) -> Option<$wgpu_type> {
                let value_ptr = self.0.cast::<$wgpu_type>();

                // SAFETY: pointer refers to a valid Arc, and was created from Arc::into_raw.
                let arc = unsafe { std::sync::Arc::from_raw(value_ptr) };

                // we forget ourselves here since the reconstructed arc will be dropped/decremented within this scope
                std::mem::forget(self);

                std::sync::Arc::try_unwrap(arc).ok()
            }
        }

        impl std::ops::Deref for $wrapper_type {
            type Target = $wgpu_type;

            fn deref(&self) -> &Self::Target {
                let value_ptr = self.0.cast::<$wgpu_type>();

                // SAFETY: the arc lives for 'self, so the ref lives for 'self
                let value_ref = unsafe { value_ptr.as_ref() };
                value_ref.unwrap()
            }
        }

        impl Drop for $wrapper_type {
            fn drop(&mut self) {
                let value_ptr = self.0.cast::<$wgpu_type>();

                // SAFETY: pointer refers to a valid Arc, and was created from Arc::into_raw.
                // this reconstructed arc is dropped/decremented within this scope.
                unsafe { std::sync::Arc::from_raw(value_ptr) };
            }
        }

        #[cfg(not(all(target_arch = "wasm32", target_feature = "atomics")))]
        // SAFETY: We manually implement Send and Sync, which is valid for Arc<T> when T: Send + Sync.
        // We ensure correctness by checking that $wgpu_type does implement Send and Sync.
        // If in future there is a case where a wrapper is required for a non-send/sync type
        // we can implement a macro variant that omits these manual Send + Sync impls
        unsafe impl Send for $wrapper_type {}

        #[cfg(not(all(target_arch = "wasm32", target_feature = "atomics")))]
        // SAFETY: As explained above, we ensure correctness by checking that $wgpu_type implements Send and Sync.
        unsafe impl Sync for $wrapper_type {}

        #[cfg(not(all(target_arch = "wasm32", target_feature = "atomics")))]
        const _: () = {
            trait AssertSendSyncBound: Send + Sync {}
            impl AssertSendSyncBound for $wgpu_type {}
        };

        impl Clone for $wrapper_type {
            fn clone(&self) -> Self {
                let value_ptr = self.0.cast::<$wgpu_type>();

                // SAFETY: pointer refers to a valid Arc, and was created from Arc::into_raw.
                let arc = unsafe { std::sync::Arc::from_raw(value_ptr) };
                let cloned = std::sync::Arc::clone(&arc);

                // we forget the reconstructed Arc to avoid decrementing the ref counter, as self is still live.
                std::mem::forget(arc);

                let cloned_value_ptr = std::sync::Arc::into_raw(cloned);
                let cloned_unit_ptr = cloned_value_ptr.cast::<()>();

                #[cfg(not(all(target_arch = "wasm32", target_feature = "atomics")))]
                return Self(cloned_unit_ptr);

                // Note: this implementation means that this Clone will panic
                // when called off the wgpu thread.
                #[cfg(all(target_arch = "wasm32", target_feature = "atomics"))]
                return Self(send_wrapper::SendWrapper::new(cloned_unit_ptr));
            }
        }
    };
}
#[cfg(not(debug_assertions))]
#[macro_export]
macro_rules! render_resource_wrapper {
    ($wrapper_type:ident, $wgpu_type:ty) => {
        #[cfg(not(all(target_arch = "wasm32", target_feature = "atomics")))]
        #[derive(Clone, Debug)]
        pub struct $wrapper_type(std::sync::Arc<$wgpu_type>);

        #[cfg(all(target_arch = "wasm32", target_feature = "atomics"))]
        #[derive(Clone, Debug)]
        pub struct $wrapper_type(std::sync::Arc<send_wrapper::SendWrapper<$wgpu_type>>);

        impl $wrapper_type {
            pub fn new(value: $wgpu_type) -> Self {
                #[cfg(not(all(target_arch = "wasm32", target_feature = "atomics")))]
                return Self(std::sync::Arc::new(value));

                #[cfg(all(target_arch = "wasm32", target_feature = "atomics"))]
                return Self(std::sync::Arc::new(send_wrapper::SendWrapper::new(value)));
            }

            pub fn try_unwrap(self) -> Option<$wgpu_type> {
                #[cfg(not(all(target_arch = "wasm32", target_feature = "atomics")))]
                return std::sync::Arc::try_unwrap(self.0).ok();

                #[cfg(all(target_arch = "wasm32", target_feature = "atomics"))]
                return std::sync::Arc::try_unwrap(self.0).ok().map(|p| p.take());
            }
        }

        impl std::ops::Deref for $wrapper_type {
            type Target = $wgpu_type;

            fn deref(&self) -> &Self::Target {
                self.0.as_ref()
            }
        }

        #[cfg(not(all(target_arch = "wasm32", target_feature = "atomics")))]
        const _: () = {
            trait AssertSendSyncBound: Send + Sync {}
            impl AssertSendSyncBound for $wgpu_type {}
        };
    };
}
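
// Illustrative addition (not part of the original file): a small test with a
// stand-in resource type, showing that the erased wrapper behaves like
// `Arc<T>` for deref, clone, and unwrap under either macro variant above.
#[cfg(test)]
mod render_resource_wrapper_tests {
    // Any `Send + Sync` type works here; real call sites wrap wgpu types.
    #[derive(Debug, PartialEq)]
    struct FakeResource(u32);

    render_resource_wrapper!(ErasedFakeResource, FakeResource);

    #[test]
    fn behaves_like_an_arc() {
        let wrapped = ErasedFakeResource::new(FakeResource(7));
        // Deref sees through the type erasure.
        assert_eq!(*wrapped, FakeResource(7));

        let second = wrapped.clone();
        // Two strong references exist, so unwrapping one of them fails...
        assert!(second.try_unwrap().is_none());
        // ...and once only one reference remains, unwrapping succeeds.
        assert_eq!(wrapped.try_unwrap(), Some(FakeResource(7)));
    }
}
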
#[macro_export]
macro_rules! define_atomic_id {
    ($atomic_id_type:ident) => {
        #[derive(Copy, Clone, Hash, Eq, PartialEq, PartialOrd, Ord, Debug)]
        pub struct $atomic_id_type(core::num::NonZeroU32);

        // We use new instead of default to indicate that each ID created will be unique.
        #[allow(clippy::new_without_default)]
        impl $atomic_id_type {
            pub fn new() -> Self {
                use std::sync::atomic::{AtomicU32, Ordering};

                static COUNTER: AtomicU32 = AtomicU32::new(1);

                let counter = COUNTER.fetch_add(1, Ordering::Relaxed);
                Self(core::num::NonZeroU32::new(counter).unwrap_or_else(|| {
                    panic!(
                        "The system ran out of unique `{}`s.",
                        stringify!($atomic_id_type)
                    );
                }))
            }
        }

        impl From<$atomic_id_type> for core::num::NonZeroU32 {
            fn from(value: $atomic_id_type) -> Self {
                value.0
            }
        }

        impl From<core::num::NonZeroU32> for $atomic_id_type {
            fn from(value: core::num::NonZeroU32) -> Self {
                Self(value)
            }
        }
    };
}
pub use render_resource_wrapper;
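
// Illustrative addition (not part of the original file): `define_atomic_id` in
// action. Each call to `new` returns a distinct, non-zero id, and the
// generated `From` impls round-trip through `NonZeroU32`.
#[cfg(test)]
mod define_atomic_id_tests {
    define_atomic_id!(ExampleId);

    #[test]
    fn ids_are_unique_and_round_trip() {
        let a = ExampleId::new();
        let b = ExampleId::new();
        assert_ne!(a, b);

        let raw: core::num::NonZeroU32 = a.into();
        assert_eq!(ExampleId::from(raw), a);
    }
}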