From 8130b229be5d0d367388d4e1022dccafd1aa5c7b Mon Sep 17 00:00:00 2001 From: Aevyrie Date: Sat, 29 Mar 2025 19:43:39 -0700 Subject: [PATCH] Transform Propagation Optimization: Static Subtree Marking (#18589) # Objective - Optimize static scene performance by marking unchanged subtrees. - [bef0209](https://github.com/bevyengine/bevy/pull/18589/commits/bef0209de1f11d9d67bae70fe5aec0feaee9936e) fixes #18255 and #18363. - Closes #18365 - Includes change from #18321 ## Solution - Mark hierarchy subtrees with dirty bits to avoid transform propagation where not needed - This causes a performance regression when spawning many entities, or when the scene is entirely dynamic. - This results in massive speedups for largely static scenes. - In the future we could allow the user to change this behavior, or add some threshold based on how dynamic the scene is? ## Testing - Caldera Hotel scene --- .../src/components/transform.rs | 23 ++- crates/bevy_transform/src/plugins.rs | 60 +++--- crates/bevy_transform/src/systems.rs | 184 +++++++++--------- crates/bevy_ui/src/layout/mod.rs | 5 +- 4 files changed, 139 insertions(+), 133 deletions(-) diff --git a/crates/bevy_transform/src/components/transform.rs b/crates/bevy_transform/src/components/transform.rs index 197c9d1ac0..0ea9e1930f 100644 --- a/crates/bevy_transform/src/components/transform.rs +++ b/crates/bevy_transform/src/components/transform.rs @@ -63,7 +63,11 @@ fn assert_is_normalized(message: &str, length_squared: f32) { /// [transform_example]: https://github.com/bevyengine/bevy/blob/latest/examples/transforms/transform.rs #[derive(Debug, PartialEq, Clone, Copy)] #[cfg_attr(feature = "serialize", derive(serde::Serialize, serde::Deserialize))] -#[cfg_attr(feature = "bevy-support", derive(Component), require(GlobalTransform))] +#[cfg_attr( + feature = "bevy-support", + derive(Component), + require(GlobalTransform, TransformTreeChanged) +)] #[cfg_attr( feature = "bevy_reflect", derive(Reflect), @@ -644,3 +648,20 @@ impl Mul for Transform { self.transform_point(value) } } + +/// An optimization for transform propagation. This ZST marker component uses change detection to +/// mark all entities of the hierarchy as "dirty" if any of their descendants have a changed +/// `Transform`. If this component is *not* marked `is_changed()`, propagation will halt. +#[derive(Clone, Copy, Default, PartialEq, Debug)] +#[cfg_attr(feature = "serialize", derive(serde::Serialize, serde::Deserialize))] +#[cfg_attr(feature = "bevy-support", derive(Component))] +#[cfg_attr( + feature = "bevy_reflect", + derive(Reflect), + reflect(Component, Default, PartialEq, Debug) +)] +#[cfg_attr( + all(feature = "bevy_reflect", feature = "serialize"), + reflect(Serialize, Deserialize) +)] +pub struct TransformTreeChanged; diff --git a/crates/bevy_transform/src/plugins.rs b/crates/bevy_transform/src/plugins.rs index bee7211bdc..3b22aefb14 100644 --- a/crates/bevy_transform/src/plugins.rs +++ b/crates/bevy_transform/src/plugins.rs @@ -1,6 +1,4 @@ -use crate::systems::{ - compute_transform_leaves, propagate_parent_transforms, sync_simple_transforms, -}; +use crate::systems::{mark_dirty_trees, propagate_parent_transforms, sync_simple_transforms}; use bevy_app::{App, Plugin, PostStartup, PostUpdate}; use bevy_ecs::schedule::{IntoScheduleConfigs, SystemSet}; @@ -17,43 +15,33 @@ pub struct TransformPlugin; impl Plugin for TransformPlugin { fn build(&self, app: &mut App) { - // A set for `propagate_transforms` to mark it as ambiguous with `sync_simple_transforms`. - // Used instead of the `SystemTypeSet` as that would not allow multiple instances of the system. - #[derive(Debug, Hash, PartialEq, Eq, Clone, SystemSet)] - struct PropagateTransformsSet; - #[cfg(feature = "bevy_reflect")] app.register_type::() + .register_type::() .register_type::(); - app.configure_sets( - PostStartup, - PropagateTransformsSet.in_set(TransformSystem::TransformPropagate), - ) - // add transform systems to startup so the first update is "correct" - .add_systems( - PostStartup, - ( - propagate_parent_transforms, - (compute_transform_leaves, sync_simple_transforms) - .ambiguous_with(TransformSystem::TransformPropagate), + app + // add transform systems to startup so the first update is "correct" + .add_systems( + PostStartup, + ( + mark_dirty_trees, + propagate_parent_transforms, + sync_simple_transforms, + ) + .chain() + .in_set(TransformSystem::TransformPropagate), ) - .chain() - .in_set(PropagateTransformsSet), - ) - .configure_sets( - PostUpdate, - PropagateTransformsSet.in_set(TransformSystem::TransformPropagate), - ) - .add_systems( - PostUpdate, - ( - propagate_parent_transforms, - (compute_transform_leaves, sync_simple_transforms) // TODO: Adjust the internal parallel queries to make these parallel systems more efficiently share and fill CPU time. - .ambiguous_with(TransformSystem::TransformPropagate), - ) - .chain() - .in_set(PropagateTransformsSet), - ); + .add_systems( + PostUpdate, + ( + mark_dirty_trees, + propagate_parent_transforms, + // TODO: Adjust the internal parallel queries to make this system more efficiently share and fill CPU time. + sync_simple_transforms, + ) + .chain() + .in_set(TransformSystem::TransformPropagate), + ); } } diff --git a/crates/bevy_transform/src/systems.rs b/crates/bevy_transform/src/systems.rs index 568c8a6d6e..302dc311f2 100644 --- a/crates/bevy_transform/src/systems.rs +++ b/crates/bevy_transform/src/systems.rs @@ -1,6 +1,5 @@ -use crate::components::{GlobalTransform, Transform}; +use crate::components::{GlobalTransform, Transform, TransformTreeChanged}; use bevy_ecs::prelude::*; - #[cfg(feature = "std")] pub use parallel::propagate_parent_transforms; #[cfg(not(feature = "std"))] @@ -9,7 +8,7 @@ pub use serial::propagate_parent_transforms; /// Update [`GlobalTransform`] component of entities that aren't in the hierarchy /// /// Third party plugins should ensure that this is used in concert with -/// [`propagate_parent_transforms`] and [`compute_transform_leaves`]. +/// [`propagate_parent_transforms`] and [`mark_dirty_trees`]. pub fn sync_simple_transforms( mut query: ParamSet<( Query< @@ -41,28 +40,33 @@ pub fn sync_simple_transforms( } } -/// Compute leaf [`GlobalTransform`]s in parallel. -/// -/// This is run after [`propagate_parent_transforms`], to ensure the parents' [`GlobalTransform`]s -/// have been computed. This makes computing leaf nodes at different levels of the hierarchy much -/// more cache friendly, because data can be iterated over densely from the same archetype. -pub fn compute_transform_leaves( - parents: Query, With>, - mut leaves: Query<(Ref, &mut GlobalTransform, &ChildOf), Without>, +/// Optimization for static scenes. Propagates a "dirty bit" up the hierarchy towards ancestors. +/// Transform propagation can ignore entire subtrees of the hierarchy if it encounters an entity +/// without the dirty bit. +pub fn mark_dirty_trees( + changed_transforms: Query< + Entity, + Or<(Changed, Changed, Added)>, + >, + mut orphaned: RemovedComponents, + mut transforms: Query<(Option<&ChildOf>, &mut TransformTreeChanged)>, ) { - leaves - .par_iter_mut() - .for_each(|(transform, mut global_transform, child_of)| { - let Ok(parent_transform) = parents.get(child_of.parent) else { - return; - }; - if parent_transform.is_changed() - || transform.is_changed() - || global_transform.is_added() - { - *global_transform = parent_transform.mul_transform(*transform); + for entity in changed_transforms.iter().chain(orphaned.read()) { + let mut next = entity; + while let Ok((parent, mut tree)) = transforms.get_mut(next) { + if tree.is_changed() && !tree.is_added() { + // If the component was changed, this part of the tree has already been processed. + // Ignore this if the change was caused by the component being added. + break; } - }); + tree.set_changed(); + if let Some(parent) = parent.map(|p| p.parent) { + next = parent; + } else { + break; + }; + } + } } // TODO: This serial implementation isn't actually serial, it parallelizes across the roots. @@ -91,7 +95,7 @@ mod serial { /// /// Third party plugins should ensure that this is used in concert with /// [`sync_simple_transforms`](super::sync_simple_transforms) and - /// [`compute_transform_leaves`](super::compute_transform_leaves). + /// [`mark_dirty_trees`](super::mark_dirty_trees). pub fn propagate_parent_transforms( mut root_query: Query< (Entity, &Children, Ref, &mut GlobalTransform), @@ -100,7 +104,7 @@ mod serial { mut orphaned: RemovedComponents, transform_query: Query< (Ref, &mut GlobalTransform, Option<&Children>), - (With, With), + With, >, child_query: Query<(Entity, Ref), With>, mut orphaned_entities: Local>, @@ -168,7 +172,7 @@ mod serial { parent: &GlobalTransform, transform_query: &Query< (Ref, &mut GlobalTransform, Option<&Children>), - (With, With), + With, >, child_query: &Query<(Entity, Ref), With>, entity: Entity, @@ -245,12 +249,12 @@ mod serial { #[cfg(feature = "std")] mod parallel { use crate::prelude::*; + // TODO: this implementation could be used in no_std if there are equivalents of these. use alloc::{sync::Arc, vec::Vec}; use bevy_ecs::{entity::UniqueEntityIter, prelude::*, system::lifetimeless::Read}; use bevy_tasks::{ComputeTaskPool, TaskPool}; use bevy_utils::Parallel; use core::sync::atomic::{AtomicI32, Ordering}; - // TODO: this implementation could be used in no_std if there are equivalents of these. use std::sync::{ mpsc::{Receiver, Sender}, Mutex, @@ -261,32 +265,20 @@ mod parallel { /// /// Third party plugins should ensure that this is used in concert with /// [`sync_simple_transforms`](super::sync_simple_transforms) and - /// [`compute_transform_leaves`](super::compute_transform_leaves). + /// [`mark_dirty_trees`](super::mark_dirty_trees). pub fn propagate_parent_transforms( mut queue: Local, - mut orphaned: RemovedComponents, - mut orphans: Local>, mut roots: Query< (Entity, Ref, &mut GlobalTransform, &Children), - Without, + (Without, Changed), >, nodes: NodeQuery, ) { - // Orphans - orphans.clear(); - orphans.extend(orphaned.read()); - orphans.sort_unstable(); - // Process roots in parallel, seeding the work queue roots.par_iter_mut().for_each_init( || queue.local_queue.borrow_local_mut(), |outbox, (parent, transform, mut parent_transform, children)| { - if transform.is_changed() - || parent_transform.is_added() - || orphans.binary_search(&parent).is_ok() - { - *parent_transform = GlobalTransform::from(*transform); - } + *parent_transform = GlobalTransform::from(*transform); // SAFETY: the parent entities passed into this function are taken from iterating // over the root entity query. Queries iterate over disjoint entities, preventing @@ -315,6 +307,18 @@ mod parallel { // number of channel sends by avoiding sending partial batches. queue.send_batches(); + if let Ok(rx) = queue.receiver.try_lock() { + if let Some(task) = rx.try_iter().next() { + // This is a bit silly, but the only way to see if there is any work is to grab a + // task. Peeking will remove the task even if you don't call `next`, resulting in + // dropping a task. What we do here is grab the first task if there is one, then + // immediately send it to the back of the queue. + queue.sender.send(task).ok(); + } else { + return; // No work, don't bother spawning any tasks + } + } + // Spawn workers on the task pool to recursively propagate the hierarchy in parallel. let task_pool = ComputeTaskPool::get_or_init(TaskPool::default); task_pool.scope(|s| { @@ -373,12 +377,12 @@ mod parallel { // the hierarchy, guaranteeing unique access. #[expect(unsafe_code, reason = "Mutating disjoint entities in parallel")] unsafe { - let (_, (_, p_global_transform), (p_children, _)) = + let (_, (_, p_global_transform, _), (p_children, _)) = nodes.get_unchecked(parent).unwrap(); propagate_descendants_unchecked( parent, p_global_transform, - p_children, + p_children.unwrap(), // All entities in the queue should have children nodes, &mut outbox, queue, @@ -395,12 +399,8 @@ mod parallel { } } - /// Propagate transforms from `parent` to its non-leaf `children`, pushing updated child - /// entities to the `outbox`. Propagation does not visit leaf nodes; instead, they are computed - /// in [`compute_transform_leaves`](super::compute_transform_leaves), which can optimize much - /// more efficiently. - /// - /// This function will continue propagating transforms to descendants in a depth-first + /// Propagate transforms from `parent` to its `children`, pushing updated child entities to the + /// `outbox`. This function will continue propagating transforms to descendants in a depth-first /// traversal, while simultaneously pushing unvisited branches to the outbox, for other threads /// to take when idle. /// @@ -440,38 +440,29 @@ mod parallel { // visiting disjoint entities in parallel, which is safe. #[expect(unsafe_code, reason = "Mutating disjoint entities in parallel")] let children_iter = unsafe { - // Performance note: iter_many tests every child to see if it meets the query. For - // leaf nodes, this unfortunately means we have the pay the price of checking every - // child, even if it is a leaf node and is skipped. - // - // To ensure this is still the fastest design, I tried removing the second pass - // (`compute_transform_leaves`) and instead simply doing that here. However, that - // proved to be much slower than two pass for a few reasons: - // - it's less cache friendly and is outright slower than the tight loop in the - // second pass - // - it prevents parallelism, as all children must be iterated in series - // - // The only way I can see to make this faster when there are many leaf nodes is to - // speed up archetype checking to make the iterator skip leaf entities more quickly, - // or encoding the hierarchy level as a component. That, or use some kind of change - // detection to mark dirty subtrees when the transform is mutated. nodes.iter_many_unique_unsafe(UniqueEntityIter::from_iterator_unchecked( p_children.iter(), )) }; let mut last_child = None; - let new_children = children_iter.map( - |(child, (transform, mut global_transform), (children, child_of))| { - assert_eq!(child_of.parent, parent); - if p_global_transform.is_changed() - || transform.is_changed() - || global_transform.is_added() - { - *global_transform = p_global_transform.mul_transform(*transform); + let new_children = children_iter.filter_map( + |(child, (transform, mut global_transform, tree), (children, child_of))| { + if !tree.is_changed() && !p_global_transform.is_changed() { + // Static scene optimization + return None; } - last_child = Some((child, global_transform, children)); - child + assert_eq!(child_of.parent, parent); + + // Transform prop is expensive - this helps avoid updating entire subtrees if + // the GlobalTransform is unchanged, at the cost of an added equality check. + global_transform.set_if_neq(p_global_transform.mul_transform(*transform)); + + children.map(|children| { + // Only continue propagation if the entity has children. + last_child = Some((child, global_transform, children)); + child + }) }, ); outbox.extend(new_children); @@ -497,14 +488,18 @@ mod parallel { } /// Alias for a large, repeatedly used query. Queries for transform entities that have both a - /// parent and children, thus they are neither roots nor leaves. + /// parent and possibly children, thus they are not roots. type NodeQuery<'w, 's> = Query< 'w, 's, ( Entity, - (Ref<'static, Transform>, Mut<'static, GlobalTransform>), - (Read, Read), + ( + Ref<'static, Transform>, + Mut<'static, GlobalTransform>, + Ref<'static, TransformTreeChanged>, + ), + (Option>, Read), ), >; @@ -579,9 +574,9 @@ mod test { let mut schedule = Schedule::default(); schedule.add_systems( ( + mark_dirty_trees, sync_simple_transforms, propagate_parent_transforms, - compute_transform_leaves, ) .chain(), ); @@ -637,9 +632,9 @@ mod test { let mut schedule = Schedule::default(); schedule.add_systems( ( + mark_dirty_trees, sync_simple_transforms, propagate_parent_transforms, - compute_transform_leaves, ) .chain(), ); @@ -674,9 +669,9 @@ mod test { let mut schedule = Schedule::default(); schedule.add_systems( ( + mark_dirty_trees, sync_simple_transforms, propagate_parent_transforms, - compute_transform_leaves, ) .chain(), ); @@ -713,9 +708,9 @@ mod test { let mut schedule = Schedule::default(); schedule.add_systems( ( + mark_dirty_trees, sync_simple_transforms, propagate_parent_transforms, - compute_transform_leaves, ) .chain(), ); @@ -793,9 +788,9 @@ mod test { app.add_systems( Update, ( + mark_dirty_trees, sync_simple_transforms, propagate_parent_transforms, - compute_transform_leaves, ) .chain(), ); @@ -847,12 +842,10 @@ mod test { app.add_systems( Update, - ( - propagate_parent_transforms, - sync_simple_transforms, - compute_transform_leaves, - ) - .chain(), + // It is unsound for this unsafe system to encounter a cycle without panicking. This + // requirement only applies to systems with unsafe parallel traversal that result in + // aliased mutability during a cycle. + propagate_parent_transforms, ); fn setup_world(world: &mut World) -> (Entity, Entity) { @@ -912,11 +905,14 @@ mod test { // Create transform propagation schedule let mut schedule = Schedule::default(); - schedule.add_systems(( - sync_simple_transforms, - propagate_parent_transforms, - compute_transform_leaves, - )); + schedule.add_systems( + ( + mark_dirty_trees, + propagate_parent_transforms, + sync_simple_transforms, + ) + .chain(), + ); // Spawn a `Transform` entity with a local translation of `Vec3::ONE` let mut spawn_transform_bundle = diff --git a/crates/bevy_ui/src/layout/mod.rs b/crates/bevy_ui/src/layout/mod.rs index 1b74b90f33..1aba1e6027 100644 --- a/crates/bevy_ui/src/layout/mod.rs +++ b/crates/bevy_ui/src/layout/mod.rs @@ -353,9 +353,10 @@ mod tests { use bevy_math::{Rect, UVec2, Vec2}; use bevy_platform_support::collections::HashMap; use bevy_render::{camera::ManualTextureViews, prelude::Camera}; + use bevy_transform::systems::mark_dirty_trees; use bevy_transform::{ prelude::GlobalTransform, - systems::{compute_transform_leaves, propagate_parent_transforms, sync_simple_transforms}, + systems::{propagate_parent_transforms, sync_simple_transforms}, }; use bevy_utils::prelude::default; use bevy_window::{ @@ -408,9 +409,9 @@ mod tests { update_ui_context_system, ApplyDeferred, ui_layout_system, + mark_dirty_trees, sync_simple_transforms, propagate_parent_transforms, - compute_transform_leaves, ) .chain(), );