Multithreaded render command encoding (#9172)

# Objective - Encoding many GPU commands (for example, in a render pass with many draws, such as the main opaque pass) onto a `wgpu::CommandEncoder` is very expensive and takes a long time. - To improve performance, we want to perform the command encoding for these heavy passes in parallel. ## Solution - `RenderContext` can now queue up "command buffer generation tasks" which are closures that will generate a command buffer when called. - When finalizing the render context to produce the final list of command buffers, these tasks are run in parallel on the `ComputeTaskPool` to produce their corresponding command buffers. - The general idea is that the node graph will run in serial, but in a node, instead of doing rendering work, you can add tasks to do render work in parallel with other nodes' tasks, which are run at the end of the graph execution. ## Nodes Parallelized - `MainOpaquePass3dNode` - `PrepassNode` - `DeferredGBufferPrepassNode` - `ShadowPassNode` (One task per view) ## Future Work - For a large number of draw calls, it might be worth further subdividing passes into 2+ tasks. - Extend this to UI, 2d, transparent, and transmissive nodes? - Needs testing - small command buffers are inefficient - it may be worth reverting to the serial command encoder usage for render phases with few items. - All "serial" (traditional) rendering work must finish before parallel rendering tasks (the new stuff) can start to run. - There is still only one submission to the graphics queue at the end of the graph execution. There is still no ability to submit work earlier. ## Performance Improvement Thanks to @Elabajaba for testing on Bistro. ![image](https://github.com/bevyengine/bevy/assets/47158642/be50dafa-85eb-4da5-a5cd-c0a044f1e76f) TLDR: Without shadow mapping, this PR has no impact. _With_ shadow mapping, this PR gives **~40 more fps** than main. 
--- ## Changelog - `MainOpaquePass3dNode`, `PrepassNode`, `DeferredGBufferPrepassNode`, and each shadow map within `ShadowPassNode` are now encoded in parallel, giving _greatly_ increased CPU performance, mainly when shadow mapping is enabled. - Does not work on WASM or AMD+Windows+Vulkan. - Added `RenderContext::add_command_buffer_generation_task()`. - `RenderContext::new()` now takes adapter info. - Some render graph and `Node`-related types and methods now have additional lifetime constraints. ## Migration Guide - `RenderContext::new()` now takes adapter info. - Some render graph and `Node`-related types and methods now have additional lifetime constraints. --------- Co-authored-by: Elabajaba <Elabajaba@users.noreply.github.com> Co-authored-by: François <mockersf@gmail.com>
2024-02-08 23:35:35 -08:00 · 2024-02-08 23:35:35 -08:00 · f4dab8a4e8
commit f4dab8a4e8
parent 5313730534
7 changed files with 262 additions and 139 deletions
--- a/crates/bevy_core_pipeline/src/core_3d/main_opaque_pass_3d_node.rs
+++ b/crates/bevy_core_pipeline/src/core_3d/main_opaque_pass_3d_node.rs
@ -6,8 +6,8 @@ use bevy_ecs::{prelude::World, query::QueryItem};
 use bevy_render::{
    camera::ExtractedCamera,
    render_graph::{NodeRunError, RenderGraphContext, ViewNode},
-    render_phase::RenderPhase,
-    render_resource::{PipelineCache, RenderPassDescriptor, StoreOp},
+    render_phase::{RenderPhase, TrackedRenderPass},
+    render_resource::{CommandEncoderDescriptor, PipelineCache, RenderPassDescriptor, StoreOp},
    renderer::RenderContext,
    view::{ViewDepthTexture, ViewTarget, ViewUniformOffset},
 };
@ -31,10 +31,10 @@ impl ViewNode for MainOpaquePass3dNode {
        &'static ViewUniformOffset,
    );

-    fn run(
+    fn run<'w>(
        &self,
        graph: &mut RenderGraphContext,
-        render_context: &mut RenderContext,
+        render_context: &mut RenderContext<'w>,
        (
            camera,
            opaque_phase,
@ -44,52 +44,69 @@ impl ViewNode for MainOpaquePass3dNode {
            skybox_pipeline,
            skybox_bind_group,
            view_uniform_offset,
-        ): QueryItem<Self::ViewQuery>,
-        world: &World,
+        ): QueryItem<'w, Self::ViewQuery>,
+        world: &'w World,
    ) -> Result<(), NodeRunError> {
-        // Run the opaque pass, sorted by pipeline key and mesh id to greatly improve batching.
-        // NOTE: Scoped to drop the mutable borrow of render_context
-        #[cfg(feature = "trace")]
-        let _main_opaque_pass_3d_span = info_span!("main_opaque_pass_3d").entered();
-
-        // Setup render pass
-        let mut render_pass = render_context.begin_tracked_render_pass(RenderPassDescriptor {
-            label: Some("main_opaque_pass_3d"),
-            color_attachments: &[Some(target.get_color_attachment())],
-            depth_stencil_attachment: Some(depth.get_attachment(StoreOp::Store)),
-            timestamp_writes: None,
-            occlusion_query_set: None,
-        });
-
-        if let Some(viewport) = camera.viewport.as_ref() {
-            render_pass.set_camera_viewport(viewport);
-        }
+        let color_attachments = [Some(target.get_color_attachment())];
+        let depth_stencil_attachment = Some(depth.get_attachment(StoreOp::Store));

        let view_entity = graph.view_entity();
+        render_context.add_command_buffer_generation_task(move |render_device| {
+            #[cfg(feature = "trace")]
+            let _main_opaque_pass_3d_span = info_span!("main_opaque_pass_3d").entered();

-        // Opaque draws
-        opaque_phase.render(&mut render_pass, world, view_entity);
+            // Command encoder setup
+            let mut command_encoder =
+                render_device.create_command_encoder(&CommandEncoderDescriptor {
+                    label: Some("main_opaque_pass_3d_command_encoder"),
+                });

-        // Alpha draws
-        if !alpha_mask_phase.items.is_empty() {
-            alpha_mask_phase.render(&mut render_pass, world, view_entity);
-        }
-
-        // Draw the skybox using a fullscreen triangle
-        if let (Some(skybox_pipeline), Some(SkyboxBindGroup(skybox_bind_group))) =
-            (skybox_pipeline, skybox_bind_group)
-        {
-            let pipeline_cache = world.resource::<PipelineCache>();
-            if let Some(pipeline) = pipeline_cache.get_render_pipeline(skybox_pipeline.0) {
-                render_pass.set_render_pipeline(pipeline);
-                render_pass.set_bind_group(
-                    0,
-                    &skybox_bind_group.0,
-                    &[view_uniform_offset.offset, skybox_bind_group.1],
-                );
-                render_pass.draw(0..3, 0..1);
+            // Render pass setup
+            let render_pass = command_encoder.begin_render_pass(&RenderPassDescriptor {
+                label: Some("main_opaque_pass_3d"),
+                color_attachments: &color_attachments,
+                depth_stencil_attachment,
+                timestamp_writes: None,
+                occlusion_query_set: None,
+            });
+            let mut render_pass = TrackedRenderPass::new(&render_device, render_pass);
+            if let Some(viewport) = camera.viewport.as_ref() {
+                render_pass.set_camera_viewport(viewport);
            }
-        }
+
+            // Opaque draws
+            if !opaque_phase.items.is_empty() {
+                #[cfg(feature = "trace")]
+                let _opaque_main_pass_3d_span = info_span!("opaque_main_pass_3d").entered();
+                opaque_phase.render(&mut render_pass, world, view_entity);
+            }
+
+            // Alpha draws
+            if !alpha_mask_phase.items.is_empty() {
+                #[cfg(feature = "trace")]
+                let _alpha_mask_main_pass_3d_span = info_span!("alpha_mask_main_pass_3d").entered();
+                alpha_mask_phase.render(&mut render_pass, world, view_entity);
+            }
+
+            // Skybox draw using a fullscreen triangle
+            if let (Some(skybox_pipeline), Some(SkyboxBindGroup(skybox_bind_group))) =
+                (skybox_pipeline, skybox_bind_group)
+            {
+                let pipeline_cache = world.resource::<PipelineCache>();
+                if let Some(pipeline) = pipeline_cache.get_render_pipeline(skybox_pipeline.0) {
+                    render_pass.set_render_pipeline(pipeline);
+                    render_pass.set_bind_group(
+                        0,
+                        &skybox_bind_group.0,
+                        &[view_uniform_offset.offset, skybox_bind_group.1],
+                    );
+                    render_pass.draw(0..3, 0..1);
+                }
+            }
+
+            drop(render_pass);
+            command_encoder.finish()
+        });

        Ok(())
    }
--- a/crates/bevy_core_pipeline/src/deferred/node.rs
+++ b/crates/bevy_core_pipeline/src/deferred/node.rs
@ -2,7 +2,8 @@ use bevy_ecs::prelude::*;
 use bevy_ecs::query::QueryItem;
 use bevy_render::render_graph::ViewNode;

-use bevy_render::render_resource::StoreOp;
+use bevy_render::render_phase::TrackedRenderPass;
+use bevy_render::render_resource::{CommandEncoderDescriptor, StoreOp};
 use bevy_render::{
    camera::ExtractedCamera,
    render_graph::{NodeRunError, RenderGraphContext},
@ -33,21 +34,19 @@ impl ViewNode for DeferredGBufferPrepassNode {
        &'static ViewPrepassTextures,
    );

-    fn run(
+    fn run<'w>(
        &self,
        graph: &mut RenderGraphContext,
-        render_context: &mut RenderContext,
+        render_context: &mut RenderContext<'w>,
        (
            camera,
            opaque_deferred_phase,
            alpha_mask_deferred_phase,
            view_depth_texture,
            view_prepass_textures,
-        ): QueryItem<Self::ViewQuery>,
-        world: &World,
+        ): QueryItem<'w, Self::ViewQuery>,
+        world: &'w World,
    ) -> Result<(), NodeRunError> {
-        let view_entity = graph.view_entity();
-
        let mut color_attachments = vec![];
        color_attachments.push(
            view_prepass_textures
@ -107,49 +106,64 @@ impl ViewNode for DeferredGBufferPrepassNode {
                .map(|deferred_lighting_pass_id| deferred_lighting_pass_id.get_attachment()),
        );

+        // If all color attachments are none: clear the color attachment list so that no fragment shader is required
        if color_attachments.iter().all(Option::is_none) {
-            // All attachments are none: clear the attachment list so that no fragment shader is required.
            color_attachments.clear();
        }

-        {
-            // Set up the pass descriptor with the depth attachment and optional color attachments.
-            let mut render_pass = render_context.begin_tracked_render_pass(RenderPassDescriptor {
+        let depth_stencil_attachment = Some(view_depth_texture.get_attachment(StoreOp::Store));
+
+        let view_entity = graph.view_entity();
+        render_context.add_command_buffer_generation_task(move |render_device| {
+            #[cfg(feature = "trace")]
+            let _deferred_span = info_span!("deferred").entered();
+
+            // Command encoder setup
+            let mut command_encoder =
+                render_device.create_command_encoder(&CommandEncoderDescriptor {
+                    label: Some("deferred_command_encoder"),
+                });
+
+            // Render pass setup
+            let render_pass = command_encoder.begin_render_pass(&RenderPassDescriptor {
                label: Some("deferred"),
                color_attachments: &color_attachments,
-                depth_stencil_attachment: Some(view_depth_texture.get_attachment(StoreOp::Store)),
+                depth_stencil_attachment,
                timestamp_writes: None,
                occlusion_query_set: None,
            });
-
+            let mut render_pass = TrackedRenderPass::new(&render_device, render_pass);
            if let Some(viewport) = camera.viewport.as_ref() {
                render_pass.set_camera_viewport(viewport);
            }

-            // Always run deferred pass to ensure the deferred gbuffer and deferred_lighting_pass_id are cleared.
-            {
-                // Run the prepass, sorted front-to-back.
+            // Opaque draws
+            if !opaque_deferred_phase.items.is_empty() {
                #[cfg(feature = "trace")]
                let _opaque_prepass_span = info_span!("opaque_deferred").entered();
                opaque_deferred_phase.render(&mut render_pass, world, view_entity);
            }

+            // Alpha masked draws
            if !alpha_mask_deferred_phase.items.is_empty() {
-                // Run the deferred, sorted front-to-back.
                #[cfg(feature = "trace")]
                let _alpha_mask_deferred_span = info_span!("alpha_mask_deferred").entered();
                alpha_mask_deferred_phase.render(&mut render_pass, world, view_entity);
            }
-        }

-        if let Some(prepass_depth_texture) = &view_prepass_textures.depth {
-            // Copy depth buffer to texture.
-            render_context.command_encoder().copy_texture_to_texture(
-                view_depth_texture.texture.as_image_copy(),
-                prepass_depth_texture.texture.texture.as_image_copy(),
-                view_prepass_textures.size,
-            );
-        }
+            drop(render_pass);
+
+            // Copy the main depth texture to the prepass depth texture
+            if let Some(prepass_depth_texture) = &view_prepass_textures.depth {
+                command_encoder.copy_texture_to_texture(
+                    view_depth_texture.texture.as_image_copy(),
+                    prepass_depth_texture.texture.texture.as_image_copy(),
+                    view_prepass_textures.size,
+                );
+            }
+
+            command_encoder.finish()
+        });

        Ok(())
    }
--- a/crates/bevy_core_pipeline/src/prepass/node.rs
+++ b/crates/bevy_core_pipeline/src/prepass/node.rs
@ -1,12 +1,10 @@
 use bevy_ecs::prelude::*;
 use bevy_ecs::query::QueryItem;
-use bevy_render::render_graph::ViewNode;
-use bevy_render::render_resource::StoreOp;
 use bevy_render::{
    camera::ExtractedCamera,
-    render_graph::{NodeRunError, RenderGraphContext},
-    render_phase::RenderPhase,
-    render_resource::RenderPassDescriptor,
+    render_graph::{NodeRunError, RenderGraphContext, ViewNode},
+    render_phase::{RenderPhase, TrackedRenderPass},
+    render_resource::{CommandEncoderDescriptor, RenderPassDescriptor, StoreOp},
    renderer::RenderContext,
    view::ViewDepthTexture,
 };
@ -31,10 +29,10 @@ impl ViewNode for PrepassNode {
        Option<&'static DeferredPrepass>,
    );

-    fn run(
+    fn run<'w>(
        &self,
        graph: &mut RenderGraphContext,
-        render_context: &mut RenderContext,
+        render_context: &mut RenderContext<'w>,
        (
            camera,
            opaque_prepass_phase,
@ -42,11 +40,9 @@ impl ViewNode for PrepassNode {
            view_depth_texture,
            view_prepass_textures,
            deferred_prepass,
-        ): QueryItem<Self::ViewQuery>,
-        world: &World,
+        ): QueryItem<'w, Self::ViewQuery>,
+        world: &'w World,
    ) -> Result<(), NodeRunError> {
-        let view_entity = graph.view_entity();
-
        let mut color_attachments = vec![
            view_prepass_textures
                .normal
@ -56,55 +52,72 @@ impl ViewNode for PrepassNode {
                .motion_vectors
                .as_ref()
                .map(|motion_vectors_texture| motion_vectors_texture.get_attachment()),
-            // Use None in place of Deferred attachments
+            // Use None in place of deferred attachments
            None,
            None,
        ];

+        // If all color attachments are none: clear the color attachment list so that no fragment shader is required
        if color_attachments.iter().all(Option::is_none) {
-            // all attachments are none: clear the attachment list so that no fragment shader is required
            color_attachments.clear();
        }

-        {
-            // Set up the pass descriptor with the depth attachment and optional color attachments
-            let mut render_pass = render_context.begin_tracked_render_pass(RenderPassDescriptor {
+        let depth_stencil_attachment = Some(view_depth_texture.get_attachment(StoreOp::Store));
+
+        let view_entity = graph.view_entity();
+        render_context.add_command_buffer_generation_task(move |render_device| {
+            #[cfg(feature = "trace")]
+            let _prepass_span = info_span!("prepass").entered();
+
+            // Command encoder setup
+            let mut command_encoder =
+                render_device.create_command_encoder(&CommandEncoderDescriptor {
+                    label: Some("prepass_command_encoder"),
+                });
+
+            // Render pass setup
+            let render_pass = command_encoder.begin_render_pass(&RenderPassDescriptor {
                label: Some("prepass"),
                color_attachments: &color_attachments,
-                depth_stencil_attachment: Some(view_depth_texture.get_attachment(StoreOp::Store)),
+                depth_stencil_attachment,
                timestamp_writes: None,
                occlusion_query_set: None,
            });
+            let mut render_pass = TrackedRenderPass::new(&render_device, render_pass);
            if let Some(viewport) = camera.viewport.as_ref() {
                render_pass.set_camera_viewport(viewport);
            }

-            // Always run opaque pass to ensure screen is cleared
-            {
-                // Run the prepass, sorted front-to-back
+            // Opaque draws
+            if !opaque_prepass_phase.items.is_empty() {
                #[cfg(feature = "trace")]
                let _opaque_prepass_span = info_span!("opaque_prepass").entered();
                opaque_prepass_phase.render(&mut render_pass, world, view_entity);
            }

+            // Alpha masked draws
            if !alpha_mask_prepass_phase.items.is_empty() {
-                // Run the prepass, sorted front-to-back
                #[cfg(feature = "trace")]
                let _alpha_mask_prepass_span = info_span!("alpha_mask_prepass").entered();
                alpha_mask_prepass_phase.render(&mut render_pass, world, view_entity);
            }
-        }
-        if deferred_prepass.is_none() {
-            // Copy if deferred isn't going to
-            if let Some(prepass_depth_texture) = &view_prepass_textures.depth {
-                // Copy depth buffer to texture
-                render_context.command_encoder().copy_texture_to_texture(
-                    view_depth_texture.texture.as_image_copy(),
-                    prepass_depth_texture.texture.texture.as_image_copy(),
-                    view_prepass_textures.size,
-                );
+
+            drop(render_pass);
+
+            // Copy the main depth texture to the prepass depth texture if deferred isn't going to
+            if deferred_prepass.is_none() {
+                if let Some(prepass_depth_texture) = &view_prepass_textures.depth {
+                    command_encoder.copy_texture_to_texture(
+                        view_depth_texture.texture.as_image_copy(),
+                        prepass_depth_texture.texture.texture.as_image_copy(),
+                        view_prepass_textures.size,
+                    );
+                }
            }
-        }
+
+            command_encoder.finish()
+        });
+
        Ok(())
    }
 }
--- a/crates/bevy_pbr/src/render/light.rs
+++ b/crates/bevy_pbr/src/render/light.rs
@ -16,6 +16,8 @@ use bevy_render::{
    Extract,
 };
 use bevy_transform::{components::GlobalTransform, prelude::Transform};
+#[cfg(feature = "trace")]
+use bevy_utils::tracing::info_span;
 use bevy_utils::{
    nonmax::NonMaxU32,
    tracing::{error, warn},
@ -1780,11 +1782,11 @@ impl Node for ShadowPassNode {
        self.view_light_query.update_archetypes(world);
    }

-    fn run(
+    fn run<'w>(
        &self,
        graph: &mut RenderGraphContext,
-        render_context: &mut RenderContext,
-        world: &World,
+        render_context: &mut RenderContext<'w>,
+        world: &'w World,
    ) -> Result<(), NodeRunError> {
        let view_entity = graph.view_entity();
        if let Ok(view_lights) = self.main_view_query.get_manual(world, view_entity) {
@ -1794,22 +1796,32 @@ impl Node for ShadowPassNode {
                    .get_manual(world, view_light_entity)
                    .unwrap();

-                if shadow_phase.items.is_empty() {
-                    continue;
-                }
+                let depth_stencil_attachment =
+                    Some(view_light.depth_attachment.get_attachment(StoreOp::Store));

-                let mut render_pass =
-                    render_context.begin_tracked_render_pass(RenderPassDescriptor {
+                render_context.add_command_buffer_generation_task(move |render_device| {
+                    #[cfg(feature = "trace")]
+                    let _shadow_pass_span = info_span!("shadow_pass").entered();
+
+                    let mut command_encoder =
+                        render_device.create_command_encoder(&CommandEncoderDescriptor {
+                            label: Some("shadow_pass_command_encoder"),
+                        });
+
+                    let render_pass = command_encoder.begin_render_pass(&RenderPassDescriptor {
                        label: Some(&view_light.pass_name),
                        color_attachments: &[],
-                        depth_stencil_attachment: Some(
-                            view_light.depth_attachment.get_attachment(StoreOp::Store),
-                        ),
+                        depth_stencil_attachment,
                        timestamp_writes: None,
                        occlusion_query_set: None,
                    });
+                    let mut render_pass = TrackedRenderPass::new(&render_device, render_pass);

-                shadow_phase.render(&mut render_pass, world, view_light_entity);
+                    shadow_phase.render(&mut render_pass, world, view_light_entity);
+
+                    drop(render_pass);
+                    command_encoder.finish()
+                });
            }
        }

--- a/crates/bevy_render/src/render_graph/node.rs
+++ b/crates/bevy_render/src/render_graph/node.rs
@ -77,11 +77,11 @@ pub trait Node: Downcast + Send + Sync + 'static {
    /// Runs the graph node logic, issues draw calls, updates the output slots and
    /// optionally queues up subgraphs for execution. The graph data, input and output values are
    /// passed via the [`RenderGraphContext`].
-    fn run(
+    fn run<'w>(
        &self,
        graph: &mut RenderGraphContext,
-        render_context: &mut RenderContext,
-        world: &World,
+        render_context: &mut RenderContext<'w>,
+        world: &'w World,
    ) -> Result<(), NodeRunError>;
 }

@ -346,12 +346,12 @@ pub trait ViewNode {
    /// Runs the graph node logic, issues draw calls, updates the output slots and
    /// optionally queues up subgraphs for execution. The graph data, input and output values are
    /// passed via the [`RenderGraphContext`].
-    fn run(
+    fn run<'w>(
        &self,
        graph: &mut RenderGraphContext,
-        render_context: &mut RenderContext,
-        view_query: QueryItem<Self::ViewQuery>,
-        world: &World,
+        render_context: &mut RenderContext<'w>,
+        view_query: QueryItem<'w, Self::ViewQuery>,
+        world: &'w World,
    ) -> Result<(), NodeRunError>;
 }

@ -388,11 +388,11 @@ where
        self.node.update(world);
    }

-    fn run(
+    fn run<'w>(
        &self,
        graph: &mut RenderGraphContext,
-        render_context: &mut RenderContext,
-        world: &World,
+        render_context: &mut RenderContext<'w>,
+        world: &'w World,
    ) -> Result<(), NodeRunError> {
        let Ok(view) = self.view_query.get_manual(world, graph.view_entity()) else {
            return Ok(());
--- a/crates/bevy_render/src/renderer/graph_runner.rs
+++ b/crates/bevy_render/src/renderer/graph_runner.rs
@ -57,10 +57,11 @@ impl RenderGraphRunner {
        graph: &RenderGraph,
        render_device: RenderDevice,
        queue: &wgpu::Queue,
+        adapter: &wgpu::Adapter,
        world: &World,
        finalizer: impl FnOnce(&mut wgpu::CommandEncoder),
    ) -> Result<(), RenderGraphRunnerError> {
-        let mut render_context = RenderContext::new(render_device);
+        let mut render_context = RenderContext::new(render_device, adapter.get_info());
        Self::run_graph(graph, None, &mut render_context, world, &[], None)?;
        finalizer(render_context.command_encoder());

@ -72,11 +73,11 @@ impl RenderGraphRunner {
        Ok(())
    }

-    fn run_graph(
+    fn run_graph<'w>(
        graph: &RenderGraph,
        sub_graph: Option<InternedRenderSubGraph>,
-        render_context: &mut RenderContext,
-        world: &World,
+        render_context: &mut RenderContext<'w>,
+        world: &'w World,
        inputs: &[SlotValue],
        view_entity: Option<Entity>,
    ) -> Result<(), RenderGraphRunnerError> {
--- a/crates/bevy_render/src/renderer/mod.rs
+++ b/crates/bevy_render/src/renderer/mod.rs
@ -2,6 +2,7 @@ mod graph_runner;
 mod render_device;

 use bevy_derive::{Deref, DerefMut};
+use bevy_tasks::ComputeTaskPool;
 use bevy_utils::tracing::{error, info, info_span};
 pub use graph_runner::*;
 pub use render_device::*;
@ -29,11 +30,13 @@ pub fn render_system(world: &mut World, state: &mut SystemState<Query<Entity, Wi
    let graph = world.resource::<RenderGraph>();
    let render_device = world.resource::<RenderDevice>();
    let render_queue = world.resource::<RenderQueue>();
+    let render_adapter = world.resource::<RenderAdapter>();

    if let Err(e) = RenderGraphRunner::run(
        graph,
        render_device.clone(), // TODO: is this clone really necessary?
        &render_queue.0,
+        &render_adapter.0,
        world,
        |encoder| {
            crate::view::screenshot::submit_screenshot_commands(world, encoder);
@ -298,19 +301,31 @@ pub async fn initialize_renderer(
 ///
 /// The [`RenderDevice`] is used to create render resources and the
 /// [`CommandEncoder`] is used to record a series of GPU operations.
-pub struct RenderContext {
+pub struct RenderContext<'w> {
    render_device: RenderDevice,
    command_encoder: Option<CommandEncoder>,
-    command_buffers: Vec<CommandBuffer>,
+    command_buffer_queue: Vec<QueuedCommandBuffer<'w>>,
+    force_serial: bool,
 }

-impl RenderContext {
+impl<'w> RenderContext<'w> {
    /// Creates a new [`RenderContext`] from a [`RenderDevice`].
-    pub fn new(render_device: RenderDevice) -> Self {
+    pub fn new(render_device: RenderDevice, adapter_info: AdapterInfo) -> Self {
+        // HACK: Parallel command encoding is currently bugged on AMD + Windows + Vulkan with wgpu 0.19.1
+        #[cfg(target_os = "windows")]
+        let force_serial =
+            adapter_info.driver.contains("AMD") && adapter_info.backend == wgpu::Backend::Vulkan;
+        #[cfg(not(target_os = "windows"))]
+        let force_serial = {
+            drop(adapter_info);
+            false
+        };
+
        Self {
            render_device,
            command_encoder: None,
-            command_buffers: Vec::new(),
+            command_buffer_queue: Vec::new(),
+            force_serial,
        }
    }

@ -342,25 +357,76 @@ impl RenderContext {
        TrackedRenderPass::new(&self.render_device, render_pass)
    }

-    /// Append a [`CommandBuffer`] to the queue.
+    /// Append a [`CommandBuffer`] to the command buffer queue.
    ///
    /// If present, this will flush the currently unflushed [`CommandEncoder`]
-    /// into a [`CommandBuffer`] into the queue before append the provided
+    /// into a [`CommandBuffer`] into the queue before appending the provided
    /// buffer.
    pub fn add_command_buffer(&mut self, command_buffer: CommandBuffer) {
        self.flush_encoder();
-        self.command_buffers.push(command_buffer);
+
+        self.command_buffer_queue
+            .push(QueuedCommandBuffer::Ready(command_buffer));
    }

-    /// Finalizes the queue and returns the queue of [`CommandBuffer`]s.
+    /// Append a function that will generate a [`CommandBuffer`] to the
+    /// command buffer queue, to be run later.
+    ///
+    /// If present, this will flush the currently unflushed [`CommandEncoder`]
+    /// into a [`CommandBuffer`] into the queue before appending the provided
+    /// task.
+    pub fn add_command_buffer_generation_task(
+        &mut self,
+        task: impl FnOnce(RenderDevice) -> CommandBuffer + 'w + Send,
+    ) {
+        self.flush_encoder();
+
+        self.command_buffer_queue
+            .push(QueuedCommandBuffer::Task(Box::new(task)));
+    }
+
+    /// Finalizes and returns the queue of [`CommandBuffer`]s.
+    ///
+    /// This function will wait until all command buffer generation tasks are complete
+    /// by running them in parallel (where supported).
    pub fn finish(mut self) -> Vec<CommandBuffer> {
        self.flush_encoder();
-        self.command_buffers
+
+        let mut command_buffers = Vec::with_capacity(self.command_buffer_queue.len());
+        let mut task_based_command_buffers = ComputeTaskPool::get().scope(|task_pool| {
+            for (i, queued_command_buffer) in self.command_buffer_queue.into_iter().enumerate() {
+                match queued_command_buffer {
+                    QueuedCommandBuffer::Ready(command_buffer) => {
+                        command_buffers.push((i, command_buffer));
+                    }
+                    QueuedCommandBuffer::Task(command_buffer_generation_task) => {
+                        let render_device = self.render_device.clone();
+                        if self.force_serial {
+                            command_buffers
+                                .push((i, command_buffer_generation_task(render_device)));
+                        } else {
+                            task_pool.spawn(async move {
+                                (i, command_buffer_generation_task(render_device))
+                            });
+                        }
+                    }
+                }
+            }
+        });
+        command_buffers.append(&mut task_based_command_buffers);
+        command_buffers.sort_unstable_by_key(|(i, _)| *i);
+        command_buffers.into_iter().map(|(_, cb)| cb).collect()
    }

    fn flush_encoder(&mut self) {
        if let Some(encoder) = self.command_encoder.take() {
-            self.command_buffers.push(encoder.finish());
+            self.command_buffer_queue
+                .push(QueuedCommandBuffer::Ready(encoder.finish()));
        }
    }
 }
+
+enum QueuedCommandBuffer<'w> {
+    Ready(CommandBuffer),
+    Task(Box<dyn FnOnce(RenderDevice) -> CommandBuffer + 'w + Send>),
+}