Parallelize extract_meshes (#9966)

# Objective
`extract_meshes` can easily be one of the most expensive operations in
the blocking extract schedule for 3D apps. It also has no fundamentally
serialized parts and can easily be run across multiple threads. Let's
speed it up by parallelizing it!

## Solution
Use the `ThreadLocal<Cell<Vec<T>>>` approach introduced in #7348 together
with `Query::par_iter` to build a set of thread-local queues, then drain
those queues into the final collections once the parallel iteration has
finished.

## Performance
Measured with `cargo run --profile stress-test --features trace_tracy --example
many_cubes`. Yellow is this PR; red is main.

`extract_meshes`:


![image](https://github.com/bevyengine/bevy/assets/3137680/9d45aa2e-3cfa-4fad-9c08-53498b51a73b)

An average reduction from 1.2ms to 770us is seen, a 41.6% improvement.

Note: these measurements do not yet include #9950's changes, so the
speedup may be even greater once that PR is merged.
This commit is contained in:
James Liu 2023-10-01 02:44:03 -07:00 committed by GitHub
parent 1d7577fc42
commit a1a81e5721
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 52 additions and 40 deletions

View file

@ -33,3 +33,4 @@ bytemuck = { version = "1", features = ["derive"] }
naga_oil = "0.8"
radsort = "0.1"
smallvec = "1.6"
thread_local = "1.0"

View file

@ -45,6 +45,8 @@ use bevy_render::{
};
use bevy_transform::components::GlobalTransform;
use bevy_utils::{tracing::error, EntityHashMap, HashMap, Hashed};
use std::cell::Cell;
use thread_local::ThreadLocal;
use crate::render::{
morph::{
@ -246,6 +248,7 @@ pub fn extract_meshes(
mut commands: Commands,
mut previous_len: Local<usize>,
mut render_mesh_instances: ResMut<RenderMeshInstances>,
mut thread_local_queues: Local<ThreadLocal<Cell<Vec<(Entity, RenderMeshInstance)>>>>,
meshes_query: Extract<
Query<(
Entity,
@ -259,50 +262,58 @@ pub fn extract_meshes(
)>,
>,
) {
meshes_query.par_iter().for_each(
|(
entity,
view_visibility,
transform,
previous_transform,
handle,
not_receiver,
not_caster,
no_automatic_batching,
)| {
if !view_visibility.get() {
return;
}
let transform = transform.affine();
let previous_transform = previous_transform.map(|t| t.0).unwrap_or(transform);
let mut flags = if not_receiver.is_some() {
MeshFlags::empty()
} else {
MeshFlags::SHADOW_RECEIVER
};
if transform.matrix3.determinant().is_sign_positive() {
flags |= MeshFlags::SIGN_DETERMINANT_MODEL_3X3;
}
let transforms = MeshTransforms {
transform: (&transform).into(),
previous_transform: (&previous_transform).into(),
flags: flags.bits(),
};
let tls = thread_local_queues.get_or_default();
let mut queue = tls.take();
queue.push((
entity,
RenderMeshInstance {
mesh_asset_id: handle.id(),
transforms,
shadow_caster: not_caster.is_none(),
material_bind_group_id: MaterialBindGroupId::default(),
automatic_batching: !no_automatic_batching,
},
));
tls.set(queue);
},
);
render_mesh_instances.clear();
let mut entities = Vec::with_capacity(*previous_len);
let visible_meshes = meshes_query.iter().filter(|(_, vis, ..)| vis.get());
for (
entity,
_,
transform,
previous_transform,
handle,
not_receiver,
not_caster,
no_automatic_batching,
) in visible_meshes
{
let transform = transform.affine();
let previous_transform = previous_transform.map(|t| t.0).unwrap_or(transform);
let mut flags = if not_receiver.is_some() {
MeshFlags::empty()
} else {
MeshFlags::SHADOW_RECEIVER
};
if transform.matrix3.determinant().is_sign_positive() {
flags |= MeshFlags::SIGN_DETERMINANT_MODEL_3X3;
}
let transforms = MeshTransforms {
transform: (&transform).into(),
previous_transform: (&previous_transform).into(),
flags: flags.bits(),
};
for queue in thread_local_queues.iter_mut() {
// FIXME: Remove this - it is just a workaround to enable rendering to work as
// render commands require an entity to exist at the moment.
entities.push((entity, Mesh3d));
render_mesh_instances.insert(
entity,
RenderMeshInstance {
mesh_asset_id: handle.id(),
transforms,
shadow_caster: not_caster.is_none(),
material_bind_group_id: MaterialBindGroupId::default(),
automatic_batching: !no_automatic_batching,
},
);
entities.extend(queue.get_mut().iter().map(|(e, _)| (*e, Mesh3d)));
render_mesh_instances.extend(queue.get_mut().drain(..));
}
*previous_len = entities.len();
commands.insert_or_spawn_batch(entities);