bevy/crates/bevy_pbr/src/render/skin.rs

use core::mem::{self, size_of};

use bevy_asset::Assets;
use bevy_ecs::{entity::EntityHashMap, prelude::*};
use bevy_math::Mat4;
use bevy_render::{
    batching::NoAutomaticBatching,
    mesh::skinning::{SkinnedMesh, SkinnedMeshInverseBindposes},
    render_resource::{BufferUsages, RawBufferVec},
    renderer::{RenderDevice, RenderQueue},
    view::ViewVisibility,
    Extract,
};
use bevy_transform::prelude::GlobalTransform;

/// Maximum number of joints supported for skinned meshes.
pub const MAX_JOINTS: usize = 256;

#[derive(Component)]
pub struct SkinIndex {
    pub index: u32,
}

impl SkinIndex {
    /// Index to be in address space based on the size of a skin uniform.
    const fn new(start: usize) -> Self {
        SkinIndex {
            index: (start * size_of::<Mat4>()) as u32,
        }
    }
}

/// Maps each skinned mesh to the applicable offset within the [`SkinUniforms`]
/// buffer.
///
/// We store both the current frame's joint matrices and the previous frame's
/// joint matrices for the purposes of motion vector calculation.
#[derive(Default, Resource)]
pub struct SkinIndices {
    /// Maps each skinned mesh to the applicable offset within
    /// [`SkinUniforms::current_buffer`].
    pub current: EntityHashMap<SkinIndex>,

    /// Maps each skinned mesh to the applicable offset within
    /// [`SkinUniforms::prev_buffer`].
    pub prev: EntityHashMap<SkinIndex>,
}

/// The GPU buffers containing joint matrices for all skinned meshes.
///
/// This is double-buffered: we store the joint matrices of each mesh for the
/// previous frame in addition to those of each mesh for the current frame. This
/// is for motion vector calculation. Every frame, we swap buffers and overwrite
/// the joint matrix buffer from two frames ago with the data for the current
/// frame.
///
/// Notes on implementation: see comment on top of the `extract_skins` system.
#[derive(Resource)]
pub struct SkinUniforms {
    /// Stores all the joint matrices for skinned meshes in the current frame.
    pub current_buffer: RawBufferVec<Mat4>,
    /// Stores all the joint matrices for skinned meshes in the previous frame.
    pub prev_buffer: RawBufferVec<Mat4>,
}

impl Default for SkinUniforms {
    fn default() -> Self {
        Self {
            current_buffer: RawBufferVec::new(BufferUsages::UNIFORM),
            prev_buffer: RawBufferVec::new(BufferUsages::UNIFORM),
        }
    }
}

pub fn prepare_skins(
    render_device: Res<RenderDevice>,
    render_queue: Res<RenderQueue>,
    mut uniform: ResMut<SkinUniforms>,
) {
    if uniform.current_buffer.is_empty() {
        return;
    }

    let len = uniform.current_buffer.len();
    uniform.current_buffer.reserve(len, &render_device);
    uniform
        .current_buffer
        .write_buffer(&render_device, &render_queue);

    // We don't need to write `uniform.prev_buffer` because we already wrote it
    // last frame, and the data should still be on the GPU.
}

// Notes on implementation:
// We define the uniform binding as an array<mat4x4<f32>, N> in the shader,
// where N is the maximum number of Mat4s we can fit in the uniform binding,
// which may be as little as 16kB or 64kB. But, we may not need all N.
// We may only need, for example, 10.
//
// If we used uniform buffers ‘normally’ then we would have to write a full
// binding of data for each dynamic offset binding, which is wasteful, makes
// the buffer much larger than it needs to be, and uses more memory bandwidth
// to transfer the data, which then costs frame time So @superdump came up
// with this design: just bind data at the specified offset and interpret
// the data at that offset as an array<T, N> regardless of what is there.
//
// So instead of writing N Mat4s when you only need 10, you write 10, and
// then pad up to the next dynamic offset alignment. Then write the next.
// And for the last dynamic offset binding, make sure there is a full binding
// of data after it so that the buffer is of size
// `last dynamic offset` + `array<mat4x4<f32>>`.
//
// Then when binding the first dynamic offset, the first 10 entries in the array
// are what you expect, but if you read the 11th you’re reading ‘invalid’ data
// which could be padding or could be from the next binding.
//
// In this way, we can pack ‘variable sized arrays’ into uniform buffer bindings
// which normally only support fixed size arrays. You just have to make sure
// in the shader that you only read the values that are valid for that binding.
pub fn extract_skins(
    skin_indices: ResMut<SkinIndices>,
    uniform: ResMut<SkinUniforms>,
    query: Extract<Query<(Entity, &ViewVisibility, &SkinnedMesh)>>,
    inverse_bindposes: Extract<Res<Assets<SkinnedMeshInverseBindposes>>>,
    joints: Extract<Query<&GlobalTransform>>,
) {
    // Borrow check workaround.
    let (skin_indices, uniform) = (skin_indices.into_inner(), uniform.into_inner());

    // Swap buffers. We need to keep the previous frame's buffer around for the
    // purposes of motion vector computation.
    mem::swap(&mut skin_indices.current, &mut skin_indices.prev);
    mem::swap(&mut uniform.current_buffer, &mut uniform.prev_buffer);
    skin_indices.current.clear();
    uniform.current_buffer.clear();

    let mut last_start = 0;

    // PERF: This can be expensive, can we move this to prepare?
    for (entity, view_visibility, skin) in &query {
        if !view_visibility.get() {
            continue;
        }
        let buffer = &mut uniform.current_buffer;
        let Some(inverse_bindposes) = inverse_bindposes.get(&skin.inverse_bindposes) else {
            continue;
        };
        let start = buffer.len();

        let target = start + skin.joints.len().min(MAX_JOINTS);
        buffer.extend(
            joints
                .iter_many(&skin.joints)
                .zip(inverse_bindposes.iter())
                .take(MAX_JOINTS)
                .map(|(joint, bindpose)| joint.affine() * *bindpose),
        );
        // iter_many will skip any failed fetches. This will cause it to assign the wrong bones,
        // so just bail by truncating to the start.
        if buffer.len() != target {
            buffer.truncate(start);
            continue;
        }
        last_start = last_start.max(start);

        // Pad to 256 byte alignment
        while buffer.len() % 4 != 0 {
            buffer.push(Mat4::ZERO);
        }

        skin_indices.current.insert(entity, SkinIndex::new(start));
    }

    // Pad out the buffer to ensure that there's enough space for bindings
    while uniform.current_buffer.len() - last_start < MAX_JOINTS {
        uniform.current_buffer.push(Mat4::ZERO);
    }
}

// NOTE: The skinned joints uniform buffer has to be bound at a dynamic offset per
// entity and so cannot currently be batched.
pub fn no_automatic_skin_batching(
    mut commands: Commands,
    query: Query<Entity, (With<SkinnedMesh>, Without<NoAutomaticBatching>)>,
) {
    for entity in &query {
        commands.entity(entity).try_insert(NoAutomaticBatching);
    }
}
-												Add `core` and `alloc` over `std` Lints (#15281)

# Objective

- Fixes #6370
- Closes #6581

## Solution

- Added the following lints to the workspace:
  - `std_instead_of_core`
  - `std_instead_of_alloc`
  - `alloc_instead_of_core`
- Used `cargo +nightly fmt` with [item level use
formatting](https://rust-lang.github.io/rustfmt/?version=v1.6.0&search=#Item%5C%3A)
to split all `use` statements into single items.
- Used `cargo clippy --workspace --all-targets --all-features --fix
--allow-dirty` to _attempt_ to resolve the new linting issues, and
intervened where the lint was unable to resolve the issue automatically
(usually due to needing an `extern crate alloc;` statement in a crate
root).
- Manually removed certain uses of `std` where negative feature gating
prevented `--all-features` from finding the offending uses.
- Used `cargo +nightly fmt` with [crate level use
formatting](https://rust-lang.github.io/rustfmt/?version=v1.6.0&search=#Crate%5C%3A)
to re-merge all `use` statements matching Bevy's previous styling.
- Manually fixed cases where the `fmt` tool could not re-merge `use`
statements due to conditional compilation attributes.

## Testing

- Ran CI locally

## Migration Guide

The MSRV is now 1.81. Please update to this version or higher.

## Notes

- This is a _massive_ change to try and push through, which is why I've
outlined the semi-automatic steps I used to create this PR, in case this
fails and someone else tries again in the future.
- Making this change has no impact on user code, but does mean Bevy
contributors will be warned to use `core` and `alloc` instead of `std`
where possible.
- This lint is a critical first step towards investigating `no_std`
options for Bevy.

---------

Co-authored-by: François Mockers <francois.mockers@vleue.com>
											
										
										
											2024-09-27 00:59:59 +00:00
+								use core::mem::{self, size_of};
-												Implement motion vectors and TAA for skinned meshes and meshes with morph targets. (#13572)

This is a revamped equivalent to #9902, though it shares none of the
code. It handles all special cases that I've tested correctly.

The overall technique consists of double-buffering the joint matrix and
morph weights buffers, as most of the previous attempts to solve this
problem did. The process is generally straightforward. Note that, to
avoid regressing the ability of mesh extraction, skin extraction, and
morph target extraction to run in parallel, I had to add a new system to
rendering, `set_mesh_motion_vector_flags`. The comment there explains
the details; it generally runs very quickly.

I've tested this with modified versions of the `animated_fox`,
`morph_targets`, and `many_foxes` examples that add TAA, and the patch
works. To avoid bloating those examples, I didn't add switches for TAA
to them.

Addresses points (1) and (2) of #8423.

## Changelog

### Fixed

* Motion vectors, and therefore TAA, are now supported for meshes with
skins and/or morph targets.
											
										
										
											2024-05-31 17:02:28 +00:00
-												Move skin code to a separate module (#9899)

# Objective

mesh.rs is infamously large. We could split off unrelated code.

## Solution

Morph targets are very similar to skinning and have their own module. We
move skinned meshes to an independent module like morph targets and give
the systems similar names.

### Open questions

Should the skinning systems and structs stay public?

---

## Migration Guide

Renamed skinning systems, resources and components:

- extract_skinned_meshes -> extract_skins
- prepare_skinned_meshes -> prepare_skins
- SkinnedMeshUniform -> SkinUniform
- SkinnedMeshJoints -> SkinIndex

---------

Co-authored-by: François <mockersf@gmail.com>
Co-authored-by: vero <email@atlasdostal.com>
											
										
										
											2023-09-25 18:40:22 +00:00
+								use bevy_asset::Assets;
-												Simpler lint fixes: makes `ci lints` work but disables a lint for now (#15376)

Takes the first two commits from #15375 and adds suggestions from this
comment:
https://github.com/bevyengine/bevy/pull/15375#issuecomment-2366968300

See #15375 for more reasoning/motivation.

## Rebasing (rerunning)

```rust
git switch simpler-lint-fixes
git reset --hard main
cargo fmt --all -- --unstable-features --config normalize_comments=true,imports_granularity=Crate
cargo fmt --all
git add --update
git commit --message "rustfmt"
cargo clippy --workspace --all-targets --all-features --fix
cargo fmt --all -- --unstable-features --config normalize_comments=true,imports_granularity=Crate
cargo fmt --all
git add --update
git commit --message "clippy"
git cherry-pick e6c0b94f6795222310fb812fa5c4512661fc7887
```
											
										
										
											2024-09-24 11:42:59 +00:00
+								use bevy_ecs::{entity::EntityHashMap, prelude::*};
-												Move skin code to a separate module (#9899)

# Objective

mesh.rs is infamously large. We could split off unrelated code.

## Solution

Morph targets are very similar to skinning and have their own module. We
move skinned meshes to an independent module like morph targets and give
the systems similar names.

### Open questions

Should the skinning systems and structs stay public?

---

## Migration Guide

Renamed skinning systems, resources and components:

- extract_skinned_meshes -> extract_skins
- prepare_skinned_meshes -> prepare_skins
- SkinnedMeshUniform -> SkinUniform
- SkinnedMeshJoints -> SkinIndex

---------

Co-authored-by: François <mockersf@gmail.com>
Co-authored-by: vero <email@atlasdostal.com>
											
										
										
											2023-09-25 18:40:22 +00:00
+								use bevy_math::Mat4;
 								use bevy_render::{
 								    batching::NoAutomaticBatching,
 								    mesh::skinning::{SkinnedMesh, SkinnedMeshInverseBindposes},
-												Add BufferVec, an higher-performance alternative to StorageBuffer, and make GpuArrayBuffer use it. (#13199)

This is an adoption of #12670 plus some documentation fixes. See that PR
for more details.

---

## Changelog

* Renamed `BufferVec` to `RawBufferVec` and added a new `BufferVec`
type.

## Migration Guide
`BufferVec` has been renamed to `RawBufferVec` and a new similar type
has taken the `BufferVec` name.

---------

Co-authored-by: Patrick Walton <pcwalton@mimiga.net>
Co-authored-by: Alice Cecile <alice.i.cecile@gmail.com>
Co-authored-by: IceSentry <IceSentry@users.noreply.github.com>
											
										
										
											2024-05-03 11:39:21 +00:00
+								    render_resource::{BufferUsages, RawBufferVec},
-												Move skin code to a separate module (#9899)

# Objective

mesh.rs is infamously large. We could split off unrelated code.

## Solution

Morph targets are very similar to skinning and have their own module. We
move skinned meshes to an independent module like morph targets and give
the systems similar names.

### Open questions

Should the skinning systems and structs stay public?

---

## Migration Guide

Renamed skinning systems, resources and components:

- extract_skinned_meshes -> extract_skins
- prepare_skinned_meshes -> prepare_skins
- SkinnedMeshUniform -> SkinUniform
- SkinnedMeshJoints -> SkinIndex

---------

Co-authored-by: François <mockersf@gmail.com>
Co-authored-by: vero <email@atlasdostal.com>
											
										
										
											2023-09-25 18:40:22 +00:00
+								    renderer::{RenderDevice, RenderQueue},
 								    view::ViewVisibility,
 								    Extract,
 								};
 								use bevy_transform::prelude::GlobalTransform;
 								/// Maximum number of joints supported for skinned meshes.
 								pub const MAX_JOINTS: usize = 256;
 								#[derive(Component)]
 								pub struct SkinIndex {
 								    pub index: u32,
 								}
-												Use EntityHashMap<Entity, T> for render world entity storage for better performance (#9903)

# Objective

- Improve rendering performance, particularly by avoiding the large
system commands costs of using the ECS in the way that the render world
does.

## Solution

- Define `EntityHasher` that calculates a hash from the
`Entity.to_bits()` by `i | (i.wrapping_mul(0x517cc1b727220a95) << 32)`.
`0x517cc1b727220a95` is something like `u64::MAX / N` for N that gives a
value close to π and that works well for hashing. Thanks for @SkiFire13
for the suggestion and to @nicopap for alternative suggestions and
discussion. This approach comes from `rustc-hash` (a.k.a. `FxHasher`)
with some tweaks for the case of hashing an `Entity`. `FxHasher` and
`SeaHasher` were also tested but were significantly slower.
- Define `EntityHashMap` type that uses the `EntityHashser`
- Use `EntityHashMap<Entity, T>` for render world entity storage,
including:
- `RenderMaterialInstances` - contains the `AssetId<M>` of the material
associated with the entity. Also for 2D.
- `RenderMeshInstances` - contains mesh transforms, flags and properties
about mesh entities. Also for 2D.
- `SkinIndices` and `MorphIndices` - contains the skin and morph index
for an entity, respectively
  - `ExtractedSprites`
  - `ExtractedUiNodes`

## Benchmarks

All benchmarks have been conducted on an M1 Max connected to AC power.
The tests are run for 1500 frames. The 1000th frame is captured for
comparison to check for visual regressions. There were none.

### 2D Meshes

`bevymark --benchmark --waves 160 --per-wave 1000 --mode mesh2d`

#### `--ordered-z`

This test spawns the 2D meshes with z incrementing back to front, which
is the ideal arrangement allocation order as it matches the sorted
render order which means lookups have a high cache hit rate.

<img width="1112" alt="Screenshot 2023-09-27 at 07 50 45"
src="https://github.com/bevyengine/bevy/assets/302146/e140bc98-7091-4a3b-8ae1-ab75d16d2ccb">

-39.1% median frame time.

#### Random

This test spawns the 2D meshes with random z. This not only makes the
batching and transparent 2D pass lookups get a lot of cache misses, it
also currently means that the meshes are almost certain to not be
batchable.

<img width="1108" alt="Screenshot 2023-09-27 at 07 51 28"
src="https://github.com/bevyengine/bevy/assets/302146/29c2e813-645a-43ce-982a-55df4bf7d8c4">

-7.2% median frame time.

### 3D Meshes

`many_cubes --benchmark`

<img width="1112" alt="Screenshot 2023-09-27 at 07 51 57"
src="https://github.com/bevyengine/bevy/assets/302146/1a729673-3254-4e2a-9072-55e27c69f0fc">

-7.7% median frame time.

### Sprites

**NOTE: On `main` sprites are using `SparseSet<Entity, T>`!**

`bevymark --benchmark --waves 160 --per-wave 1000 --mode sprite`

#### `--ordered-z`

This test spawns the sprites with z incrementing back to front, which is
the ideal arrangement allocation order as it matches the sorted render
order which means lookups have a high cache hit rate.

<img width="1116" alt="Screenshot 2023-09-27 at 07 52 31"
src="https://github.com/bevyengine/bevy/assets/302146/bc8eab90-e375-4d31-b5cd-f55f6f59ab67">

+13.0% median frame time.

#### Random

This test spawns the sprites with random z. This makes the batching and
transparent 2D pass lookups get a lot of cache misses.

<img width="1109" alt="Screenshot 2023-09-27 at 07 53 01"
src="https://github.com/bevyengine/bevy/assets/302146/22073f5d-99a7-49b0-9584-d3ac3eac3033">

+0.6% median frame time.

### UI

**NOTE: On `main` UI is using `SparseSet<Entity, T>`!**

`many_buttons`

<img width="1111" alt="Screenshot 2023-09-27 at 07 53 26"
src="https://github.com/bevyengine/bevy/assets/302146/66afd56d-cbe4-49e7-8b64-2f28f6043d85">

+15.1% median frame time.

## Alternatives

- Cart originally suggested trying out `SparseSet<Entity, T>` and indeed
that is slightly faster under ideal conditions. However,
`PassHashMap<Entity, T>` has better worst case performance when data is
randomly distributed, rather than in sorted render order, and does not
have the worst case memory usage that `SparseSet`'s dense `Vec<usize>`
that maps from the `Entity` index to sparse index into `Vec<T>`. This
dense `Vec` has to be as large as the largest Entity index used with the
`SparseSet`.
- I also tested `PassHashMap<u32, T>`, intending to use `Entity.index()`
as the key, but this proved to sometimes be slower and mostly no
different.
- The only outstanding approach that has not been implemented and tested
is to _not_ clear the render world of its entities each frame. That has
its own problems, though they could perhaps be solved.
- Performance-wise, if the entities and their component data were not
cleared, then they would incur table moves on spawn, and should not
thereafter, rather just their component data would be overwritten.
Ideally we would have a neat way of either updating data in-place via
`&mut T` queries, or inserting components if not present. This would
likely be quite cumbersome to have to remember to do everywhere, but
perhaps it only needs to be done in the more performance-sensitive
systems.
- The main problem to solve however is that we want to both maintain a
mapping between main world entities and render world entities, be able
to run the render app and world in parallel with the main app and world
for pipelined rendering, and at the same time be able to spawn entities
in the render world in such a way that those Entity ids do not collide
with those spawned in the main world. This is potentially quite
solvable, but could well be a lot of ECS work to do it in a way that
makes sense.

---

## Changelog

- Changed: Component data for entities to be drawn are no longer stored
on entities in the render world. Instead, data is stored in a
`EntityHashMap<Entity, T>` in various resources. This brings significant
performance benefits due to the way the render app clears entities every
frame. Resources of most interest are `RenderMeshInstances` and
`RenderMaterialInstances`, and their 2D counterparts.

## Migration Guide

Previously the render app extracted mesh entities and their component
data from the main world and stored them as entities and components in
the render world. Now they are extracted into essentially
`EntityHashMap<Entity, T>` where `T` are structs containing an
appropriate group of data. This means that while extract set systems
will continue to run extract queries against the main world they will
store their data in hash maps. Also, systems in later sets will either
need to look up entities in the available resources such as
`RenderMeshInstances`, or maintain their own `EntityHashMap<Entity, T>`
for their own data.

Before:
```rust
fn queue_custom(
    material_meshes: Query<(Entity, &MeshTransforms, &Handle<Mesh>), With<InstanceMaterialData>>,
) {
    ...
    for (entity, mesh_transforms, mesh_handle) in &material_meshes {
        ...
    }
}
```

After:
```rust
fn queue_custom(
    render_mesh_instances: Res<RenderMeshInstances>,
    instance_entities: Query<Entity, With<InstanceMaterialData>>,
) {
    ...
    for entity in &instance_entities {
        let Some(mesh_instance) = render_mesh_instances.get(&entity) else { continue; };
        // The mesh handle in `AssetId<Mesh>` form, and the `MeshTransforms` can now
        // be found in `mesh_instance` which is a `RenderMeshInstance`
        ...
    }
}
```

---------

Co-authored-by: robtfm <50659922+robtfm@users.noreply.github.com>
											
										
										
											2023-09-27 08:28:28 +00:00
-												Move skin code to a separate module (#9899)

# Objective

mesh.rs is infamously large. We could split off unrelated code.

## Solution

Morph targets are very similar to skinning and have their own module. We
move skinned meshes to an independent module like morph targets and give
the systems similar names.

### Open questions

Should the skinning systems and structs stay public?

---

## Migration Guide

Renamed skinning systems, resources and components:

- extract_skinned_meshes -> extract_skins
- prepare_skinned_meshes -> prepare_skins
- SkinnedMeshUniform -> SkinUniform
- SkinnedMeshJoints -> SkinIndex

---------

Co-authored-by: François <mockersf@gmail.com>
Co-authored-by: vero <email@atlasdostal.com>
											
										
										
											2023-09-25 18:40:22 +00:00
+								impl SkinIndex {
-												Fix intra-doc links and make CI test them (#14076)

# Objective

- Bevy currently has lot of invalid intra-doc links, let's fix them!
- Also make CI test them, to avoid future regressions.
- Helps with #1983 (but doesn't fix it, as there could still be explicit
links to docs.rs that are broken)

## Solution

- Make `cargo r -p ci -- doc-check` check fail on warnings (could also
be changed to just some specific lints)
- Manually fix all the warnings (note that in some cases it was unclear
to me what the fix should have been, I'll try to highlight them in a
self-review)
											
										
										
											2024-07-11 13:08:31 +00:00
+								    /// Index to be in address space based on the size of a skin uniform.
-												Move skin code to a separate module (#9899)

# Objective

mesh.rs is infamously large. We could split off unrelated code.

## Solution

Morph targets are very similar to skinning and have their own module. We
move skinned meshes to an independent module like morph targets and give
the systems similar names.

### Open questions

Should the skinning systems and structs stay public?

---

## Migration Guide

Renamed skinning systems, resources and components:

- extract_skinned_meshes -> extract_skins
- prepare_skinned_meshes -> prepare_skins
- SkinnedMeshUniform -> SkinUniform
- SkinnedMeshJoints -> SkinIndex

---------

Co-authored-by: François <mockersf@gmail.com>
Co-authored-by: vero <email@atlasdostal.com>
											
										
										
											2023-09-25 18:40:22 +00:00
+								    const fn new(start: usize) -> Self {
 								        SkinIndex {
-												Apply unused_qualifications lint (#14828)

# Objective

Fixes #14782

## Solution

Enable the lint and fix all upcoming hints (`--fix`). Also tried to
figure out the false-positive (see review comment). Maybe split this PR
up into multiple parts where only the last one enables the lint, so some
can already be merged resulting in less many files touched / less
potential for merge conflicts?

Currently, there are some cases where it might be easier to read the
code with the qualifier, so perhaps remove the import of it and adapt
its cases? In the current stage it's just a plain adoption of the
suggestions in order to have a base to discuss.

## Testing

`cargo clippy` and `cargo run -p ci` are happy.
											
										
										
											2024-08-21 12:29:33 +00:00
+								            index: (start * size_of::<Mat4>()) as u32,
-												Move skin code to a separate module (#9899)

# Objective

mesh.rs is infamously large. We could split off unrelated code.

## Solution

Morph targets are very similar to skinning and have their own module. We
move skinned meshes to an independent module like morph targets and give
the systems similar names.

### Open questions

Should the skinning systems and structs stay public?

---

## Migration Guide

Renamed skinning systems, resources and components:

- extract_skinned_meshes -> extract_skins
- prepare_skinned_meshes -> prepare_skins
- SkinnedMeshUniform -> SkinUniform
- SkinnedMeshJoints -> SkinIndex

---------

Co-authored-by: François <mockersf@gmail.com>
Co-authored-by: vero <email@atlasdostal.com>
											
										
										
											2023-09-25 18:40:22 +00:00
+								        }
 								    }
 								}
-												Implement motion vectors and TAA for skinned meshes and meshes with morph targets. (#13572)

This is a revamped equivalent to #9902, though it shares none of the
code. It handles all special cases that I've tested correctly.

The overall technique consists of double-buffering the joint matrix and
morph weights buffers, as most of the previous attempts to solve this
problem did. The process is generally straightforward. Note that, to
avoid regressing the ability of mesh extraction, skin extraction, and
morph target extraction to run in parallel, I had to add a new system to
rendering, `set_mesh_motion_vector_flags`. The comment there explains
the details; it generally runs very quickly.

I've tested this with modified versions of the `animated_fox`,
`morph_targets`, and `many_foxes` examples that add TAA, and the patch
works. To avoid bloating those examples, I didn't add switches for TAA
to them.

Addresses points (1) and (2) of #8423.

## Changelog

### Fixed

* Motion vectors, and therefore TAA, are now supported for meshes with
skins and/or morph targets.
											
										
										
											2024-05-31 17:02:28 +00:00
+								/// Maps each skinned mesh to the applicable offset within the [`SkinUniforms`]
 								/// buffer.
 								///
 								/// We store both the current frame's joint matrices and the previous frame's
 								/// joint matrices for the purposes of motion vector calculation.
 								#[derive(Default, Resource)]
 								pub struct SkinIndices {
 								    /// Maps each skinned mesh to the applicable offset within
 								    /// [`SkinUniforms::current_buffer`].
 								    pub current: EntityHashMap<SkinIndex>,
 								    /// Maps each skinned mesh to the applicable offset within
 								    /// [`SkinUniforms::prev_buffer`].
 								    pub prev: EntityHashMap<SkinIndex>,
 								}
-												Use EntityHashMap<Entity, T> for render world entity storage for better performance (#9903)

# Objective

- Improve rendering performance, particularly by avoiding the large
system commands costs of using the ECS in the way that the render world
does.

## Solution

- Define `EntityHasher` that calculates a hash from the
`Entity.to_bits()` by `i | (i.wrapping_mul(0x517cc1b727220a95) << 32)`.
`0x517cc1b727220a95` is something like `u64::MAX / N` for N that gives a
value close to π and that works well for hashing. Thanks for @SkiFire13
for the suggestion and to @nicopap for alternative suggestions and
discussion. This approach comes from `rustc-hash` (a.k.a. `FxHasher`)
with some tweaks for the case of hashing an `Entity`. `FxHasher` and
`SeaHasher` were also tested but were significantly slower.
- Define `EntityHashMap` type that uses the `EntityHashser`
- Use `EntityHashMap<Entity, T>` for render world entity storage,
including:
- `RenderMaterialInstances` - contains the `AssetId<M>` of the material
associated with the entity. Also for 2D.
- `RenderMeshInstances` - contains mesh transforms, flags and properties
about mesh entities. Also for 2D.
- `SkinIndices` and `MorphIndices` - contains the skin and morph index
for an entity, respectively
  - `ExtractedSprites`
  - `ExtractedUiNodes`

## Benchmarks

All benchmarks have been conducted on an M1 Max connected to AC power.
The tests are run for 1500 frames. The 1000th frame is captured for
comparison to check for visual regressions. There were none.

### 2D Meshes

`bevymark --benchmark --waves 160 --per-wave 1000 --mode mesh2d`

#### `--ordered-z`

This test spawns the 2D meshes with z incrementing back to front, which
is the ideal arrangement allocation order as it matches the sorted
render order which means lookups have a high cache hit rate.

<img width="1112" alt="Screenshot 2023-09-27 at 07 50 45"
src="https://github.com/bevyengine/bevy/assets/302146/e140bc98-7091-4a3b-8ae1-ab75d16d2ccb">

-39.1% median frame time.

#### Random

This test spawns the 2D meshes with random z. This not only makes the
batching and transparent 2D pass lookups get a lot of cache misses, it
also currently means that the meshes are almost certain to not be
batchable.

<img width="1108" alt="Screenshot 2023-09-27 at 07 51 28"
src="https://github.com/bevyengine/bevy/assets/302146/29c2e813-645a-43ce-982a-55df4bf7d8c4">

-7.2% median frame time.

### 3D Meshes

`many_cubes --benchmark`

<img width="1112" alt="Screenshot 2023-09-27 at 07 51 57"
src="https://github.com/bevyengine/bevy/assets/302146/1a729673-3254-4e2a-9072-55e27c69f0fc">

-7.7% median frame time.

### Sprites

**NOTE: On `main` sprites are using `SparseSet<Entity, T>`!**

`bevymark --benchmark --waves 160 --per-wave 1000 --mode sprite`

#### `--ordered-z`

This test spawns the sprites with z incrementing back to front, which is
the ideal arrangement allocation order as it matches the sorted render
order which means lookups have a high cache hit rate.

<img width="1116" alt="Screenshot 2023-09-27 at 07 52 31"
src="https://github.com/bevyengine/bevy/assets/302146/bc8eab90-e375-4d31-b5cd-f55f6f59ab67">

+13.0% median frame time.

#### Random

This test spawns the sprites with random z. This makes the batching and
transparent 2D pass lookups get a lot of cache misses.

<img width="1109" alt="Screenshot 2023-09-27 at 07 53 01"
src="https://github.com/bevyengine/bevy/assets/302146/22073f5d-99a7-49b0-9584-d3ac3eac3033">

+0.6% median frame time.

### UI

**NOTE: On `main` UI is using `SparseSet<Entity, T>`!**

`many_buttons`

<img width="1111" alt="Screenshot 2023-09-27 at 07 53 26"
src="https://github.com/bevyengine/bevy/assets/302146/66afd56d-cbe4-49e7-8b64-2f28f6043d85">

+15.1% median frame time.

## Alternatives

- Cart originally suggested trying out `SparseSet<Entity, T>` and indeed
that is slightly faster under ideal conditions. However,
`PassHashMap<Entity, T>` has better worst case performance when data is
randomly distributed, rather than in sorted render order, and does not
have the worst case memory usage that `SparseSet`'s dense `Vec<usize>`
that maps from the `Entity` index to sparse index into `Vec<T>`. This
dense `Vec` has to be as large as the largest Entity index used with the
`SparseSet`.
- I also tested `PassHashMap<u32, T>`, intending to use `Entity.index()`
as the key, but this proved to sometimes be slower and mostly no
different.
- The only outstanding approach that has not been implemented and tested
is to _not_ clear the render world of its entities each frame. That has
its own problems, though they could perhaps be solved.
- Performance-wise, if the entities and their component data were not
cleared, then they would incur table moves on spawn, and should not
thereafter, rather just their component data would be overwritten.
Ideally we would have a neat way of either updating data in-place via
`&mut T` queries, or inserting components if not present. This would
likely be quite cumbersome to have to remember to do everywhere, but
perhaps it only needs to be done in the more performance-sensitive
systems.
- The main problem to solve however is that we want to both maintain a
mapping between main world entities and render world entities, be able
to run the render app and world in parallel with the main app and world
for pipelined rendering, and at the same time be able to spawn entities
in the render world in such a way that those Entity ids do not collide
with those spawned in the main world. This is potentially quite
solvable, but could well be a lot of ECS work to do it in a way that
makes sense.

---

## Changelog

- Changed: Component data for entities to be drawn are no longer stored
on entities in the render world. Instead, data is stored in a
`EntityHashMap<Entity, T>` in various resources. This brings significant
performance benefits due to the way the render app clears entities every
frame. Resources of most interest are `RenderMeshInstances` and
`RenderMaterialInstances`, and their 2D counterparts.

## Migration Guide

Previously the render app extracted mesh entities and their component
data from the main world and stored them as entities and components in
the render world. Now they are extracted into essentially
`EntityHashMap<Entity, T>` where `T` are structs containing an
appropriate group of data. This means that while extract set systems
will continue to run extract queries against the main world they will
store their data in hash maps. Also, systems in later sets will either
need to look up entities in the available resources such as
`RenderMeshInstances`, or maintain their own `EntityHashMap<Entity, T>`
for their own data.

Before:
```rust
fn queue_custom(
    material_meshes: Query<(Entity, &MeshTransforms, &Handle<Mesh>), With<InstanceMaterialData>>,
) {
    ...
    for (entity, mesh_transforms, mesh_handle) in &material_meshes {
        ...
    }
}
```

After:
```rust
fn queue_custom(
    render_mesh_instances: Res<RenderMeshInstances>,
    instance_entities: Query<Entity, With<InstanceMaterialData>>,
) {
    ...
    for entity in &instance_entities {
        let Some(mesh_instance) = render_mesh_instances.get(&entity) else { continue; };
        // The mesh handle in `AssetId<Mesh>` form, and the `MeshTransforms` can now
        // be found in `mesh_instance` which is a `RenderMeshInstance`
        ...
    }
}
```

---------

Co-authored-by: robtfm <50659922+robtfm@users.noreply.github.com>
											
										
										
											2023-09-27 08:28:28 +00:00
-												Implement motion vectors and TAA for skinned meshes and meshes with morph targets. (#13572)

This is a revamped equivalent to #9902, though it shares none of the
code. It handles all special cases that I've tested correctly.

The overall technique consists of double-buffering the joint matrix and
morph weights buffers, as most of the previous attempts to solve this
problem did. The process is generally straightforward. Note that, to
avoid regressing the ability of mesh extraction, skin extraction, and
morph target extraction to run in parallel, I had to add a new system to
rendering, `set_mesh_motion_vector_flags`. The comment there explains
the details; it generally runs very quickly.

I've tested this with modified versions of the `animated_fox`,
`morph_targets`, and `many_foxes` examples that add TAA, and the patch
works. To avoid bloating those examples, I didn't add switches for TAA
to them.

Addresses points (1) and (2) of #8423.

## Changelog

### Fixed

* Motion vectors, and therefore TAA, are now supported for meshes with
skins and/or morph targets.
											
										
										
											2024-05-31 17:02:28 +00:00
+								/// The GPU buffers containing joint matrices for all skinned meshes.
 								///
 								/// This is double-buffered: we store the joint matrices of each mesh for the
 								/// previous frame in addition to those of each mesh for the current frame. This
 								/// is for motion vector calculation. Every frame, we swap buffers and overwrite
 								/// the joint matrix buffer from two frames ago with the data for the current
 								/// frame.
 								///
 								/// Notes on implementation: see comment on top of the `extract_skins` system.
-												Move skin code to a separate module (#9899)

# Objective

mesh.rs is infamously large. We could split off unrelated code.

## Solution

Morph targets are very similar to skinning and have their own module. We
move skinned meshes to an independent module like morph targets and give
the systems similar names.

### Open questions

Should the skinning systems and structs stay public?

---

## Migration Guide

Renamed skinning systems, resources and components:

- extract_skinned_meshes -> extract_skins
- prepare_skinned_meshes -> prepare_skins
- SkinnedMeshUniform -> SkinUniform
- SkinnedMeshJoints -> SkinIndex

---------

Co-authored-by: François <mockersf@gmail.com>
Co-authored-by: vero <email@atlasdostal.com>
											
										
										
											2023-09-25 18:40:22 +00:00
+								#[derive(Resource)]
-												Implement motion vectors and TAA for skinned meshes and meshes with morph targets. (#13572)

This is a revamped equivalent to #9902, though it shares none of the
code. It handles all special cases that I've tested correctly.

The overall technique consists of double-buffering the joint matrix and
morph weights buffers, as most of the previous attempts to solve this
problem did. The process is generally straightforward. Note that, to
avoid regressing the ability of mesh extraction, skin extraction, and
morph target extraction to run in parallel, I had to add a new system to
rendering, `set_mesh_motion_vector_flags`. The comment there explains
the details; it generally runs very quickly.

I've tested this with modified versions of the `animated_fox`,
`morph_targets`, and `many_foxes` examples that add TAA, and the patch
works. To avoid bloating those examples, I didn't add switches for TAA
to them.

Addresses points (1) and (2) of #8423.

## Changelog

### Fixed

* Motion vectors, and therefore TAA, are now supported for meshes with
skins and/or morph targets.
											
										
										
											2024-05-31 17:02:28 +00:00
+								pub struct SkinUniforms {
 								    /// Stores all the joint matrices for skinned meshes in the current frame.
 								    pub current_buffer: RawBufferVec<Mat4>,
 								    /// Stores all the joint matrices for skinned meshes in the previous frame.
 								    pub prev_buffer: RawBufferVec<Mat4>,
-												Move skin code to a separate module (#9899)

# Objective

mesh.rs is infamously large. We could split off unrelated code.

## Solution

Morph targets are very similar to skinning and have their own module. We
move skinned meshes to an independent module like morph targets and give
the systems similar names.

### Open questions

Should the skinning systems and structs stay public?

---

## Migration Guide

Renamed skinning systems, resources and components:

- extract_skinned_meshes -> extract_skins
- prepare_skinned_meshes -> prepare_skins
- SkinnedMeshUniform -> SkinUniform
- SkinnedMeshJoints -> SkinIndex

---------

Co-authored-by: François <mockersf@gmail.com>
Co-authored-by: vero <email@atlasdostal.com>
											
										
										
											2023-09-25 18:40:22 +00:00
+								}
-												Use EntityHashMap<Entity, T> for render world entity storage for better performance (#9903)

# Objective

- Improve rendering performance, particularly by avoiding the large
system commands costs of using the ECS in the way that the render world
does.

## Solution

- Define `EntityHasher` that calculates a hash from the
`Entity.to_bits()` by `i | (i.wrapping_mul(0x517cc1b727220a95) << 32)`.
`0x517cc1b727220a95` is something like `u64::MAX / N` for N that gives a
value close to π and that works well for hashing. Thanks for @SkiFire13
for the suggestion and to @nicopap for alternative suggestions and
discussion. This approach comes from `rustc-hash` (a.k.a. `FxHasher`)
with some tweaks for the case of hashing an `Entity`. `FxHasher` and
`SeaHasher` were also tested but were significantly slower.
- Define `EntityHashMap` type that uses the `EntityHashser`
- Use `EntityHashMap<Entity, T>` for render world entity storage,
including:
- `RenderMaterialInstances` - contains the `AssetId<M>` of the material
associated with the entity. Also for 2D.
- `RenderMeshInstances` - contains mesh transforms, flags and properties
about mesh entities. Also for 2D.
- `SkinIndices` and `MorphIndices` - contains the skin and morph index
for an entity, respectively
  - `ExtractedSprites`
  - `ExtractedUiNodes`

## Benchmarks

All benchmarks have been conducted on an M1 Max connected to AC power.
The tests are run for 1500 frames. The 1000th frame is captured for
comparison to check for visual regressions. There were none.

### 2D Meshes

`bevymark --benchmark --waves 160 --per-wave 1000 --mode mesh2d`

#### `--ordered-z`

This test spawns the 2D meshes with z incrementing back to front, which
is the ideal arrangement allocation order as it matches the sorted
render order which means lookups have a high cache hit rate.

<img width="1112" alt="Screenshot 2023-09-27 at 07 50 45"
src="https://github.com/bevyengine/bevy/assets/302146/e140bc98-7091-4a3b-8ae1-ab75d16d2ccb">

-39.1% median frame time.

#### Random

This test spawns the 2D meshes with random z. This not only makes the
batching and transparent 2D pass lookups get a lot of cache misses, it
also currently means that the meshes are almost certain to not be
batchable.

<img width="1108" alt="Screenshot 2023-09-27 at 07 51 28"
src="https://github.com/bevyengine/bevy/assets/302146/29c2e813-645a-43ce-982a-55df4bf7d8c4">

-7.2% median frame time.

### 3D Meshes

`many_cubes --benchmark`

<img width="1112" alt="Screenshot 2023-09-27 at 07 51 57"
src="https://github.com/bevyengine/bevy/assets/302146/1a729673-3254-4e2a-9072-55e27c69f0fc">

-7.7% median frame time.

### Sprites

**NOTE: On `main` sprites are using `SparseSet<Entity, T>`!**

`bevymark --benchmark --waves 160 --per-wave 1000 --mode sprite`

#### `--ordered-z`

This test spawns the sprites with z incrementing back to front, which is
the ideal arrangement allocation order as it matches the sorted render
order which means lookups have a high cache hit rate.

<img width="1116" alt="Screenshot 2023-09-27 at 07 52 31"
src="https://github.com/bevyengine/bevy/assets/302146/bc8eab90-e375-4d31-b5cd-f55f6f59ab67">

+13.0% median frame time.

#### Random

This test spawns the sprites with random z. This makes the batching and
transparent 2D pass lookups get a lot of cache misses.

<img width="1109" alt="Screenshot 2023-09-27 at 07 53 01"
src="https://github.com/bevyengine/bevy/assets/302146/22073f5d-99a7-49b0-9584-d3ac3eac3033">

+0.6% median frame time.

### UI

**NOTE: On `main` UI is using `SparseSet<Entity, T>`!**

`many_buttons`

<img width="1111" alt="Screenshot 2023-09-27 at 07 53 26"
src="https://github.com/bevyengine/bevy/assets/302146/66afd56d-cbe4-49e7-8b64-2f28f6043d85">

+15.1% median frame time.

## Alternatives

- Cart originally suggested trying out `SparseSet<Entity, T>` and indeed
that is slightly faster under ideal conditions. However,
`PassHashMap<Entity, T>` has better worst case performance when data is
randomly distributed, rather than in sorted render order, and does not
have the worst case memory usage that `SparseSet`'s dense `Vec<usize>`
that maps from the `Entity` index to sparse index into `Vec<T>`. This
dense `Vec` has to be as large as the largest Entity index used with the
`SparseSet`.
- I also tested `PassHashMap<u32, T>`, intending to use `Entity.index()`
as the key, but this proved to sometimes be slower and mostly no
different.
- The only outstanding approach that has not been implemented and tested
is to _not_ clear the render world of its entities each frame. That has
its own problems, though they could perhaps be solved.
- Performance-wise, if the entities and their component data were not
cleared, then they would incur table moves on spawn, and should not
thereafter, rather just their component data would be overwritten.
Ideally we would have a neat way of either updating data in-place via
`&mut T` queries, or inserting components if not present. This would
likely be quite cumbersome to have to remember to do everywhere, but
perhaps it only needs to be done in the more performance-sensitive
systems.
- The main problem to solve however is that we want to both maintain a
mapping between main world entities and render world entities, be able
to run the render app and world in parallel with the main app and world
for pipelined rendering, and at the same time be able to spawn entities
in the render world in such a way that those Entity ids do not collide
with those spawned in the main world. This is potentially quite
solvable, but could well be a lot of ECS work to do it in a way that
makes sense.

---

## Changelog

- Changed: Component data for entities to be drawn are no longer stored
on entities in the render world. Instead, data is stored in a
`EntityHashMap<Entity, T>` in various resources. This brings significant
performance benefits due to the way the render app clears entities every
frame. Resources of most interest are `RenderMeshInstances` and
`RenderMaterialInstances`, and their 2D counterparts.

## Migration Guide

Previously the render app extracted mesh entities and their component
data from the main world and stored them as entities and components in
the render world. Now they are extracted into essentially
`EntityHashMap<Entity, T>` where `T` are structs containing an
appropriate group of data. This means that while extract set systems
will continue to run extract queries against the main world they will
store their data in hash maps. Also, systems in later sets will either
need to look up entities in the available resources such as
`RenderMeshInstances`, or maintain their own `EntityHashMap<Entity, T>`
for their own data.

Before:
```rust
fn queue_custom(
    material_meshes: Query<(Entity, &MeshTransforms, &Handle<Mesh>), With<InstanceMaterialData>>,
) {
    ...
    for (entity, mesh_transforms, mesh_handle) in &material_meshes {
        ...
    }
}
```

After:
```rust
fn queue_custom(
    render_mesh_instances: Res<RenderMeshInstances>,
    instance_entities: Query<Entity, With<InstanceMaterialData>>,
) {
    ...
    for entity in &instance_entities {
        let Some(mesh_instance) = render_mesh_instances.get(&entity) else { continue; };
        // The mesh handle in `AssetId<Mesh>` form, and the `MeshTransforms` can now
        // be found in `mesh_instance` which is a `RenderMeshInstance`
        ...
    }
}
```

---------

Co-authored-by: robtfm <50659922+robtfm@users.noreply.github.com>
											
										
										
											2023-09-27 08:28:28 +00:00
-												Implement motion vectors and TAA for skinned meshes and meshes with morph targets. (#13572)

This is a revamped equivalent to #9902, though it shares none of the
code. It handles all special cases that I've tested correctly.

The overall technique consists of double-buffering the joint matrix and
morph weights buffers, as most of the previous attempts to solve this
problem did. The process is generally straightforward. Note that, to
avoid regressing the ability of mesh extraction, skin extraction, and
morph target extraction to run in parallel, I had to add a new system to
rendering, `set_mesh_motion_vector_flags`. The comment there explains
the details; it generally runs very quickly.

I've tested this with modified versions of the `animated_fox`,
`morph_targets`, and `many_foxes` examples that add TAA, and the patch
works. To avoid bloating those examples, I didn't add switches for TAA
to them.

Addresses points (1) and (2) of #8423.

## Changelog

### Fixed

* Motion vectors, and therefore TAA, are now supported for meshes with
skins and/or morph targets.
											
										
										
											2024-05-31 17:02:28 +00:00
+								impl Default for SkinUniforms {
-												Move skin code to a separate module (#9899)

# Objective

mesh.rs is infamously large. We could split off unrelated code.

## Solution

Morph targets are very similar to skinning and have their own module. We
move skinned meshes to an independent module like morph targets and give
the systems similar names.

### Open questions

Should the skinning systems and structs stay public?

---

## Migration Guide

Renamed skinning systems, resources and components:

- extract_skinned_meshes -> extract_skins
- prepare_skinned_meshes -> prepare_skins
- SkinnedMeshUniform -> SkinUniform
- SkinnedMeshJoints -> SkinIndex

---------

Co-authored-by: François <mockersf@gmail.com>
Co-authored-by: vero <email@atlasdostal.com>
											
										
										
											2023-09-25 18:40:22 +00:00
+								    fn default() -> Self {
 								        Self {
-												Implement motion vectors and TAA for skinned meshes and meshes with morph targets. (#13572)

This is a revamped equivalent to #9902, though it shares none of the
code. It handles all special cases that I've tested correctly.

The overall technique consists of double-buffering the joint matrix and
morph weights buffers, as most of the previous attempts to solve this
problem did. The process is generally straightforward. Note that, to
avoid regressing the ability of mesh extraction, skin extraction, and
morph target extraction to run in parallel, I had to add a new system to
rendering, `set_mesh_motion_vector_flags`. The comment there explains
the details; it generally runs very quickly.

I've tested this with modified versions of the `animated_fox`,
`morph_targets`, and `many_foxes` examples that add TAA, and the patch
works. To avoid bloating those examples, I didn't add switches for TAA
to them.

Addresses points (1) and (2) of #8423.

## Changelog

### Fixed

* Motion vectors, and therefore TAA, are now supported for meshes with
skins and/or morph targets.
											
										
										
											2024-05-31 17:02:28 +00:00
+								            current_buffer: RawBufferVec::new(BufferUsages::UNIFORM),
 								            prev_buffer: RawBufferVec::new(BufferUsages::UNIFORM),
-												Move skin code to a separate module (#9899)

# Objective

mesh.rs is infamously large. We could split off unrelated code.

## Solution

Morph targets are very similar to skinning and have their own module. We
move skinned meshes to an independent module like morph targets and give
the systems similar names.

### Open questions

Should the skinning systems and structs stay public?

---

## Migration Guide

Renamed skinning systems, resources and components:

- extract_skinned_meshes -> extract_skins
- prepare_skinned_meshes -> prepare_skins
- SkinnedMeshUniform -> SkinUniform
- SkinnedMeshJoints -> SkinIndex

---------

Co-authored-by: François <mockersf@gmail.com>
Co-authored-by: vero <email@atlasdostal.com>
											
										
										
											2023-09-25 18:40:22 +00:00
+								        }
 								    }
 								}
 								pub fn prepare_skins(
 								    render_device: Res<RenderDevice>,
 								    render_queue: Res<RenderQueue>,
-												Implement motion vectors and TAA for skinned meshes and meshes with morph targets. (#13572)

This is a revamped equivalent to #9902, though it shares none of the
code. It handles all special cases that I've tested correctly.

The overall technique consists of double-buffering the joint matrix and
morph weights buffers, as most of the previous attempts to solve this
problem did. The process is generally straightforward. Note that, to
avoid regressing the ability of mesh extraction, skin extraction, and
morph target extraction to run in parallel, I had to add a new system to
rendering, `set_mesh_motion_vector_flags`. The comment there explains
the details; it generally runs very quickly.

I've tested this with modified versions of the `animated_fox`,
`morph_targets`, and `many_foxes` examples that add TAA, and the patch
works. To avoid bloating those examples, I didn't add switches for TAA
to them.

Addresses points (1) and (2) of #8423.

## Changelog

### Fixed

* Motion vectors, and therefore TAA, are now supported for meshes with
skins and/or morph targets.
											
										
										
											2024-05-31 17:02:28 +00:00
+								    mut uniform: ResMut<SkinUniforms>,
-												Move skin code to a separate module (#9899)

# Objective

mesh.rs is infamously large. We could split off unrelated code.

## Solution

Morph targets are very similar to skinning and have their own module. We
move skinned meshes to an independent module like morph targets and give
the systems similar names.

### Open questions

Should the skinning systems and structs stay public?

---

## Migration Guide

Renamed skinning systems, resources and components:

- extract_skinned_meshes -> extract_skins
- prepare_skinned_meshes -> prepare_skins
- SkinnedMeshUniform -> SkinUniform
- SkinnedMeshJoints -> SkinIndex

---------

Co-authored-by: François <mockersf@gmail.com>
Co-authored-by: vero <email@atlasdostal.com>
											
										
										
											2023-09-25 18:40:22 +00:00
+								) {
-												Implement motion vectors and TAA for skinned meshes and meshes with morph targets. (#13572)

This is a revamped equivalent to #9902, though it shares none of the
code. It handles all special cases that I've tested correctly.

The overall technique consists of double-buffering the joint matrix and
morph weights buffers, as most of the previous attempts to solve this
problem did. The process is generally straightforward. Note that, to
avoid regressing the ability of mesh extraction, skin extraction, and
morph target extraction to run in parallel, I had to add a new system to
rendering, `set_mesh_motion_vector_flags`. The comment there explains
the details; it generally runs very quickly.

I've tested this with modified versions of the `animated_fox`,
`morph_targets`, and `many_foxes` examples that add TAA, and the patch
works. To avoid bloating those examples, I didn't add switches for TAA
to them.

Addresses points (1) and (2) of #8423.

## Changelog

### Fixed

* Motion vectors, and therefore TAA, are now supported for meshes with
skins and/or morph targets.
											
										
										
											2024-05-31 17:02:28 +00:00
+								    if uniform.current_buffer.is_empty() {
-												Move skin code to a separate module (#9899)

# Objective

mesh.rs is infamously large. We could split off unrelated code.

## Solution

Morph targets are very similar to skinning and have their own module. We
move skinned meshes to an independent module like morph targets and give
the systems similar names.

### Open questions

Should the skinning systems and structs stay public?

---

## Migration Guide

Renamed skinning systems, resources and components:

- extract_skinned_meshes -> extract_skins
- prepare_skinned_meshes -> prepare_skins
- SkinnedMeshUniform -> SkinUniform
- SkinnedMeshJoints -> SkinIndex

---------

Co-authored-by: François <mockersf@gmail.com>
Co-authored-by: vero <email@atlasdostal.com>
											
										
										
											2023-09-25 18:40:22 +00:00
+								        return;
 								    }
-												Implement motion vectors and TAA for skinned meshes and meshes with morph targets. (#13572)

This is a revamped equivalent to #9902, though it shares none of the
code. It handles all special cases that I've tested correctly.

The overall technique consists of double-buffering the joint matrix and
morph weights buffers, as most of the previous attempts to solve this
problem did. The process is generally straightforward. Note that, to
avoid regressing the ability of mesh extraction, skin extraction, and
morph target extraction to run in parallel, I had to add a new system to
rendering, `set_mesh_motion_vector_flags`. The comment there explains
the details; it generally runs very quickly.

I've tested this with modified versions of the `animated_fox`,
`morph_targets`, and `many_foxes` examples that add TAA, and the patch
works. To avoid bloating those examples, I didn't add switches for TAA
to them.

Addresses points (1) and (2) of #8423.

## Changelog

### Fixed

* Motion vectors, and therefore TAA, are now supported for meshes with
skins and/or morph targets.
											
										
										
											2024-05-31 17:02:28 +00:00
+								    let len = uniform.current_buffer.len();
 								    uniform.current_buffer.reserve(len, &render_device);
 								    uniform
 								        .current_buffer
 								        .write_buffer(&render_device, &render_queue);
 								    // We don't need to write `uniform.prev_buffer` because we already wrote it
 								    // last frame, and the data should still be on the GPU.
-												Move skin code to a separate module (#9899)

# Objective

mesh.rs is infamously large. We could split off unrelated code.

## Solution

Morph targets are very similar to skinning and have their own module. We
move skinned meshes to an independent module like morph targets and give
the systems similar names.

### Open questions

Should the skinning systems and structs stay public?

---

## Migration Guide

Renamed skinning systems, resources and components:

- extract_skinned_meshes -> extract_skins
- prepare_skinned_meshes -> prepare_skins
- SkinnedMeshUniform -> SkinUniform
- SkinnedMeshJoints -> SkinIndex

---------

Co-authored-by: François <mockersf@gmail.com>
Co-authored-by: vero <email@atlasdostal.com>
											
										
										
											2023-09-25 18:40:22 +00:00
+								}
 								// Notes on implementation:
 								// We define the uniform binding as an array<mat4x4<f32>, N> in the shader,
 								// where N is the maximum number of Mat4s we can fit in the uniform binding,
 								// which may be as little as 16kB or 64kB. But, we may not need all N.
 								// We may only need, for example, 10.
 								//
 								// If we used uniform buffers ‘normally’ then we would have to write a full
 								// binding of data for each dynamic offset binding, which is wasteful, makes
 								// the buffer much larger than it needs to be, and uses more memory bandwidth
 								// to transfer the data, which then costs frame time So @superdump came up
 								// with this design: just bind data at the specified offset and interpret
 								// the data at that offset as an array<T, N> regardless of what is there.
 								//
 								// So instead of writing N Mat4s when you only need 10, you write 10, and
 								// then pad up to the next dynamic offset alignment. Then write the next.
 								// And for the last dynamic offset binding, make sure there is a full binding
 								// of data after it so that the buffer is of size
 								// `last dynamic offset` + `array<mat4x4<f32>>`.
 								//
 								// Then when binding the first dynamic offset, the first 10 entries in the array
 								// are what you expect, but if you read the 11th you’re reading ‘invalid’ data
 								// which could be padding or could be from the next binding.
 								//
 								// In this way, we can pack ‘variable sized arrays’ into uniform buffer bindings
 								// which normally only support fixed size arrays. You just have to make sure
 								// in the shader that you only read the values that are valid for that binding.
 								pub fn extract_skins(
-												Implement motion vectors and TAA for skinned meshes and meshes with morph targets. (#13572)

This is a revamped equivalent to #9902, though it shares none of the
code. It handles all special cases that I've tested correctly.

The overall technique consists of double-buffering the joint matrix and
morph weights buffers, as most of the previous attempts to solve this
problem did. The process is generally straightforward. Note that, to
avoid regressing the ability of mesh extraction, skin extraction, and
morph target extraction to run in parallel, I had to add a new system to
rendering, `set_mesh_motion_vector_flags`. The comment there explains
the details; it generally runs very quickly.

I've tested this with modified versions of the `animated_fox`,
`morph_targets`, and `many_foxes` examples that add TAA, and the patch
works. To avoid bloating those examples, I didn't add switches for TAA
to them.

Addresses points (1) and (2) of #8423.

## Changelog

### Fixed

* Motion vectors, and therefore TAA, are now supported for meshes with
skins and/or morph targets.
											
										
										
											2024-05-31 17:02:28 +00:00
+								    skin_indices: ResMut<SkinIndices>,
 								    uniform: ResMut<SkinUniforms>,
-												Move skin code to a separate module (#9899)

# Objective

mesh.rs is infamously large. We could split off unrelated code.

## Solution

Morph targets are very similar to skinning and have their own module. We
move skinned meshes to an independent module like morph targets and give
the systems similar names.

### Open questions

Should the skinning systems and structs stay public?

---

## Migration Guide

Renamed skinning systems, resources and components:

- extract_skinned_meshes -> extract_skins
- prepare_skinned_meshes -> prepare_skins
- SkinnedMeshUniform -> SkinUniform
- SkinnedMeshJoints -> SkinIndex

---------

Co-authored-by: François <mockersf@gmail.com>
Co-authored-by: vero <email@atlasdostal.com>
											
										
										
											2023-09-25 18:40:22 +00:00
+								    query: Extract<Query<(Entity, &ViewVisibility, &SkinnedMesh)>>,
 								    inverse_bindposes: Extract<Res<Assets<SkinnedMeshInverseBindposes>>>,
 								    joints: Extract<Query<&GlobalTransform>>,
 								) {
-												Implement motion vectors and TAA for skinned meshes and meshes with morph targets. (#13572)

This is a revamped equivalent to #9902, though it shares none of the
code. It handles all special cases that I've tested correctly.

The overall technique consists of double-buffering the joint matrix and
morph weights buffers, as most of the previous attempts to solve this
problem did. The process is generally straightforward. Note that, to
avoid regressing the ability of mesh extraction, skin extraction, and
morph target extraction to run in parallel, I had to add a new system to
rendering, `set_mesh_motion_vector_flags`. The comment there explains
the details; it generally runs very quickly.

I've tested this with modified versions of the `animated_fox`,
`morph_targets`, and `many_foxes` examples that add TAA, and the patch
works. To avoid bloating those examples, I didn't add switches for TAA
to them.

Addresses points (1) and (2) of #8423.

## Changelog

### Fixed

* Motion vectors, and therefore TAA, are now supported for meshes with
skins and/or morph targets.
											
										
										
											2024-05-31 17:02:28 +00:00
+								    // Borrow check workaround.
 								    let (skin_indices, uniform) = (skin_indices.into_inner(), uniform.into_inner());
 								    // Swap buffers. We need to keep the previous frame's buffer around for the
 								    // purposes of motion vector computation.
 								    mem::swap(&mut skin_indices.current, &mut skin_indices.prev);
 								    mem::swap(&mut uniform.current_buffer, &mut uniform.prev_buffer);
 								    skin_indices.current.clear();
 								    uniform.current_buffer.clear();
-												Move skin code to a separate module (#9899)

# Objective

mesh.rs is infamously large. We could split off unrelated code.

## Solution

Morph targets are very similar to skinning and have their own module. We
move skinned meshes to an independent module like morph targets and give
the systems similar names.

### Open questions

Should the skinning systems and structs stay public?

---

## Migration Guide

Renamed skinning systems, resources and components:

- extract_skinned_meshes -> extract_skins
- prepare_skinned_meshes -> prepare_skins
- SkinnedMeshUniform -> SkinUniform
- SkinnedMeshJoints -> SkinIndex

---------

Co-authored-by: François <mockersf@gmail.com>
Co-authored-by: vero <email@atlasdostal.com>
											
										
										
											2023-09-25 18:40:22 +00:00
+								    let mut last_start = 0;
 								    // PERF: This can be expensive, can we move this to prepare?
 								    for (entity, view_visibility, skin) in &query {
 								        if !view_visibility.get() {
 								            continue;
 								        }
-												Implement motion vectors and TAA for skinned meshes and meshes with morph targets. (#13572)

This is a revamped equivalent to #9902, though it shares none of the
code. It handles all special cases that I've tested correctly.

The overall technique consists of double-buffering the joint matrix and
morph weights buffers, as most of the previous attempts to solve this
problem did. The process is generally straightforward. Note that, to
avoid regressing the ability of mesh extraction, skin extraction, and
morph target extraction to run in parallel, I had to add a new system to
rendering, `set_mesh_motion_vector_flags`. The comment there explains
the details; it generally runs very quickly.

I've tested this with modified versions of the `animated_fox`,
`morph_targets`, and `many_foxes` examples that add TAA, and the patch
works. To avoid bloating those examples, I didn't add switches for TAA
to them.

Addresses points (1) and (2) of #8423.

## Changelog

### Fixed

* Motion vectors, and therefore TAA, are now supported for meshes with
skins and/or morph targets.
											
										
										
											2024-05-31 17:02:28 +00:00
+								        let buffer = &mut uniform.current_buffer;
-												Move skin code to a separate module (#9899)

# Objective

mesh.rs is infamously large. We could split off unrelated code.

## Solution

Morph targets are very similar to skinning and have their own module. We
move skinned meshes to an independent module like morph targets and give
the systems similar names.

### Open questions

Should the skinning systems and structs stay public?

---

## Migration Guide

Renamed skinning systems, resources and components:

- extract_skinned_meshes -> extract_skins
- prepare_skinned_meshes -> prepare_skins
- SkinnedMeshUniform -> SkinUniform
- SkinnedMeshJoints -> SkinIndex

---------

Co-authored-by: François <mockersf@gmail.com>
Co-authored-by: vero <email@atlasdostal.com>
											
										
										
											2023-09-25 18:40:22 +00:00
+								        let Some(inverse_bindposes) = inverse_bindposes.get(&skin.inverse_bindposes) else {
 								            continue;
 								        };
 								        let start = buffer.len();
 								        let target = start + skin.joints.len().min(MAX_JOINTS);
 								        buffer.extend(
 								            joints
 								                .iter_many(&skin.joints)
 								                .zip(inverse_bindposes.iter())
 								                .take(MAX_JOINTS)
 								                .map(|(joint, bindpose)| joint.affine() * *bindpose),
 								        );
 								        // iter_many will skip any failed fetches. This will cause it to assign the wrong bones,
 								        // so just bail by truncating to the start.
 								        if buffer.len() != target {
 								            buffer.truncate(start);
 								            continue;
 								        }
 								        last_start = last_start.max(start);
 								        // Pad to 256 byte alignment
 								        while buffer.len() % 4 != 0 {
 								            buffer.push(Mat4::ZERO);
 								        }
-												Use EntityHashMap<Entity, T> for render world entity storage for better performance (#9903)

# Objective

- Improve rendering performance, particularly by avoiding the large
system commands costs of using the ECS in the way that the render world
does.

## Solution

- Define `EntityHasher` that calculates a hash from the
`Entity.to_bits()` by `i | (i.wrapping_mul(0x517cc1b727220a95) << 32)`.
`0x517cc1b727220a95` is something like `u64::MAX / N` for N that gives a
value close to π and that works well for hashing. Thanks for @SkiFire13
for the suggestion and to @nicopap for alternative suggestions and
discussion. This approach comes from `rustc-hash` (a.k.a. `FxHasher`)
with some tweaks for the case of hashing an `Entity`. `FxHasher` and
`SeaHasher` were also tested but were significantly slower.
- Define `EntityHashMap` type that uses the `EntityHashser`
- Use `EntityHashMap<Entity, T>` for render world entity storage,
including:
- `RenderMaterialInstances` - contains the `AssetId<M>` of the material
associated with the entity. Also for 2D.
- `RenderMeshInstances` - contains mesh transforms, flags and properties
about mesh entities. Also for 2D.
- `SkinIndices` and `MorphIndices` - contains the skin and morph index
for an entity, respectively
  - `ExtractedSprites`
  - `ExtractedUiNodes`

## Benchmarks

All benchmarks have been conducted on an M1 Max connected to AC power.
The tests are run for 1500 frames. The 1000th frame is captured for
comparison to check for visual regressions. There were none.

### 2D Meshes

`bevymark --benchmark --waves 160 --per-wave 1000 --mode mesh2d`

#### `--ordered-z`

This test spawns the 2D meshes with z incrementing back to front, which
is the ideal arrangement allocation order as it matches the sorted
render order which means lookups have a high cache hit rate.

<img width="1112" alt="Screenshot 2023-09-27 at 07 50 45"
src="https://github.com/bevyengine/bevy/assets/302146/e140bc98-7091-4a3b-8ae1-ab75d16d2ccb">

-39.1% median frame time.

#### Random

This test spawns the 2D meshes with random z. This not only makes the
batching and transparent 2D pass lookups get a lot of cache misses, it
also currently means that the meshes are almost certain to not be
batchable.

<img width="1108" alt="Screenshot 2023-09-27 at 07 51 28"
src="https://github.com/bevyengine/bevy/assets/302146/29c2e813-645a-43ce-982a-55df4bf7d8c4">

-7.2% median frame time.

### 3D Meshes

`many_cubes --benchmark`

<img width="1112" alt="Screenshot 2023-09-27 at 07 51 57"
src="https://github.com/bevyengine/bevy/assets/302146/1a729673-3254-4e2a-9072-55e27c69f0fc">

-7.7% median frame time.

### Sprites

**NOTE: On `main` sprites are using `SparseSet<Entity, T>`!**

`bevymark --benchmark --waves 160 --per-wave 1000 --mode sprite`

#### `--ordered-z`

This test spawns the sprites with z incrementing back to front, which is
the ideal arrangement allocation order as it matches the sorted render
order which means lookups have a high cache hit rate.

<img width="1116" alt="Screenshot 2023-09-27 at 07 52 31"
src="https://github.com/bevyengine/bevy/assets/302146/bc8eab90-e375-4d31-b5cd-f55f6f59ab67">

+13.0% median frame time.

#### Random

This test spawns the sprites with random z. This makes the batching and
transparent 2D pass lookups get a lot of cache misses.

<img width="1109" alt="Screenshot 2023-09-27 at 07 53 01"
src="https://github.com/bevyengine/bevy/assets/302146/22073f5d-99a7-49b0-9584-d3ac3eac3033">

+0.6% median frame time.

### UI

**NOTE: On `main` UI is using `SparseSet<Entity, T>`!**

`many_buttons`

<img width="1111" alt="Screenshot 2023-09-27 at 07 53 26"
src="https://github.com/bevyengine/bevy/assets/302146/66afd56d-cbe4-49e7-8b64-2f28f6043d85">

+15.1% median frame time.

## Alternatives

- Cart originally suggested trying out `SparseSet<Entity, T>` and indeed
that is slightly faster under ideal conditions. However,
`PassHashMap<Entity, T>` has better worst case performance when data is
randomly distributed, rather than in sorted render order, and does not
have the worst case memory usage that `SparseSet`'s dense `Vec<usize>`
that maps from the `Entity` index to sparse index into `Vec<T>`. This
dense `Vec` has to be as large as the largest Entity index used with the
`SparseSet`.
- I also tested `PassHashMap<u32, T>`, intending to use `Entity.index()`
as the key, but this proved to sometimes be slower and mostly no
different.
- The only outstanding approach that has not been implemented and tested
is to _not_ clear the render world of its entities each frame. That has
its own problems, though they could perhaps be solved.
- Performance-wise, if the entities and their component data were not
cleared, then they would incur table moves on spawn, and should not
thereafter, rather just their component data would be overwritten.
Ideally we would have a neat way of either updating data in-place via
`&mut T` queries, or inserting components if not present. This would
likely be quite cumbersome to have to remember to do everywhere, but
perhaps it only needs to be done in the more performance-sensitive
systems.
- The main problem to solve however is that we want to both maintain a
mapping between main world entities and render world entities, be able
to run the render app and world in parallel with the main app and world
for pipelined rendering, and at the same time be able to spawn entities
in the render world in such a way that those Entity ids do not collide
with those spawned in the main world. This is potentially quite
solvable, but could well be a lot of ECS work to do it in a way that
makes sense.

---

## Changelog

- Changed: Component data for entities to be drawn are no longer stored
on entities in the render world. Instead, data is stored in a
`EntityHashMap<Entity, T>` in various resources. This brings significant
performance benefits due to the way the render app clears entities every
frame. Resources of most interest are `RenderMeshInstances` and
`RenderMaterialInstances`, and their 2D counterparts.

## Migration Guide

Previously the render app extracted mesh entities and their component
data from the main world and stored them as entities and components in
the render world. Now they are extracted into essentially
`EntityHashMap<Entity, T>` where `T` are structs containing an
appropriate group of data. This means that while extract set systems
will continue to run extract queries against the main world they will
store their data in hash maps. Also, systems in later sets will either
need to look up entities in the available resources such as
`RenderMeshInstances`, or maintain their own `EntityHashMap<Entity, T>`
for their own data.

Before:
```rust
fn queue_custom(
    material_meshes: Query<(Entity, &MeshTransforms, &Handle<Mesh>), With<InstanceMaterialData>>,
) {
    ...
    for (entity, mesh_transforms, mesh_handle) in &material_meshes {
        ...
    }
}
```

After:
```rust
fn queue_custom(
    render_mesh_instances: Res<RenderMeshInstances>,
    instance_entities: Query<Entity, With<InstanceMaterialData>>,
) {
    ...
    for entity in &instance_entities {
        let Some(mesh_instance) = render_mesh_instances.get(&entity) else { continue; };
        // The mesh handle in `AssetId<Mesh>` form, and the `MeshTransforms` can now
        // be found in `mesh_instance` which is a `RenderMeshInstance`
        ...
    }
}
```

---------

Co-authored-by: robtfm <50659922+robtfm@users.noreply.github.com>
											
										
										
											2023-09-27 08:28:28 +00:00
-												Implement motion vectors and TAA for skinned meshes and meshes with morph targets. (#13572)

This is a revamped equivalent to #9902, though it shares none of the
code. It handles all special cases that I've tested correctly.

The overall technique consists of double-buffering the joint matrix and
morph weights buffers, as most of the previous attempts to solve this
problem did. The process is generally straightforward. Note that, to
avoid regressing the ability of mesh extraction, skin extraction, and
morph target extraction to run in parallel, I had to add a new system to
rendering, `set_mesh_motion_vector_flags`. The comment there explains
the details; it generally runs very quickly.

I've tested this with modified versions of the `animated_fox`,
`morph_targets`, and `many_foxes` examples that add TAA, and the patch
works. To avoid bloating those examples, I didn't add switches for TAA
to them.

Addresses points (1) and (2) of #8423.

## Changelog

### Fixed

* Motion vectors, and therefore TAA, are now supported for meshes with
skins and/or morph targets.
											
										
										
											2024-05-31 17:02:28 +00:00
+								        skin_indices.current.insert(entity, SkinIndex::new(start));
-												Move skin code to a separate module (#9899)

# Objective

mesh.rs is infamously large. We could split off unrelated code.

## Solution

Morph targets are very similar to skinning and have their own module. We
move skinned meshes to an independent module like morph targets and give
the systems similar names.

### Open questions

Should the skinning systems and structs stay public?

---

## Migration Guide

Renamed skinning systems, resources and components:

- extract_skinned_meshes -> extract_skins
- prepare_skinned_meshes -> prepare_skins
- SkinnedMeshUniform -> SkinUniform
- SkinnedMeshJoints -> SkinIndex

---------

Co-authored-by: François <mockersf@gmail.com>
Co-authored-by: vero <email@atlasdostal.com>
											
										
										
											2023-09-25 18:40:22 +00:00
+								    }
 								    // Pad out the buffer to ensure that there's enough space for bindings
-												Implement motion vectors and TAA for skinned meshes and meshes with morph targets. (#13572)

This is a revamped equivalent to #9902, though it shares none of the
code. It handles all special cases that I've tested correctly.

The overall technique consists of double-buffering the joint matrix and
morph weights buffers, as most of the previous attempts to solve this
problem did. The process is generally straightforward. Note that, to
avoid regressing the ability of mesh extraction, skin extraction, and
morph target extraction to run in parallel, I had to add a new system to
rendering, `set_mesh_motion_vector_flags`. The comment there explains
the details; it generally runs very quickly.

I've tested this with modified versions of the `animated_fox`,
`morph_targets`, and `many_foxes` examples that add TAA, and the patch
works. To avoid bloating those examples, I didn't add switches for TAA
to them.

Addresses points (1) and (2) of #8423.

## Changelog

### Fixed

* Motion vectors, and therefore TAA, are now supported for meshes with
skins and/or morph targets.
											
										
										
											2024-05-31 17:02:28 +00:00
+								    while uniform.current_buffer.len() - last_start < MAX_JOINTS {
 								        uniform.current_buffer.push(Mat4::ZERO);
-												Move skin code to a separate module (#9899)

# Objective

mesh.rs is infamously large. We could split off unrelated code.

## Solution

Morph targets are very similar to skinning and have their own module. We
move skinned meshes to an independent module like morph targets and give
the systems similar names.

### Open questions

Should the skinning systems and structs stay public?

---

## Migration Guide

Renamed skinning systems, resources and components:

- extract_skinned_meshes -> extract_skins
- prepare_skinned_meshes -> prepare_skins
- SkinnedMeshUniform -> SkinUniform
- SkinnedMeshJoints -> SkinIndex

---------

Co-authored-by: François <mockersf@gmail.com>
Co-authored-by: vero <email@atlasdostal.com>
											
										
										
											2023-09-25 18:40:22 +00:00
+								    }
-												Use EntityHashMap<Entity, T> for render world entity storage for better performance (#9903)

# Objective

- Improve rendering performance, particularly by avoiding the large
system commands costs of using the ECS in the way that the render world
does.

## Solution

- Define `EntityHasher` that calculates a hash from the
`Entity.to_bits()` by `i | (i.wrapping_mul(0x517cc1b727220a95) << 32)`.
`0x517cc1b727220a95` is something like `u64::MAX / N` for N that gives a
value close to π and that works well for hashing. Thanks for @SkiFire13
for the suggestion and to @nicopap for alternative suggestions and
discussion. This approach comes from `rustc-hash` (a.k.a. `FxHasher`)
with some tweaks for the case of hashing an `Entity`. `FxHasher` and
`SeaHasher` were also tested but were significantly slower.
- Define `EntityHashMap` type that uses the `EntityHashser`
- Use `EntityHashMap<Entity, T>` for render world entity storage,
including:
- `RenderMaterialInstances` - contains the `AssetId<M>` of the material
associated with the entity. Also for 2D.
- `RenderMeshInstances` - contains mesh transforms, flags and properties
about mesh entities. Also for 2D.
- `SkinIndices` and `MorphIndices` - contains the skin and morph index
for an entity, respectively
  - `ExtractedSprites`
  - `ExtractedUiNodes`

## Benchmarks

All benchmarks have been conducted on an M1 Max connected to AC power.
The tests are run for 1500 frames. The 1000th frame is captured for
comparison to check for visual regressions. There were none.

### 2D Meshes

`bevymark --benchmark --waves 160 --per-wave 1000 --mode mesh2d`

#### `--ordered-z`

This test spawns the 2D meshes with z incrementing back to front, which
is the ideal arrangement allocation order as it matches the sorted
render order which means lookups have a high cache hit rate.

<img width="1112" alt="Screenshot 2023-09-27 at 07 50 45"
src="https://github.com/bevyengine/bevy/assets/302146/e140bc98-7091-4a3b-8ae1-ab75d16d2ccb">

-39.1% median frame time.

#### Random

This test spawns the 2D meshes with random z. This not only makes the
batching and transparent 2D pass lookups get a lot of cache misses, it
also currently means that the meshes are almost certain to not be
batchable.

<img width="1108" alt="Screenshot 2023-09-27 at 07 51 28"
src="https://github.com/bevyengine/bevy/assets/302146/29c2e813-645a-43ce-982a-55df4bf7d8c4">

-7.2% median frame time.

### 3D Meshes

`many_cubes --benchmark`

<img width="1112" alt="Screenshot 2023-09-27 at 07 51 57"
src="https://github.com/bevyengine/bevy/assets/302146/1a729673-3254-4e2a-9072-55e27c69f0fc">

-7.7% median frame time.

### Sprites

**NOTE: On `main` sprites are using `SparseSet<Entity, T>`!**

`bevymark --benchmark --waves 160 --per-wave 1000 --mode sprite`

#### `--ordered-z`

This test spawns the sprites with z incrementing back to front, which is
the ideal arrangement allocation order as it matches the sorted render
order which means lookups have a high cache hit rate.

<img width="1116" alt="Screenshot 2023-09-27 at 07 52 31"
src="https://github.com/bevyengine/bevy/assets/302146/bc8eab90-e375-4d31-b5cd-f55f6f59ab67">

+13.0% median frame time.

#### Random

This test spawns the sprites with random z. This makes the batching and
transparent 2D pass lookups get a lot of cache misses.

<img width="1109" alt="Screenshot 2023-09-27 at 07 53 01"
src="https://github.com/bevyengine/bevy/assets/302146/22073f5d-99a7-49b0-9584-d3ac3eac3033">

+0.6% median frame time.

### UI

**NOTE: On `main` UI is using `SparseSet<Entity, T>`!**

`many_buttons`

<img width="1111" alt="Screenshot 2023-09-27 at 07 53 26"
src="https://github.com/bevyengine/bevy/assets/302146/66afd56d-cbe4-49e7-8b64-2f28f6043d85">

+15.1% median frame time.

## Alternatives

- Cart originally suggested trying out `SparseSet<Entity, T>` and indeed
that is slightly faster under ideal conditions. However,
`PassHashMap<Entity, T>` has better worst case performance when data is
randomly distributed, rather than in sorted render order, and does not
have the worst case memory usage that `SparseSet`'s dense `Vec<usize>`
that maps from the `Entity` index to sparse index into `Vec<T>`. This
dense `Vec` has to be as large as the largest Entity index used with the
`SparseSet`.
- I also tested `PassHashMap<u32, T>`, intending to use `Entity.index()`
as the key, but this proved to sometimes be slower and mostly no
different.
- The only outstanding approach that has not been implemented and tested
is to _not_ clear the render world of its entities each frame. That has
its own problems, though they could perhaps be solved.
- Performance-wise, if the entities and their component data were not
cleared, then they would incur table moves on spawn, and should not
thereafter, rather just their component data would be overwritten.
Ideally we would have a neat way of either updating data in-place via
`&mut T` queries, or inserting components if not present. This would
likely be quite cumbersome to have to remember to do everywhere, but
perhaps it only needs to be done in the more performance-sensitive
systems.
- The main problem to solve however is that we want to both maintain a
mapping between main world entities and render world entities, be able
to run the render app and world in parallel with the main app and world
for pipelined rendering, and at the same time be able to spawn entities
in the render world in such a way that those Entity ids do not collide
with those spawned in the main world. This is potentially quite
solvable, but could well be a lot of ECS work to do it in a way that
makes sense.

---

## Changelog

- Changed: Component data for entities to be drawn are no longer stored
on entities in the render world. Instead, data is stored in a
`EntityHashMap<Entity, T>` in various resources. This brings significant
performance benefits due to the way the render app clears entities every
frame. Resources of most interest are `RenderMeshInstances` and
`RenderMaterialInstances`, and their 2D counterparts.

## Migration Guide

Previously the render app extracted mesh entities and their component
data from the main world and stored them as entities and components in
the render world. Now they are extracted into essentially
`EntityHashMap<Entity, T>` where `T` are structs containing an
appropriate group of data. This means that while extract set systems
will continue to run extract queries against the main world they will
store their data in hash maps. Also, systems in later sets will either
need to look up entities in the available resources such as
`RenderMeshInstances`, or maintain their own `EntityHashMap<Entity, T>`
for their own data.

Before:
```rust
fn queue_custom(
    material_meshes: Query<(Entity, &MeshTransforms, &Handle<Mesh>), With<InstanceMaterialData>>,
) {
    ...
    for (entity, mesh_transforms, mesh_handle) in &material_meshes {
        ...
    }
}
```

After:
```rust
fn queue_custom(
    render_mesh_instances: Res<RenderMeshInstances>,
    instance_entities: Query<Entity, With<InstanceMaterialData>>,
) {
    ...
    for entity in &instance_entities {
        let Some(mesh_instance) = render_mesh_instances.get(&entity) else { continue; };
        // The mesh handle in `AssetId<Mesh>` form, and the `MeshTransforms` can now
        // be found in `mesh_instance` which is a `RenderMeshInstance`
        ...
    }
}
```

---------

Co-authored-by: robtfm <50659922+robtfm@users.noreply.github.com>
											
										
										
											2023-09-27 08:28:28 +00:00
+								}
-												Move skin code to a separate module (#9899)

# Objective

mesh.rs is infamously large. We could split off unrelated code.

## Solution

Morph targets are very similar to skinning and have their own module. We
move skinned meshes to an independent module like morph targets and give
the systems similar names.

### Open questions

Should the skinning systems and structs stay public?

---

## Migration Guide

Renamed skinning systems, resources and components:

- extract_skinned_meshes -> extract_skins
- prepare_skinned_meshes -> prepare_skins
- SkinnedMeshUniform -> SkinUniform
- SkinnedMeshJoints -> SkinIndex

---------

Co-authored-by: François <mockersf@gmail.com>
Co-authored-by: vero <email@atlasdostal.com>
											
										
										
											2023-09-25 18:40:22 +00:00
-												Use EntityHashMap<Entity, T> for render world entity storage for better performance (#9903)

# Objective

- Improve rendering performance, particularly by avoiding the large
system commands costs of using the ECS in the way that the render world
does.

## Solution

- Define `EntityHasher` that calculates a hash from the
`Entity.to_bits()` by `i | (i.wrapping_mul(0x517cc1b727220a95) << 32)`.
`0x517cc1b727220a95` is something like `u64::MAX / N` for N that gives a
value close to π and that works well for hashing. Thanks for @SkiFire13
for the suggestion and to @nicopap for alternative suggestions and
discussion. This approach comes from `rustc-hash` (a.k.a. `FxHasher`)
with some tweaks for the case of hashing an `Entity`. `FxHasher` and
`SeaHasher` were also tested but were significantly slower.
- Define `EntityHashMap` type that uses the `EntityHashser`
- Use `EntityHashMap<Entity, T>` for render world entity storage,
including:
- `RenderMaterialInstances` - contains the `AssetId<M>` of the material
associated with the entity. Also for 2D.
- `RenderMeshInstances` - contains mesh transforms, flags and properties
about mesh entities. Also for 2D.
- `SkinIndices` and `MorphIndices` - contains the skin and morph index
for an entity, respectively
  - `ExtractedSprites`
  - `ExtractedUiNodes`

## Benchmarks

All benchmarks have been conducted on an M1 Max connected to AC power.
The tests are run for 1500 frames. The 1000th frame is captured for
comparison to check for visual regressions. There were none.

### 2D Meshes

`bevymark --benchmark --waves 160 --per-wave 1000 --mode mesh2d`

#### `--ordered-z`

This test spawns the 2D meshes with z incrementing back to front, which
is the ideal arrangement allocation order as it matches the sorted
render order which means lookups have a high cache hit rate.

<img width="1112" alt="Screenshot 2023-09-27 at 07 50 45"
src="https://github.com/bevyengine/bevy/assets/302146/e140bc98-7091-4a3b-8ae1-ab75d16d2ccb">

-39.1% median frame time.

#### Random

This test spawns the 2D meshes with random z. This not only makes the
batching and transparent 2D pass lookups get a lot of cache misses, it
also currently means that the meshes are almost certain to not be
batchable.

<img width="1108" alt="Screenshot 2023-09-27 at 07 51 28"
src="https://github.com/bevyengine/bevy/assets/302146/29c2e813-645a-43ce-982a-55df4bf7d8c4">

-7.2% median frame time.

### 3D Meshes

`many_cubes --benchmark`

<img width="1112" alt="Screenshot 2023-09-27 at 07 51 57"
src="https://github.com/bevyengine/bevy/assets/302146/1a729673-3254-4e2a-9072-55e27c69f0fc">

-7.7% median frame time.

### Sprites

**NOTE: On `main` sprites are using `SparseSet<Entity, T>`!**

`bevymark --benchmark --waves 160 --per-wave 1000 --mode sprite`

#### `--ordered-z`

This test spawns the sprites with z incrementing back to front, which is
the ideal arrangement allocation order as it matches the sorted render
order which means lookups have a high cache hit rate.

<img width="1116" alt="Screenshot 2023-09-27 at 07 52 31"
src="https://github.com/bevyengine/bevy/assets/302146/bc8eab90-e375-4d31-b5cd-f55f6f59ab67">

+13.0% median frame time.

#### Random

This test spawns the sprites with random z. This makes the batching and
transparent 2D pass lookups get a lot of cache misses.

<img width="1109" alt="Screenshot 2023-09-27 at 07 53 01"
src="https://github.com/bevyengine/bevy/assets/302146/22073f5d-99a7-49b0-9584-d3ac3eac3033">

+0.6% median frame time.

### UI

**NOTE: On `main` UI is using `SparseSet<Entity, T>`!**

`many_buttons`

<img width="1111" alt="Screenshot 2023-09-27 at 07 53 26"
src="https://github.com/bevyengine/bevy/assets/302146/66afd56d-cbe4-49e7-8b64-2f28f6043d85">

+15.1% median frame time.

## Alternatives

- Cart originally suggested trying out `SparseSet<Entity, T>` and indeed
that is slightly faster under ideal conditions. However,
`PassHashMap<Entity, T>` has better worst case performance when data is
randomly distributed, rather than in sorted render order, and does not
have the worst case memory usage that `SparseSet`'s dense `Vec<usize>`
that maps from the `Entity` index to sparse index into `Vec<T>`. This
dense `Vec` has to be as large as the largest Entity index used with the
`SparseSet`.
- I also tested `PassHashMap<u32, T>`, intending to use `Entity.index()`
as the key, but this proved to sometimes be slower and mostly no
different.
- The only outstanding approach that has not been implemented and tested
is to _not_ clear the render world of its entities each frame. That has
its own problems, though they could perhaps be solved.
- Performance-wise, if the entities and their component data were not
cleared, then they would incur table moves on spawn, and should not
thereafter, rather just their component data would be overwritten.
Ideally we would have a neat way of either updating data in-place via
`&mut T` queries, or inserting components if not present. This would
likely be quite cumbersome to have to remember to do everywhere, but
perhaps it only needs to be done in the more performance-sensitive
systems.
- The main problem to solve however is that we want to both maintain a
mapping between main world entities and render world entities, be able
to run the render app and world in parallel with the main app and world
for pipelined rendering, and at the same time be able to spawn entities
in the render world in such a way that those Entity ids do not collide
with those spawned in the main world. This is potentially quite
solvable, but could well be a lot of ECS work to do it in a way that
makes sense.

---

## Changelog

- Changed: Component data for entities to be drawn are no longer stored
on entities in the render world. Instead, data is stored in a
`EntityHashMap<Entity, T>` in various resources. This brings significant
performance benefits due to the way the render app clears entities every
frame. Resources of most interest are `RenderMeshInstances` and
`RenderMaterialInstances`, and their 2D counterparts.

## Migration Guide

Previously the render app extracted mesh entities and their component
data from the main world and stored them as entities and components in
the render world. Now they are extracted into essentially
`EntityHashMap<Entity, T>` where `T` are structs containing an
appropriate group of data. This means that while extract set systems
will continue to run extract queries against the main world they will
store their data in hash maps. Also, systems in later sets will either
need to look up entities in the available resources such as
`RenderMeshInstances`, or maintain their own `EntityHashMap<Entity, T>`
for their own data.

Before:
```rust
fn queue_custom(
    material_meshes: Query<(Entity, &MeshTransforms, &Handle<Mesh>), With<InstanceMaterialData>>,
) {
    ...
    for (entity, mesh_transforms, mesh_handle) in &material_meshes {
        ...
    }
}
```

After:
```rust
fn queue_custom(
    render_mesh_instances: Res<RenderMeshInstances>,
    instance_entities: Query<Entity, With<InstanceMaterialData>>,
) {
    ...
    for entity in &instance_entities {
        let Some(mesh_instance) = render_mesh_instances.get(&entity) else { continue; };
        // The mesh handle in `AssetId<Mesh>` form, and the `MeshTransforms` can now
        // be found in `mesh_instance` which is a `RenderMeshInstance`
        ...
    }
}
```

---------

Co-authored-by: robtfm <50659922+robtfm@users.noreply.github.com>
											
										
										
											2023-09-27 08:28:28 +00:00
+								// NOTE: The skinned joints uniform buffer has to be bound at a dynamic offset per
 								// entity and so cannot currently be batched.
 								pub fn no_automatic_skin_batching(
 								    mut commands: Commands,
 								    query: Query<Entity, (With<SkinnedMesh>, Without<NoAutomaticBatching>)>,
 								) {
 								    for entity in &query {
-												try_insert NoAutomaticBatching (#12396)

# Objective

fix occasional crash from commands.insert when quickly spawning and
despawning skinned/morphed meshes
 
## Solution

use `try_insert` instead of `insert`. if the entity is deleted we don't
mind failing to add the `NoAutomaticBatching` marker.
											
										
										
											2024-03-10 02:14:33 +00:00
+								        commands.entity(entity).try_insert(NoAutomaticBatching);
-												Use EntityHashMap<Entity, T> for render world entity storage for better performance (#9903)

# Objective

- Improve rendering performance, particularly by avoiding the large
system commands costs of using the ECS in the way that the render world
does.

## Solution

- Define `EntityHasher` that calculates a hash from the
`Entity.to_bits()` by `i | (i.wrapping_mul(0x517cc1b727220a95) << 32)`.
`0x517cc1b727220a95` is something like `u64::MAX / N` for N that gives a
value close to π and that works well for hashing. Thanks for @SkiFire13
for the suggestion and to @nicopap for alternative suggestions and
discussion. This approach comes from `rustc-hash` (a.k.a. `FxHasher`)
with some tweaks for the case of hashing an `Entity`. `FxHasher` and
`SeaHasher` were also tested but were significantly slower.
- Define `EntityHashMap` type that uses the `EntityHashser`
- Use `EntityHashMap<Entity, T>` for render world entity storage,
including:
- `RenderMaterialInstances` - contains the `AssetId<M>` of the material
associated with the entity. Also for 2D.
- `RenderMeshInstances` - contains mesh transforms, flags and properties
about mesh entities. Also for 2D.
- `SkinIndices` and `MorphIndices` - contains the skin and morph index
for an entity, respectively
  - `ExtractedSprites`
  - `ExtractedUiNodes`

## Benchmarks

All benchmarks have been conducted on an M1 Max connected to AC power.
The tests are run for 1500 frames. The 1000th frame is captured for
comparison to check for visual regressions. There were none.

### 2D Meshes

`bevymark --benchmark --waves 160 --per-wave 1000 --mode mesh2d`

#### `--ordered-z`

This test spawns the 2D meshes with z incrementing back to front, which
is the ideal arrangement allocation order as it matches the sorted
render order which means lookups have a high cache hit rate.

<img width="1112" alt="Screenshot 2023-09-27 at 07 50 45"
src="https://github.com/bevyengine/bevy/assets/302146/e140bc98-7091-4a3b-8ae1-ab75d16d2ccb">

-39.1% median frame time.

#### Random

This test spawns the 2D meshes with random z. This not only makes the
batching and transparent 2D pass lookups get a lot of cache misses, it
also currently means that the meshes are almost certain to not be
batchable.

<img width="1108" alt="Screenshot 2023-09-27 at 07 51 28"
src="https://github.com/bevyengine/bevy/assets/302146/29c2e813-645a-43ce-982a-55df4bf7d8c4">

-7.2% median frame time.

### 3D Meshes

`many_cubes --benchmark`

<img width="1112" alt="Screenshot 2023-09-27 at 07 51 57"
src="https://github.com/bevyengine/bevy/assets/302146/1a729673-3254-4e2a-9072-55e27c69f0fc">

-7.7% median frame time.

### Sprites

**NOTE: On `main` sprites are using `SparseSet<Entity, T>`!**

`bevymark --benchmark --waves 160 --per-wave 1000 --mode sprite`

#### `--ordered-z`

This test spawns the sprites with z incrementing back to front, which is
the ideal arrangement allocation order as it matches the sorted render
order which means lookups have a high cache hit rate.

<img width="1116" alt="Screenshot 2023-09-27 at 07 52 31"
src="https://github.com/bevyengine/bevy/assets/302146/bc8eab90-e375-4d31-b5cd-f55f6f59ab67">

+13.0% median frame time.

#### Random

This test spawns the sprites with random z. This makes the batching and
transparent 2D pass lookups get a lot of cache misses.

<img width="1109" alt="Screenshot 2023-09-27 at 07 53 01"
src="https://github.com/bevyengine/bevy/assets/302146/22073f5d-99a7-49b0-9584-d3ac3eac3033">

+0.6% median frame time.

### UI

**NOTE: On `main` UI is using `SparseSet<Entity, T>`!**

`many_buttons`

<img width="1111" alt="Screenshot 2023-09-27 at 07 53 26"
src="https://github.com/bevyengine/bevy/assets/302146/66afd56d-cbe4-49e7-8b64-2f28f6043d85">

+15.1% median frame time.

## Alternatives

- Cart originally suggested trying out `SparseSet<Entity, T>` and indeed
that is slightly faster under ideal conditions. However,
`PassHashMap<Entity, T>` has better worst case performance when data is
randomly distributed, rather than in sorted render order, and does not
have the worst case memory usage that `SparseSet`'s dense `Vec<usize>`
that maps from the `Entity` index to sparse index into `Vec<T>`. This
dense `Vec` has to be as large as the largest Entity index used with the
`SparseSet`.
- I also tested `PassHashMap<u32, T>`, intending to use `Entity.index()`
as the key, but this proved to sometimes be slower and mostly no
different.
- The only outstanding approach that has not been implemented and tested
is to _not_ clear the render world of its entities each frame. That has
its own problems, though they could perhaps be solved.
- Performance-wise, if the entities and their component data were not
cleared, then they would incur table moves on spawn, and should not
thereafter, rather just their component data would be overwritten.
Ideally we would have a neat way of either updating data in-place via
`&mut T` queries, or inserting components if not present. This would
likely be quite cumbersome to have to remember to do everywhere, but
perhaps it only needs to be done in the more performance-sensitive
systems.
- The main problem to solve however is that we want to both maintain a
mapping between main world entities and render world entities, be able
to run the render app and world in parallel with the main app and world
for pipelined rendering, and at the same time be able to spawn entities
in the render world in such a way that those Entity ids do not collide
with those spawned in the main world. This is potentially quite
solvable, but could well be a lot of ECS work to do it in a way that
makes sense.

---

## Changelog

- Changed: Component data for entities to be drawn are no longer stored
on entities in the render world. Instead, data is stored in a
`EntityHashMap<Entity, T>` in various resources. This brings significant
performance benefits due to the way the render app clears entities every
frame. Resources of most interest are `RenderMeshInstances` and
`RenderMaterialInstances`, and their 2D counterparts.

## Migration Guide

Previously the render app extracted mesh entities and their component
data from the main world and stored them as entities and components in
the render world. Now they are extracted into essentially
`EntityHashMap<Entity, T>` where `T` are structs containing an
appropriate group of data. This means that while extract set systems
will continue to run extract queries against the main world they will
store their data in hash maps. Also, systems in later sets will either
need to look up entities in the available resources such as
`RenderMeshInstances`, or maintain their own `EntityHashMap<Entity, T>`
for their own data.

Before:
```rust
fn queue_custom(
    material_meshes: Query<(Entity, &MeshTransforms, &Handle<Mesh>), With<InstanceMaterialData>>,
) {
    ...
    for (entity, mesh_transforms, mesh_handle) in &material_meshes {
        ...
    }
}
```

After:
```rust
fn queue_custom(
    render_mesh_instances: Res<RenderMeshInstances>,
    instance_entities: Query<Entity, With<InstanceMaterialData>>,
) {
    ...
    for entity in &instance_entities {
        let Some(mesh_instance) = render_mesh_instances.get(&entity) else { continue; };
        // The mesh handle in `AssetId<Mesh>` form, and the `MeshTransforms` can now
        // be found in `mesh_instance` which is a `RenderMeshInstance`
        ...
    }
}
```

---------

Co-authored-by: robtfm <50659922+robtfm@users.noreply.github.com>
											
										
										
											2023-09-27 08:28:28 +00:00
+								    }
-												Move skin code to a separate module (#9899)

# Objective

mesh.rs is infamously large. We could split off unrelated code.

## Solution

Morph targets are very similar to skinning and have their own module. We
move skinned meshes to an independent module like morph targets and give
the systems similar names.

### Open questions

Should the skinning systems and structs stay public?

---

## Migration Guide

Renamed skinning systems, resources and components:

- extract_skinned_meshes -> extract_skins
- prepare_skinned_meshes -> prepare_skins
- SkinnedMeshUniform -> SkinUniform
- SkinnedMeshJoints -> SkinIndex

---------

Co-authored-by: François <mockersf@gmail.com>
Co-authored-by: vero <email@atlasdostal.com>
											
										
										
											2023-09-25 18:40:22 +00:00
+								}