Opportunistically use dense iteration for archetypal iteration (#14049)

# Objective - currently, bevy employs sparse iteration if any of the target components in the query are stored in a sparse set. it may lead to increased cache misses in some cases, potentially impacting performance. - partial fixes #12381 ## Solution - use dense iteration when an archetype and its table have the same entity count. - to avoid introducing complicate unsafe noise, this pr only implement for `for_each ` style iteration. - added a benchmark to test performance for hybrid iteration. ## Performance ![image](https://github.com/bevyengine/bevy/assets/45868716/5cce13cf-6ff2-4861-9576-e75edc63bd46) nearly 2x win in specific scenarios, and no performance degradation in other test cases. --------- Co-authored-by: Alice Cecile <alice.i.cecile@gmail.com> Co-authored-by: Christian Hughes <9044780+ItsDoot@users.noreply.github.com>
2024-11-24 21:53:07 +00:00 · 2024-08-03 05:18:15 +08:00 · 2024-08-03 05:18:15 +08:00 · 8235daaea0
commit 8235daaea0
parent 7c80ae7313
3 changed files with 129 additions and 2 deletions
--- a/benches/benches/bevy_ecs/iteration/iter_simple_foreach_hybrid.rs
+++ b/benches/benches/bevy_ecs/iteration/iter_simple_foreach_hybrid.rs
@ -0,0 +1,43 @@
+use bevy_ecs::prelude::*;
+use rand::{prelude::SliceRandom, SeedableRng};
+use rand_chacha::ChaCha8Rng;
+
+#[derive(Component, Copy, Clone)]
+struct TableData(f32);
+
+#[derive(Component, Copy, Clone)]
+#[component(storage = "SparseSet")]
+struct SparseData(f32);
+
+fn deterministic_rand() -> ChaCha8Rng {
+    ChaCha8Rng::seed_from_u64(42)
+}
+pub struct Benchmark<'w>(World, QueryState<(&'w mut TableData, &'w SparseData)>);
+
+impl<'w> Benchmark<'w> {
+    pub fn new() -> Self {
+        let mut world = World::new();
+
+        let mut v = vec![];
+        for _ in 0..10000 {
+            world.spawn((TableData(0.0), SparseData(0.0))).id();
+            v.push(world.spawn(TableData(0.)).id());
+        }
+
+        // by shuffling ,randomize the archetype iteration order to significantly deviate from the table order. This maximizes the loss of cache locality during archetype-based iteration.
+        v.shuffle(&mut deterministic_rand());
+        for e in v.into_iter() {
+            world.entity_mut(e).despawn();
+        }
+
+        let query = world.query::<(&mut TableData, &SparseData)>();
+        Self(world, query)
+    }
+
+    #[inline(never)]
+    pub fn run(&mut self) {
+        self.1
+            .iter_mut(&mut self.0)
+            .for_each(|(mut v1, v2)| v1.0 += v2.0)
+    }
+}
--- a/benches/benches/bevy_ecs/iteration/mod.rs
+++ b/benches/benches/bevy_ecs/iteration/mod.rs
@ -11,6 +11,7 @@ mod iter_frag_wide;
 mod iter_frag_wide_sparse;
 mod iter_simple;
 mod iter_simple_foreach;
+mod iter_simple_foreach_hybrid;
 mod iter_simple_foreach_sparse_set;
 mod iter_simple_foreach_wide;
 mod iter_simple_foreach_wide_sparse_set;
@ -71,6 +72,10 @@ fn iter_simple(c: &mut Criterion) {
        let mut bench = iter_simple_foreach_wide_sparse_set::Benchmark::new();
        b.iter(move || bench.run());
    });
+    group.bench_function("foreach_hybrid", |b| {
+        let mut bench = iter_simple_foreach_hybrid::Benchmark::new();
+        b.iter(move || bench.run());
+    });
    group.finish();
 }

--- a/crates/bevy_ecs/src/query/iter.rs
+++ b/crates/bevy_ecs/src/query/iter.rs
@ -167,6 +167,70 @@ impl<'w, 's, D: QueryData, F: QueryFilter> QueryIter<'w, 's, D, F> {
        accum
    }

+    /// Executes the equivalent of [`Iterator::fold`] over a contiguous segment
+    /// from an archetype which has the same entity count as its table.
+    ///
+    /// # Safety
+    ///  - all `indices` must be in `[0, archetype.len())`.
+    ///  - `archetype` must match D and F
+    ///  - `archetype` must have the same length with it's table.
+    ///  - Either `D::IS_DENSE` or `F::IS_DENSE` must be false.
+    #[inline]
+    pub(super) unsafe fn fold_over_dense_archetype_range<B, Func>(
+        &mut self,
+        mut accum: B,
+        func: &mut Func,
+        archetype: &'w Archetype,
+        rows: Range<usize>,
+    ) -> B
+    where
+        Func: FnMut(B, D::Item<'w>) -> B,
+    {
+        assert!(
+            rows.end <= u32::MAX as usize,
+            "TableRow is only valid up to u32::MAX"
+        );
+        let table = self.tables.get(archetype.table_id()).debug_checked_unwrap();
+
+        debug_assert!(
+            archetype.len() == table.entity_count(),
+            "archetype and it's table must have the same length. "
+        );
+
+        D::set_archetype(
+            &mut self.cursor.fetch,
+            &self.query_state.fetch_state,
+            archetype,
+            table,
+        );
+        F::set_archetype(
+            &mut self.cursor.filter,
+            &self.query_state.filter_state,
+            archetype,
+            table,
+        );
+        let entities = table.entities();
+        for row in rows {
+            // SAFETY: Caller assures `row` in range of the current archetype.
+            let entity = unsafe { *entities.get_unchecked(row) };
+            let row = TableRow::from_usize(row);
+
+            // SAFETY: set_table was called prior.
+            // Caller assures `row` in range of the current archetype.
+            let filter_matched = unsafe { F::filter_fetch(&mut self.cursor.filter, entity, row) };
+            if !filter_matched {
+                continue;
+            }
+
+            // SAFETY: set_table was called prior.
+            // Caller assures `row` in range of the current archetype.
+            let item = D::fetch(&mut self.cursor.fetch, entity, row);
+
+            accum = func(accum, item);
+        }
+        accum
+    }
+
    /// Sorts all query items into a new iterator, using the query lens as a key.
    ///
    /// This sort is stable (i.e., does not reorder equal elements).
@ -914,12 +978,27 @@ impl<'w, 's, D: QueryData, F: QueryFilter> Iterator for QueryIter<'w, 's, D, F>
                let archetype =
                    // SAFETY: Matched archetype IDs are guaranteed to still exist.
                    unsafe { self.archetypes.get(id.archetype_id).debug_checked_unwrap() };
-                accum =
+                // SAFETY: Matched table IDs are guaranteed to still exist.
+                let table = unsafe { self.tables.get(archetype.table_id()).debug_checked_unwrap() };
+
+                // When an archetype and its table have equal entity counts, dense iteration can be safely used.
+                // this leverages cache locality to optimize performance.
+                if table.entity_count() == archetype.len() {
+                    accum =
+                    // SAFETY:
+                    // - The fetched archetype matches both D and F
+                    // - The provided archetype and its' table have the same length.
+                    // - The provided range is equivalent to [0, archetype.len)
+                    // - The if block ensures that ether D::IS_DENSE or F::IS_DENSE are false
+                    unsafe { self.fold_over_dense_archetype_range(accum, &mut func, archetype,0..archetype.len()) };
+                } else {
+                    accum =
                    // SAFETY:
                    // - The fetched archetype matches both D and F
                    // - The provided range is equivalent to [0, archetype.len)
                    // - The if block ensures that ether D::IS_DENSE or F::IS_DENSE are false
-                    unsafe { self.fold_over_archetype_range(accum, &mut func, archetype, 0..archetype.len()) };
+                    unsafe { self.fold_over_archetype_range(accum, &mut func, archetype,0..archetype.len()) };
+                }
            }
        }
        accum