Basic adaptive batching for parallel query iteration (#4777)

# Objective
Fixes #3184. Fixes #6640. Fixes #4798. Using `Query::par_for_each(_mut)` currently requires a `batch_size` parameter, which affects how it chunks up large archetypes and tables into smaller chunks to run in parallel. Tuning this value is difficult, as the performance characteristics entirely depends on the state of the `World` it's being run on. Typically, users will just use a flat constant and just tune it by hand until it performs well in some benchmarks. However, this is both error prone and risks overfitting the tuning on that benchmark.

This PR proposes a naive automatic batch-size computation based on the current state of the `World`.

## Background
`Query::par_for_each(_mut)` schedules a new Task for every archetype or table that it matches. Archetypes/tables larger than the batch size are chunked into smaller tasks. Assuming every entity matched by the query has an identical workload, this makes the worst case scenario involve using a batch size equal to the size of the largest matched archetype or table. Conversely, a batch size of `max {archetype, table} size / thread count * COUNT_PER_THREAD` is likely the sweet spot where the overhead of scheduling tasks is minimized — at least short of grouping small archetypes/tables together.

There is also likely a strict minimum batch size below which the overhead of scheduling these tasks is heavier than running the entire thing single-threaded.

## Solution

- [x] Remove the `batch_size` from `Query(State)::par_for_each`  and friends.
- [x] Add a check to compute `batch_size = max {archetype/table} size / thread count * COUNT_PER_THREAD`
- [x] ~~Panic if thread count is 0.~~ Defer to `for_each` if the thread count is 1 or less.
- [x] Early return if there is no matched table/archetype. 
- [x] Add an override option for users who have queries that strongly violate the initial assumption that all iterated entities have an equal workload.

---

## Changelog
Changed: `Query::par_for_each(_mut)` has been changed to `Query::par_iter(_mut)` and will now automatically try to produce a batch size for callers based on the current `World` state.

## Migration Guide
The `batch_size` parameter for `Query(State)::par_for_each(_mut)` has been removed. These calls will automatically compute a batch size for you. Remove these parameters from all calls to these functions.

Before:
```rust
fn parallel_system(query: Query<&MyComponent>) {
   query.par_for_each(32, |comp| {
        ...
   });
}
```

After:

```rust
fn parallel_system(query: Query<&MyComponent>) {
   query.par_iter().for_each(|comp| {
        ...
   });
}
```

Co-authored-by: Arnav Choubey <56453634+x-52@users.noreply.github.com>
Co-authored-by: Robert Swain <robert.swain@gmail.com>
Co-authored-by: François <mockersf@gmail.com>
Co-authored-by: Corey Farwell <coreyf@rwell.org>
Co-authored-by: Aevyrie <aevyrie@gmail.com>
This commit is contained in:
James Liu 2023-01-20 08:47:20 +00:00
parent cab065bad4
commit dfea88c64d
11 changed files with 300 additions and 194 deletions

View file

@ -34,7 +34,7 @@ pub fn heavy_compute(c: &mut Criterion) {
}));
fn sys(mut query: Query<(&mut Position, &mut Transform)>) {
query.par_for_each_mut(128, |(mut pos, mut mat)| {
query.par_iter_mut().for_each_mut(|(mut pos, mut mat)| {
for _ in 0..100 {
mat.0 = mat.0.inverse();
}

View file

@ -352,20 +352,22 @@ pub fn animation_player(
parents: Query<(Option<With<AnimationPlayer>>, Option<&Parent>)>,
mut animation_players: Query<(Entity, Option<&Parent>, &mut AnimationPlayer)>,
) {
animation_players.par_for_each_mut(10, |(root, maybe_parent, mut player)| {
update_transitions(&mut player, &time);
run_animation_player(
root,
player,
&time,
&animations,
&names,
&transforms,
maybe_parent,
&parents,
&children,
);
});
animation_players
.par_iter_mut()
.for_each_mut(|(root, maybe_parent, mut player)| {
update_transitions(&mut player, &time);
run_animation_player(
root,
player,
&time,
&animations,
&names,
&transforms,
maybe_parent,
&parents,
&children,
);
});
}
#[allow(clippy::too_many_arguments)]

View file

@ -400,7 +400,8 @@ mod tests {
let results = Arc::new(Mutex::new(Vec::new()));
world
.query::<(Entity, &A)>()
.par_for_each(&world, 2, |(e, &A(i))| {
.par_iter(&world)
.for_each(|(e, &A(i))| {
results.lock().unwrap().push((e, i));
});
results.lock().unwrap().sort();
@ -420,11 +421,10 @@ mod tests {
let e4 = world.spawn((SparseStored(4), A(1))).id();
let e5 = world.spawn((SparseStored(5), A(1))).id();
let results = Arc::new(Mutex::new(Vec::new()));
world.query::<(Entity, &SparseStored)>().par_for_each(
&world,
2,
|(e, &SparseStored(i))| results.lock().unwrap().push((e, i)),
);
world
.query::<(Entity, &SparseStored)>()
.par_iter(&world)
.for_each(|(e, &SparseStored(i))| results.lock().unwrap().push((e, i)));
results.lock().unwrap().sort();
assert_eq!(
&*results.lock().unwrap(),

View file

@ -2,12 +2,14 @@ mod access;
mod fetch;
mod filter;
mod iter;
mod par_iter;
mod state;
pub use access::*;
pub use fetch::*;
pub use filter::*;
pub use iter::*;
pub use par_iter::*;
pub use state::*;
/// A debug checked version of [`Option::unwrap_unchecked`]. Will panic in

View file

@ -0,0 +1,202 @@
use crate::world::World;
use bevy_tasks::ComputeTaskPool;
use std::ops::Range;
use super::{QueryItem, QueryState, ROQueryItem, ReadOnlyWorldQuery, WorldQuery};
/// Dictates how a parallel query chunks up large tables/archetypes
/// during iteration.
///
/// A parallel query will chunk up large tables and archetypes into
/// chunks of at most a certain batch size.
///
/// By default, this batch size is automatically determined by dividing
/// the size of the largest matched archetype by the number
/// of threads. This attempts to minimize the overhead of scheduling
/// tasks onto multiple threads, but assumes each entity has roughly the
/// same amount of work to be done, which may not hold true in every
/// workload.
///
/// See [`Query::par_iter`] for more information.
///
/// [`Query::par_iter`]: crate::system::Query::par_iter
#[derive(Clone)]
pub struct BatchingStrategy {
    /// The lower and upper limits on how large a batch of entities can be.
    ///
    /// Setting the bounds to the same value will result in a fixed
    /// batch size. (Because this is a half-open `Range`, equal bounds form
    /// an *empty* range; an empty range is interpreted as a fixed batch
    /// size equal to `start` when the batch size is computed.)
    ///
    /// Defaults to `[1, usize::MAX]`.
    pub batch_size_limits: Range<usize>,
    /// The number of batches per thread in the [`ComputeTaskPool`].
    /// Increasing this value will decrease the batch size, which may
    /// increase the scheduling overhead for the iteration.
    ///
    /// Defaults to 1.
    pub batches_per_thread: usize,
}
impl BatchingStrategy {
/// Creates a new unconstrained default batching strategy.
pub const fn new() -> Self {
Self {
batch_size_limits: 1..usize::MAX,
batches_per_thread: 1,
}
}
/// Declares a batching strategy with a fixed batch size.
pub const fn fixed(batch_size: usize) -> Self {
Self {
batch_size_limits: batch_size..batch_size,
batches_per_thread: 1,
}
}
pub const fn min_batch_size(mut self, batch_size: usize) -> Self {
self.batch_size_limits.start = batch_size;
self
}
pub const fn max_batch_size(mut self, batch_size: usize) -> Self {
self.batch_size_limits.end = batch_size;
self
}
pub fn batches_per_thread(mut self, batches_per_thread: usize) -> Self {
assert!(
batches_per_thread > 0,
"The number of batches per thread must be non-zero."
);
self.batches_per_thread = batches_per_thread;
self
}
}
/// A parallel iterator over query results of a [`Query`](crate::system::Query).
///
/// This struct is created by the [`Query::par_iter`](crate::system::Query::par_iter) and
/// [`Query::par_iter_mut`](crate::system::Query::par_iter_mut) methods.
pub struct QueryParIter<'w, 's, Q: WorldQuery, F: ReadOnlyWorldQuery> {
    /// The world whose storages are iterated over.
    pub(crate) world: &'w World,
    /// The cached query state; supplies the matched table/archetype ids
    /// used both for iteration and for computing the batch size.
    pub(crate) state: &'s QueryState<Q, F>,
    /// Controls how matched tables/archetypes are split into batches.
    pub(crate) batching_strategy: BatchingStrategy,
}
impl<'w, 's, Q: ReadOnlyWorldQuery, F: ReadOnlyWorldQuery> QueryParIter<'w, 's, Q, F> {
    /// Runs `func` on each query result in parallel.
    ///
    /// This can only be called for read-only queries, see [`Self::for_each_mut`] for
    /// write-queries.
    ///
    /// # Panics
    /// The [`ComputeTaskPool`] is not initialized. If using this from a query that is being
    /// initialized and run from the ECS scheduler, this should never panic.
    ///
    /// [`ComputeTaskPool`]: bevy_tasks::ComputeTaskPool
    #[inline]
    pub fn for_each<FN: Fn(ROQueryItem<'w, Q>) + Send + Sync + Clone>(&self, func: FN) {
        // SAFETY: query is read only, so shared access cannot alias any
        // mutable borrow of the queried components.
        unsafe {
            self.for_each_unchecked(func);
        }
    }
}
impl<'w, 's, Q: WorldQuery, F: ReadOnlyWorldQuery> QueryParIter<'w, 's, Q, F> {
    /// Changes the batching strategy used when iterating.
    ///
    /// For more information on how this affects the resultant iteration, see
    /// [`BatchingStrategy`].
    pub fn batching_strategy(mut self, strategy: BatchingStrategy) -> Self {
        self.batching_strategy = strategy;
        self
    }

    /// Runs `func` on each query result in parallel.
    ///
    /// # Panics
    /// The [`ComputeTaskPool`] is not initialized. If using this from a query that is being
    /// initialized and run from the ECS scheduler, this should never panic.
    ///
    /// [`ComputeTaskPool`]: bevy_tasks::ComputeTaskPool
    #[inline]
    pub fn for_each_mut<FN: Fn(QueryItem<'w, Q>) + Send + Sync + Clone>(&mut self, func: FN) {
        // SAFETY: query has unique world access (`&mut self` on a mutably
        // constructed iterator), so no other borrow can observe the writes.
        unsafe {
            self.for_each_unchecked(func);
        }
    }

    /// Runs `func` on each query result in parallel.
    ///
    /// # Panics
    /// The [`ComputeTaskPool`] is not initialized. If using this from a query that is being
    /// initialized and run from the ECS scheduler, this should never panic.
    ///
    /// # Safety
    ///
    /// This does not check for mutable query correctness. To be safe, make sure mutable queries
    /// have unique access to the components they query.
    ///
    /// [`ComputeTaskPool`]: bevy_tasks::ComputeTaskPool
    #[inline]
    pub unsafe fn for_each_unchecked<FN: Fn(QueryItem<'w, Q>) + Send + Sync + Clone>(
        &self,
        func: FN,
    ) {
        let thread_count = ComputeTaskPool::get().thread_num();
        if thread_count <= 1 {
            // With one (or zero) threads, task-scheduling overhead can only
            // hurt: fall back to plain serial iteration.
            self.state.for_each_unchecked_manual(
                self.world,
                func,
                self.world.last_change_tick(),
                self.world.read_change_tick(),
            );
        } else {
            // Need a batch size of at least 1.
            let batch_size = self.get_batch_size(thread_count).max(1);
            self.state.par_for_each_unchecked_manual(
                self.world,
                batch_size,
                func,
                self.world.last_change_tick(),
                self.world.read_change_tick(),
            );
        }
    }

    /// Computes the batch size for this iteration: the size of the largest
    /// matched table (when both `Q` and `F` are dense) or archetype, divided
    /// by `thread_count * batches_per_thread`, clamped to the configured
    /// `batch_size_limits`.
    fn get_batch_size(&self, thread_count: usize) -> usize {
        // An empty `Range` (start >= end) is produced by
        // `BatchingStrategy::fixed` and means "use `start` as a fixed size",
        // skipping the automatic computation entirely.
        if self.batching_strategy.batch_size_limits.is_empty() {
            return self.batching_strategy.batch_size_limits.start;
        }
        assert!(
            thread_count > 0,
            "Attempted to run parallel iteration over a query with an empty TaskPool"
        );
        // Worst-case work per task is bounded by the largest matched
        // table/archetype, so size batches relative to it. Dense queries
        // iterate tables; everything else iterates archetypes.
        let max_size = if Q::IS_DENSE && F::IS_DENSE {
            let tables = &self.world.storages().tables;
            self.state
                .matched_table_ids
                .iter()
                .map(|id| tables[*id].entity_count())
                .max()
                .unwrap_or(0)
        } else {
            let archetypes = &self.world.archetypes();
            self.state
                .matched_archetype_ids
                .iter()
                .map(|id| archetypes[*id].len())
                .max()
                .unwrap_or(0)
        };
        let batch_size = max_size / (thread_count * self.batching_strategy.batches_per_thread);
        // Callers still enforce a floor of 1 after this clamp (see
        // `for_each_unchecked`), so a zero result here is acceptable.
        batch_size.clamp(
            self.batching_strategy.batch_size_limits.start,
            self.batching_strategy.batch_size_limits.end,
        )
    }
}

View file

@ -4,7 +4,8 @@ use crate::{
entity::Entity,
prelude::FromWorld,
query::{
Access, DebugCheckedUnwrap, FilteredAccess, QueryCombinationIter, QueryIter, WorldQuery,
Access, BatchingStrategy, DebugCheckedUnwrap, FilteredAccess, QueryCombinationIter,
QueryIter, QueryParIter, WorldQuery,
},
storage::{TableId, TableRow},
world::{World, WorldId},
@ -813,87 +814,34 @@ impl<Q: WorldQuery, F: ReadOnlyWorldQuery> QueryState<Q, F> {
);
}
/// Runs `func` on each query result in parallel.
/// Returns a parallel iterator over the query results for the given [`World`].
///
/// This can only be called for read-only queries, see [`Self::par_for_each_mut`] for
/// write-queries.
/// This can only be called for read-only queries, see [`par_iter_mut`] for write-queries.
///
/// # Panics
/// The [`ComputeTaskPool`] is not initialized. If using this from a query that is being
/// initialized and run from the ECS scheduler, this should never panic.
/// [`par_iter_mut`]: Self::par_iter_mut
#[inline]
pub fn par_for_each<'w, FN: Fn(ROQueryItem<'w, Q>) + Send + Sync + Clone>(
&mut self,
world: &'w World,
batch_size: usize,
func: FN,
) {
// SAFETY: query is read only
unsafe {
self.update_archetypes(world);
self.as_readonly().par_for_each_unchecked_manual(
world,
batch_size,
func,
world.last_change_tick(),
world.read_change_tick(),
);
}
}
/// Runs `func` on each query result in parallel.
///
/// # Panics
/// The [`ComputeTaskPool`] is not initialized. If using this from a query that is being
/// initialized and run from the ECS scheduler, this should never panic.
#[inline]
pub fn par_for_each_mut<'w, FN: Fn(Q::Item<'w>) + Send + Sync + Clone>(
&mut self,
world: &'w mut World,
batch_size: usize,
func: FN,
) {
let change_tick = world.change_tick();
// SAFETY: query has unique world access
unsafe {
self.update_archetypes(world);
self.par_for_each_unchecked_manual(
world,
batch_size,
func,
world.last_change_tick(),
change_tick,
);
}
}
/// Runs `func` on each query result in parallel.
///
/// This can only be called for read-only queries.
///
/// # Panics
/// The [`ComputeTaskPool`] is not initialized. If using this from a query that is being
/// initialized and run from the ECS scheduler, this should never panic.
///
/// # Safety
///
/// This does not check for mutable query correctness. To be safe, make sure mutable queries
/// have unique access to the components they query.
#[inline]
pub unsafe fn par_for_each_unchecked<'w, FN: Fn(Q::Item<'w>) + Send + Sync + Clone>(
&mut self,
world: &'w World,
batch_size: usize,
func: FN,
) {
pub fn par_iter<'w, 's>(&'s mut self, world: &'w World) -> QueryParIter<'w, 's, Q, F> {
self.update_archetypes(world);
self.par_for_each_unchecked_manual(
QueryParIter {
world,
batch_size,
func,
world.last_change_tick(),
world.read_change_tick(),
);
state: self,
batching_strategy: BatchingStrategy::new(),
}
}
/// Returns a parallel iterator over the query results for the given [`World`].
///
/// This can only be called for mutable queries, see [`par_iter`] for read-only-queries.
///
/// [`par_iter`]: Self::par_iter
#[inline]
pub fn par_iter_mut<'w, 's>(&'s mut self, world: &'w mut World) -> QueryParIter<'w, 's, Q, F> {
self.update_archetypes(world);
QueryParIter {
world,
state: self,
batching_strategy: BatchingStrategy::new(),
}
}
/// Runs `func` on each query result for the given [`World`], where the last change and

View file

@ -17,7 +17,7 @@ pub struct ParallelCommandsState {
thread_local_storage: ThreadLocal<Cell<CommandQueue>>,
}
/// An alternative to [`Commands`] that can be used in parallel contexts, such as those in [`Query::par_for_each`](crate::system::Query::par_for_each)
/// An alternative to [`Commands`] that can be used in parallel contexts, such as those in [`Query::par_iter`](crate::system::Query::par_iter)
///
/// Note: Because command application order will depend on how many threads are ran, non-commutative commands may result in non-deterministic results.
///
@ -33,7 +33,7 @@ pub struct ParallelCommandsState {
/// mut query: Query<(Entity, &Velocity)>,
/// par_commands: ParallelCommands
/// ) {
/// query.par_for_each(32, |(entity, velocity)| {
/// query.par_iter().for_each(|(entity, velocity)| {
/// if velocity.magnitude() > 10.0 {
/// par_commands.command_scope(|mut commands| {
/// commands.entity(entity).despawn();

View file

@ -2,8 +2,8 @@ use crate::{
component::Component,
entity::Entity,
query::{
QueryCombinationIter, QueryEntityError, QueryIter, QueryManyIter, QuerySingleError,
QueryState, ROQueryItem, ReadOnlyWorldQuery, WorldQuery,
BatchingStrategy, QueryCombinationIter, QueryEntityError, QueryIter, QueryManyIter,
QueryParIter, QuerySingleError, QueryState, ROQueryItem, ReadOnlyWorldQuery, WorldQuery,
},
world::{Mut, World},
};
@ -188,7 +188,7 @@ use std::{any::TypeId, borrow::Borrow, fmt::Debug};
/// |Query methods|Effect|
/// |:---:|---|
/// |[`iter`]\([`_mut`][`iter_mut`])|Returns an iterator over all query items.|
/// |[`for_each`]\([`_mut`][`for_each_mut`]),<br>[`par_for_each`]\([`_mut`][`par_for_each_mut`])|Runs a specified function for each query item.|
/// |[`for_each`]\([`_mut`][`for_each_mut`]),<br>[`par_iter`]\([`_mut`][`par_iter_mut`])|Runs a specified function for each query item.|
/// |[`iter_many`]\([`_mut`][`iter_many_mut`])|Iterates or runs a specified function over query items generated by a list of entities.|
/// |[`iter_combinations`]\([`_mut`][`iter_combinations_mut`])|Returns an iterator over all combinations of a specified number of query items.|
/// |[`get`]\([`_mut`][`get_mut`])|Returns the query item for the specified entity.|
@ -224,7 +224,7 @@ use std::{any::TypeId, borrow::Borrow, fmt::Debug};
/// |Query operation|Computational complexity|
/// |:---:|:---:|
/// |[`iter`]\([`_mut`][`iter_mut`])|O(n)|
/// |[`for_each`]\([`_mut`][`for_each_mut`]),<br>[`par_for_each`]\([`_mut`][`par_for_each_mut`])|O(n)|
/// |[`for_each`]\([`_mut`][`for_each_mut`]),<br>[`par_iter`]\([`_mut`][`par_iter_mut`])|O(n)|
/// |[`iter_many`]\([`_mut`][`iter_many_mut`])|O(k)|
/// |[`iter_combinations`]\([`_mut`][`iter_combinations_mut`])|O(<sub>n</sub>C<sub>r</sub>)|
/// |[`get`]\([`_mut`][`get_mut`])|O(1)|
@ -263,8 +263,8 @@ use std::{any::TypeId, borrow::Borrow, fmt::Debug};
/// [`many`]: Self::many
/// [`many_mut`]: Self::many_mut
/// [`Or`]: crate::query::Or
/// [`par_for_each`]: Self::par_for_each
/// [`par_for_each_mut`]: Self::par_for_each_mut
/// [`par_iter`]: Self::par_iter
/// [`par_iter_mut`]: Self::par_iter_mut
/// [performance]: #performance
/// [`single`]: Self::single
/// [`single_mut`]: Self::single_mut
@ -734,80 +734,32 @@ impl<'w, 's, Q: WorldQuery, F: ReadOnlyWorldQuery> Query<'w, 's, Q, F> {
};
}
/// Runs `f` on each read-only query item in parallel.
/// Returns a parallel iterator over the query results for the given [`World`].
///
/// Parallelization is achieved by using the [`World`]'s [`ComputeTaskPool`].
/// This can only be called for read-only queries, see [`par_iter_mut`] for write-queries.
///
/// # Tasks and batch size
///
/// The items in the query get sorted into batches.
/// Internally, this function spawns a group of futures that each take on a `batch_size` sized section of the items (or less if the division is not perfect).
/// Then, the tasks in the [`ComputeTaskPool`] work through these futures.
///
/// You can use this value to tune between maximum multithreading ability (many small batches) and minimum parallelization overhead (few big batches).
/// Rule of thumb: If the function body is (mostly) computationally expensive but there are not many items, a small batch size (=more batches) may help to even out the load.
/// If the body is computationally cheap and you have many items, a large batch size (=fewer batches) avoids spawning additional futures that don't help to even out the load.
///
/// [`ComputeTaskPool`]: bevy_tasks::prelude::ComputeTaskPool
///
/// # Panics
///
/// This method panics if the [`ComputeTaskPool`] resource is added to the `World` before using this method.
/// If using this from a query that is being initialized and run from the [`Schedule`](crate::schedule::Schedule), this never panics.
///
/// # See also
///
/// - [`par_for_each_mut`](Self::par_for_each_mut) for operating on mutable query items.
/// [`par_iter_mut`]: Self::par_iter_mut
#[inline]
pub fn par_for_each<'this>(
&'this self,
batch_size: usize,
f: impl Fn(ROQueryItem<'this, Q>) + Send + Sync + Clone,
) {
// SAFETY: system runs without conflicts with other systems. same-system queries have runtime
// borrow checks when they conflict
unsafe {
self.state.as_readonly().par_for_each_unchecked_manual(
self.world,
batch_size,
f,
self.last_change_tick,
self.change_tick,
);
};
pub fn par_iter(&mut self) -> QueryParIter<'_, '_, Q::ReadOnly, F::ReadOnly> {
QueryParIter {
world: self.world,
state: self.state.as_readonly(),
batching_strategy: BatchingStrategy::new(),
}
}
/// Runs `f` on each read-only query item in parallel.
/// Returns a parallel iterator over the query results for the given [`World`].
///
/// Parallelization is achieved by using the [`World`]'s [`ComputeTaskPool`].
/// This can only be called for mutable queries, see [`par_iter`] for read-only-queries.
///
/// # Panics
///
/// This method panics if the [`ComputeTaskPool`] resource is added to the `World` before using this method.
/// If using this from a query that is being initialized and run from the [`Schedule`](crate::schedule::Schedule), this never panics.
///
/// [`ComputeTaskPool`]: bevy_tasks::prelude::ComputeTaskPool
///
/// # See also
///
/// - [`par_for_each`](Self::par_for_each) for more usage details.
/// [`par_iter`]: Self::par_iter
#[inline]
pub fn par_for_each_mut<'a>(
&'a mut self,
batch_size: usize,
f: impl Fn(Q::Item<'a>) + Send + Sync + Clone,
) {
// SAFETY: system runs without conflicts with other systems. same-system queries have runtime
// borrow checks when they conflict
unsafe {
self.state.par_for_each_unchecked_manual(
self.world,
batch_size,
f,
self.last_change_tick,
self.change_tick,
);
};
pub fn par_iter_mut(&mut self) -> QueryParIter<'_, '_, Q, F> {
QueryParIter {
world: self.world,
state: self.state,
batching_strategy: BatchingStrategy::new(),
}
}
/// Returns the read-only query item for the given [`Entity`].

View file

@ -350,9 +350,6 @@ fn propagate_recursive(
Ok(())
}
// the batch size used for check_visibility, chosen because this number tends to perform well
const VISIBLE_ENTITIES_QUERY_BATCH_SIZE: usize = 1024;
/// System updating the visibility of entities each frame.
///
/// The system is labelled with [`VisibilitySystems::CheckVisibility`]. Each frame, it updates the
@ -376,9 +373,9 @@ pub fn check_visibility(
) {
for (mut visible_entities, frustum, maybe_view_mask) in &mut view_query {
let view_mask = maybe_view_mask.copied().unwrap_or_default();
visible_entities.entities.clear();
visible_aabb_query.par_for_each_mut(
VISIBLE_ENTITIES_QUERY_BATCH_SIZE,
visible_aabb_query.par_iter_mut().for_each_mut(
|(
entity,
mut computed_visibility,
@ -423,8 +420,7 @@ pub fn check_visibility(
},
);
visible_no_aabb_query.par_for_each_mut(
VISIBLE_ENTITIES_QUERY_BATCH_SIZE,
visible_no_aabb_query.par_iter_mut().for_each_mut(
|(entity, mut computed_visibility, maybe_entity_mask)| {
// skip computing visibility for entities that are configured to be hidden. is_visible_in_view has already been set to false
// in visibility_propagate_system

View file

@ -15,9 +15,11 @@ pub fn sync_simple_transforms(
(Changed<Transform>, Without<Parent>, Without<Children>),
>,
) {
query.par_for_each_mut(1024, |(transform, mut global_transform)| {
*global_transform = GlobalTransform::from(*transform);
});
query
.par_iter_mut()
.for_each_mut(|(transform, mut global_transform)| {
*global_transform = GlobalTransform::from(*transform);
});
}
/// Update [`GlobalTransform`] component of entities based on entity hierarchy and
@ -33,11 +35,7 @@ pub fn propagate_transforms(
transform_query: Query<(Ref<Transform>, &mut GlobalTransform, Option<&Children>), With<Parent>>,
parent_query: Query<(Entity, Ref<Parent>)>,
) {
root_query.par_for_each_mut(
// The differing depths and sizes of hierarchy trees causes the work for each root to be
// different. A batch size of 1 ensures that each tree gets it's own task and multiple
// large trees are not clumped together.
1,
root_query.par_iter_mut().for_each_mut(
|(entity, children, transform, mut global_transform)| {
let changed = transform.is_changed();
if changed {

View file

@ -1,5 +1,6 @@
//! Illustrates parallel queries with `ParallelIterator`.
use bevy::ecs::query::BatchingStrategy;
use bevy::prelude::*;
use rand::random;
@ -24,16 +25,18 @@ fn spawn_system(mut commands: Commands, asset_server: Res<AssetServer>) {
// Move sprites according to their velocity
fn move_system(mut sprites: Query<(&mut Transform, &Velocity)>) {
// Compute the new location of each sprite in parallel on the
// ComputeTaskPool using batches of 32 sprites
// ComputeTaskPool
//
// This example is only for demonstrative purposes. Using a
// ParallelIterator for an inexpensive operation like addition on only 128
// elements will not typically be faster than just using a normal Iterator.
// See the ParallelIterator documentation for more information on when
// to use or not use ParallelIterator over a normal Iterator.
sprites.par_for_each_mut(32, |(mut transform, velocity)| {
transform.translation += velocity.extend(0.0);
});
sprites
.par_iter_mut()
.for_each_mut(|(mut transform, velocity)| {
transform.translation += velocity.extend(0.0);
});
}
// Bounce sprites outside the window
@ -45,10 +48,13 @@ fn bounce_system(windows: Query<&Window>, mut sprites: Query<(&Transform, &mut V
let right = width / 2.0;
let bottom = height / -2.0;
let top = height / 2.0;
// The default batch size can also be overridden.
// In this case a batch size of 32 is chosen to limit the overhead of
// ParallelIterator, since negating a vector is very inexpensive.
sprites
// Batch size of 32 is chosen to limit the overhead of
// ParallelIterator, since negating a vector is very inexpensive.
.par_for_each_mut(32, |(transform, mut v)| {
.par_iter_mut()
.batching_strategy(BatchingStrategy::fixed(32))
.for_each_mut(|(transform, mut v)| {
if !(left < transform.translation.x
&& transform.translation.x < right
&& bottom < transform.translation.y