Use multidraw for opaque meshes when GPU culling is in use. (#16427)

This commit adds support for *multidraw*, which is a feature that allows multiple meshes to be drawn in a single draw call. `wgpu` currently implements multidraw only on Vulkan, so this feature is only enabled there. Multiple meshes can be drawn at once if they're in the same vertex and index buffers and otherwise share the same *batch set* key (pipeline, draw function, material bind group, and lightmap). (Thus, for example, at present the materials and textures must be identical, but see #16368.) Multidraw is a significant performance improvement during the draw phase because it reduces the number of rebindings, as well as the number of draw calls. This feature is currently only enabled when GPU culling is used, i.e. when `GpuCulling` is present on a camera. Therefore, if you run, for example, `scene_viewer`, you will not see any performance improvements, because `scene_viewer` doesn't add the `GpuCulling` component to its camera. Additionally, the multidraw feature is only implemented for opaque 3D meshes and not for shadows or 2D meshes. I plan to make GPU culling the default and to extend the feature to shadows in the future. Also, in the future I suspect that polyfilling multidraw on APIs that don't support it will be fruitful, as even without driver-level support, the use of multidraw allows us to avoid expensive `wgpu` rebindings.
2024-12-18 17:13:10 +00:00 · 2024-12-06 09:22:03 -08:00 · 2024-12-06 09:22:03 -08:00 · f5de3f08fb
commit f5de3f08fb
parent 4d6b02af89
28 changed files with 669 additions and 310 deletions
--- a/crates/bevy_core_pipeline/src/core_2d/mod.rs
+++ b/crates/bevy_core_pipeline/src/core_2d/mod.rs
@ -33,6 +33,8 @@ pub mod graph {
 use core::ops::Range;

 use bevy_asset::UntypedAssetId;
+use bevy_render::batching::gpu_preprocessing::GpuPreprocessingMode;
+use bevy_render::render_phase::PhaseItemBinKey;
 use bevy_utils::HashMap;
 pub use camera_2d::*;
 pub use main_opaque_pass_2d_node::*;
@ -153,6 +155,14 @@ pub struct Opaque2dBinKey {
    pub material_bind_group_id: Option<BindGroupId>,
 }

+impl PhaseItemBinKey for Opaque2dBinKey {
+    type BatchSetKey = ();
+
+    fn get_batch_set_key(&self) -> Option<Self::BatchSetKey> {
+        None
+    }
+}
+
 impl PhaseItem for Opaque2d {
    #[inline]
    fn entity(&self) -> Entity {
@ -179,7 +189,7 @@ impl PhaseItem for Opaque2d {
    }

    fn extra_index(&self) -> PhaseItemExtraIndex {
-        self.extra_index
+        self.extra_index.clone()
    }

    fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range<u32>, &mut PhaseItemExtraIndex) {
@ -269,7 +279,7 @@ impl PhaseItem for AlphaMask2d {
    }

    fn extra_index(&self) -> PhaseItemExtraIndex {
-        self.extra_index
+        self.extra_index.clone()
    }

    fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range<u32>, &mut PhaseItemExtraIndex) {
@ -295,6 +305,14 @@ impl BinnedPhaseItem for AlphaMask2d {
    }
 }

+impl PhaseItemBinKey for AlphaMask2dBinKey {
+    type BatchSetKey = ();
+
+    fn get_batch_set_key(&self) -> Option<Self::BatchSetKey> {
+        None
+    }
+}
+
 impl CachedRenderPipelinePhaseItem for AlphaMask2d {
    #[inline]
    fn cached_pipeline(&self) -> CachedRenderPipelineId {
@ -340,7 +358,7 @@ impl PhaseItem for Transparent2d {

    #[inline]
    fn extra_index(&self) -> PhaseItemExtraIndex {
-        self.extra_index
+        self.extra_index.clone()
    }

    #[inline]
@ -385,8 +403,8 @@ pub fn extract_core_2d_camera_phases(
            continue;
        }
        transparent_2d_phases.insert_or_clear(entity);
-        opaque_2d_phases.insert_or_clear(entity);
-        alpha_mask_2d_phases.insert_or_clear(entity);
+        opaque_2d_phases.insert_or_clear(entity, GpuPreprocessingMode::None);
+        alpha_mask_2d_phases.insert_or_clear(entity, GpuPreprocessingMode::None);

        live_entities.insert(entity);
    }
--- a/crates/bevy_core_pipeline/src/core_3d/mod.rs
+++ b/crates/bevy_core_pipeline/src/core_3d/mod.rs
@ -65,6 +65,10 @@ pub const DEPTH_TEXTURE_SAMPLING_SUPPORTED: bool = true;

 use core::ops::Range;

+use bevy_render::batching::gpu_preprocessing::{GpuPreprocessingMode, GpuPreprocessingSupport};
+use bevy_render::mesh::allocator::SlabId;
+use bevy_render::render_phase::PhaseItemBinKey;
+use bevy_render::view::GpuCulling;
 pub use camera_3d::*;
 pub use main_opaque_pass_3d_node::*;
 pub use main_transparent_pass_3d_node::*;
@ -224,9 +228,13 @@ pub struct Opaque3d {
    pub extra_index: PhaseItemExtraIndex,
 }

-/// Data that must be identical in order to batch phase items together.
+/// Information that must be identical in order to place opaque meshes in the
+/// same *batch set*.
+///
+/// A batch set is a set of batches that can be multi-drawn together, if
+/// multi-draw is in use.
 #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub struct Opaque3dBinKey {
+pub struct Opaque3dBatchSetKey {
    /// The identifier of the render pipeline.
    pub pipeline: CachedRenderPipelineId,

@ -238,14 +246,45 @@ pub struct Opaque3dBinKey {
    /// In the case of PBR, this is the `MaterialBindGroupIndex`.
    pub material_bind_group_index: Option<u32>,

+    /// The ID of the slab of GPU memory that contains vertex data.
+    ///
+    /// For non-mesh items, you can fill this with 0 if your items can be
+    /// multi-drawn, or with a unique value if they can't.
+    pub vertex_slab: SlabId,
+
+    /// The ID of the slab of GPU memory that contains index data, if present.
+    ///
+    /// For non-mesh items, you can safely fill this with `None`.
+    pub index_slab: Option<SlabId>,
+
+    /// The lightmap, if present.
+    pub lightmap_image: Option<AssetId<Image>>,
+}
+
+/// Data that must be identical in order to *batch* phase items together.
+///
+/// Note that a *batch set* (if multi-draw is in use) contains multiple batches.
+#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct Opaque3dBinKey {
+    /// The key of the *batch set*.
+    ///
+    /// Because every batch belongs to a batch set, all meshes in a batch must
+    /// share the same batch set key.
+    pub batch_set_key: Opaque3dBatchSetKey,
+
    /// The asset that this phase item is associated with.
    ///
    /// Normally, this is the ID of the mesh, but for non-mesh items it might be
    /// the ID of another type of asset.
    pub asset_id: UntypedAssetId,
+}

-    /// The lightmap, if present.
-    pub lightmap_image: Option<AssetId<Image>>,
+impl PhaseItemBinKey for Opaque3dBinKey {
+    type BatchSetKey = Opaque3dBatchSetKey;
+
+    fn get_batch_set_key(&self) -> Option<Self::BatchSetKey> {
+        Some(self.batch_set_key.clone())
+    }
 }

 impl PhaseItem for Opaque3d {
@ -261,7 +300,7 @@ impl PhaseItem for Opaque3d {

    #[inline]
    fn draw_function(&self) -> DrawFunctionId {
-        self.key.draw_function
+        self.key.batch_set_key.draw_function
    }

    #[inline]
@ -275,7 +314,7 @@ impl PhaseItem for Opaque3d {
    }

    fn extra_index(&self) -> PhaseItemExtraIndex {
-        self.extra_index
+        self.extra_index.clone()
    }

    fn batch_range_and_extra_index_mut(&mut self) -> (&mut Range<u32>, &mut PhaseItemExtraIndex) {
@ -305,7 +344,7 @@ impl BinnedPhaseItem for Opaque3d {
 impl CachedRenderPipelinePhaseItem for Opaque3d {
    #[inline]
    fn cached_pipeline(&self) -> CachedRenderPipelineId {
-        self.key.pipeline
+        self.key.batch_set_key.pipeline
    }
 }

@ -343,7 +382,7 @@ impl PhaseItem for AlphaMask3d {

    #[inline]
    fn extra_index(&self) -> PhaseItemExtraIndex {
-        self.extra_index
+        self.extra_index.clone()
    }

    #[inline]
@ -426,7 +465,7 @@ impl PhaseItem for Transmissive3d {

    #[inline]
    fn extra_index(&self) -> PhaseItemExtraIndex {
-        self.extra_index
+        self.extra_index.clone()
    }

    #[inline]
@ -493,7 +532,7 @@ impl PhaseItem for Transparent3d {

    #[inline]
    fn extra_index(&self) -> PhaseItemExtraIndex {
-        self.extra_index
+        self.extra_index.clone()
    }

    #[inline]
@ -529,18 +568,27 @@ pub fn extract_core_3d_camera_phases(
    mut alpha_mask_3d_phases: ResMut<ViewBinnedRenderPhases<AlphaMask3d>>,
    mut transmissive_3d_phases: ResMut<ViewSortedRenderPhases<Transmissive3d>>,
    mut transparent_3d_phases: ResMut<ViewSortedRenderPhases<Transparent3d>>,
-    cameras_3d: Extract<Query<(RenderEntity, &Camera), With<Camera3d>>>,
+    cameras_3d: Extract<Query<(RenderEntity, &Camera, Has<GpuCulling>), With<Camera3d>>>,
    mut live_entities: Local<EntityHashSet>,
+    gpu_preprocessing_support: Res<GpuPreprocessingSupport>,
 ) {
    live_entities.clear();

-    for (entity, camera) in &cameras_3d {
+    for (entity, camera, has_gpu_culling) in &cameras_3d {
        if !camera.is_active {
            continue;
        }

-        opaque_3d_phases.insert_or_clear(entity);
-        alpha_mask_3d_phases.insert_or_clear(entity);
+        // If GPU culling is in use, use it (and indirect mode); otherwise, just
+        // preprocess the meshes.
+        let gpu_preprocessing_mode = gpu_preprocessing_support.min(if has_gpu_culling {
+            GpuPreprocessingMode::Culling
+        } else {
+            GpuPreprocessingMode::PreprocessingOnly
+        });
+
+        opaque_3d_phases.insert_or_clear(entity, gpu_preprocessing_mode);
+        alpha_mask_3d_phases.insert_or_clear(entity, gpu_preprocessing_mode);
        transmissive_3d_phases.insert_or_clear(entity);
        transparent_3d_phases.insert_or_clear(entity);

@ -554,6 +602,8 @@ pub fn extract_core_3d_camera_phases(
 }

 // Extract the render phases for the prepass
+
+#[allow(clippy::too_many_arguments)]
 pub fn extract_camera_prepass_phase(
    mut commands: Commands,
    mut opaque_3d_prepass_phases: ResMut<ViewBinnedRenderPhases<Opaque3dPrepass>>,
@ -565,6 +615,7 @@ pub fn extract_camera_prepass_phase(
            (
                RenderEntity,
                &Camera,
+                Has<GpuCulling>,
                Has<DepthPrepass>,
                Has<NormalPrepass>,
                Has<MotionVectorPrepass>,
@ -574,27 +625,43 @@ pub fn extract_camera_prepass_phase(
        >,
    >,
    mut live_entities: Local<EntityHashSet>,
+    gpu_preprocessing_support: Res<GpuPreprocessingSupport>,
 ) {
    live_entities.clear();

-    for (entity, camera, depth_prepass, normal_prepass, motion_vector_prepass, deferred_prepass) in
-        cameras_3d.iter()
+    for (
+        entity,
+        camera,
+        gpu_culling,
+        depth_prepass,
+        normal_prepass,
+        motion_vector_prepass,
+        deferred_prepass,
+    ) in cameras_3d.iter()
    {
        if !camera.is_active {
            continue;
        }

+        // If GPU culling is in use, use it (and indirect mode); otherwise, just
+        // preprocess the meshes.
+        let gpu_preprocessing_mode = gpu_preprocessing_support.min(if gpu_culling {
+            GpuPreprocessingMode::Culling
+        } else {
+            GpuPreprocessingMode::PreprocessingOnly
+        });
+
        if depth_prepass || normal_prepass || motion_vector_prepass {
-            opaque_3d_prepass_phases.insert_or_clear(entity);
-            alpha_mask_3d_prepass_phases.insert_or_clear(entity);
+            opaque_3d_prepass_phases.insert_or_clear(entity, gpu_preprocessing_mode);
+            alpha_mask_3d_prepass_phases.insert_or_clear(entity, gpu_preprocessing_mode);
        } else {
            opaque_3d_prepass_phases.remove(&entity);
            alpha_mask_3d_prepass_phases.remove(&entity);
        }

        if deferred_prepass {
-            opaque_3d_deferred_phases.insert_or_clear(entity);
-            alpha_mask_3d_deferred_phases.insert_or_clear(entity);
+            opaque_3d_deferred_phases.insert_or_clear(entity, gpu_preprocessing_mode);
+            alpha_mask_3d_deferred_phases.insert_or_clear(entity, gpu_preprocessing_mode);
        } else {
            opaque_3d_deferred_phases.remove(&entity);
            alpha_mask_3d_deferred_phases.remove(&entity);
--- a/crates/bevy_core_pipeline/src/deferred/mod.rs
+++ b/crates/bevy_core_pipeline/src/deferred/mod.rs
@ -58,7 +58,7 @@ impl PhaseItem for Opaque3dDeferred {

    #[inline]
    fn extra_index(&self) -> PhaseItemExtraIndex {
-        self.extra_index
+        self.extra_index.clone()
    }

    #[inline]
@ -133,7 +133,7 @@ impl PhaseItem for AlphaMask3dDeferred {

    #[inline]
    fn extra_index(&self) -> PhaseItemExtraIndex {
-        self.extra_index
+        self.extra_index.clone()
    }

    #[inline]
--- a/crates/bevy_core_pipeline/src/prepass/mod.rs
+++ b/crates/bevy_core_pipeline/src/prepass/mod.rs
@ -34,6 +34,7 @@ use bevy_asset::UntypedAssetId;
 use bevy_ecs::prelude::*;
 use bevy_math::Mat4;
 use bevy_reflect::{std_traits::ReflectDefault, Reflect};
+use bevy_render::render_phase::PhaseItemBinKey;
 use bevy_render::sync_world::MainEntity;
 use bevy_render::{
    render_phase::{
@ -167,6 +168,14 @@ pub struct OpaqueNoLightmap3dBinKey {
    pub asset_id: UntypedAssetId,
 }

+impl PhaseItemBinKey for OpaqueNoLightmap3dBinKey {
+    type BatchSetKey = ();
+
+    fn get_batch_set_key(&self) -> Option<Self::BatchSetKey> {
+        None
+    }
+}
+
 impl PhaseItem for Opaque3dPrepass {
    #[inline]
    fn entity(&self) -> Entity {
@ -194,7 +203,7 @@ impl PhaseItem for Opaque3dPrepass {

    #[inline]
    fn extra_index(&self) -> PhaseItemExtraIndex {
-        self.extra_index
+        self.extra_index.clone()
    }

    #[inline]
@ -268,7 +277,7 @@ impl PhaseItem for AlphaMask3dPrepass {

    #[inline]
    fn extra_index(&self) -> PhaseItemExtraIndex {
-        self.extra_index
+        self.extra_index.clone()
    }

    #[inline]
--- a/crates/bevy_gizmos/src/pipeline_2d.rs
+++ b/crates/bevy_gizmos/src/pipeline_2d.rs
@ -338,7 +338,7 @@ fn queue_line_gizmos_2d(
                    pipeline,
                    sort_key: FloatOrd(f32::INFINITY),
                    batch_range: 0..1,
-                    extra_index: PhaseItemExtraIndex::NONE,
+                    extra_index: PhaseItemExtraIndex::None,
                });
            }

@ -358,7 +358,7 @@ fn queue_line_gizmos_2d(
                    pipeline,
                    sort_key: FloatOrd(f32::INFINITY),
                    batch_range: 0..1,
-                    extra_index: PhaseItemExtraIndex::NONE,
+                    extra_index: PhaseItemExtraIndex::None,
                });
            }
        }
@ -417,7 +417,7 @@ fn queue_line_joint_gizmos_2d(
                pipeline,
                sort_key: FloatOrd(f32::INFINITY),
                batch_range: 0..1,
-                extra_index: PhaseItemExtraIndex::NONE,
+                extra_index: PhaseItemExtraIndex::None,
            });
        }
    }
--- a/crates/bevy_gizmos/src/pipeline_3d.rs
+++ b/crates/bevy_gizmos/src/pipeline_3d.rs
@ -369,7 +369,7 @@ fn queue_line_gizmos_3d(
                    pipeline,
                    distance: 0.,
                    batch_range: 0..1,
-                    extra_index: PhaseItemExtraIndex::NONE,
+                    extra_index: PhaseItemExtraIndex::None,
                });
            }

@ -390,7 +390,7 @@ fn queue_line_gizmos_3d(
                    pipeline,
                    distance: 0.,
                    batch_range: 0..1,
-                    extra_index: PhaseItemExtraIndex::NONE,
+                    extra_index: PhaseItemExtraIndex::None,
                });
            }
        }
@ -486,7 +486,7 @@ fn queue_line_joint_gizmos_3d(
                pipeline,
                distance: 0.,
                batch_range: 0..1,
-                extra_index: PhaseItemExtraIndex::NONE,
+                extra_index: PhaseItemExtraIndex::None,
            });
        }
    }
--- a/crates/bevy_pbr/src/material.rs
+++ b/crates/bevy_pbr/src/material.rs
@ -9,8 +9,8 @@ use crate::*;
 use bevy_asset::{Asset, AssetId, AssetServer};
 use bevy_core_pipeline::{
    core_3d::{
-        AlphaMask3d, Camera3d, Opaque3d, Opaque3dBinKey, ScreenSpaceTransmissionQuality,
-        Transmissive3d, Transparent3d,
+        AlphaMask3d, Camera3d, Opaque3d, Opaque3dBatchSetKey, Opaque3dBinKey,
+        ScreenSpaceTransmissionQuality, Transmissive3d, Transparent3d,
    },
    oit::OrderIndependentTransparencySettings,
    prepass::{
@ -28,7 +28,6 @@ use bevy_ecs::{
 };
 use bevy_reflect::std_traits::ReflectDefault;
 use bevy_reflect::Reflect;
-use bevy_render::view::RenderVisibleEntities;
 use bevy_render::{
    camera::TemporalJitter,
    extract_resource::ExtractResource,
@ -40,7 +39,8 @@ use bevy_render::{
    view::{ExtractedView, Msaa, RenderVisibilityRanges, ViewVisibility},
    Extract,
 };
-use bevy_render::{sync_world::MainEntityHashMap, texture::FallbackImage};
+use bevy_render::{mesh::allocator::MeshAllocator, sync_world::MainEntityHashMap};
+use bevy_render::{texture::FallbackImage, view::RenderVisibleEntities};
 use bevy_utils::{hashbrown::hash_map::Entry, tracing::error};
 use core::{hash::Hash, marker::PhantomData};

@ -634,7 +634,10 @@ pub fn queue_material_meshes<M: Material>(
    render_material_instances: Res<RenderMaterialInstances<M>>,
    render_lightmaps: Res<RenderLightmaps>,
    render_visibility_ranges: Res<RenderVisibilityRanges>,
-    material_bind_group_allocator: Res<MaterialBindGroupAllocator<M>>,
+    (mesh_allocator, material_bind_group_allocator): (
+        Res<MeshAllocator>,
+        Res<MaterialBindGroupAllocator<M>>,
+    ),
    mut opaque_render_phases: ResMut<ViewBinnedRenderPhases<Opaque3d>>,
    mut alpha_mask_render_phases: ResMut<ViewBinnedRenderPhases<AlphaMask3d>>,
    mut transmissive_render_phases: ResMut<ViewSortedRenderPhases<Transmissive3d>>,
@ -865,15 +868,21 @@ pub fn queue_material_meshes<M: Material>(
                            pipeline: pipeline_id,
                            distance,
                            batch_range: 0..1,
-                            extra_index: PhaseItemExtraIndex::NONE,
+                            extra_index: PhaseItemExtraIndex::None,
                        });
                    } else if material.properties.render_method == OpaqueRendererMethod::Forward {
+                        let (vertex_slab, index_slab) =
+                            mesh_allocator.mesh_slabs(&mesh_instance.mesh_asset_id);
                        let bin_key = Opaque3dBinKey {
-                            draw_function: draw_opaque_pbr,
-                            pipeline: pipeline_id,
+                            batch_set_key: Opaque3dBatchSetKey {
+                                draw_function: draw_opaque_pbr,
+                                pipeline: pipeline_id,
+                                material_bind_group_index: Some(material.binding.group.0),
+                                vertex_slab: vertex_slab.unwrap_or_default(),
+                                index_slab,
+                                lightmap_image,
+                            },
                            asset_id: mesh_instance.mesh_asset_id.into(),
-                            material_bind_group_index: Some(material.binding.group.0),
-                            lightmap_image,
                        };
                        opaque_phase.add(
                            bin_key,
@ -893,7 +902,7 @@ pub fn queue_material_meshes<M: Material>(
                            pipeline: pipeline_id,
                            distance,
                            batch_range: 0..1,
-                            extra_index: PhaseItemExtraIndex::NONE,
+                            extra_index: PhaseItemExtraIndex::None,
                        });
                    } else if material.properties.render_method == OpaqueRendererMethod::Forward {
                        let bin_key = OpaqueNoLightmap3dBinKey {
@ -918,7 +927,7 @@ pub fn queue_material_meshes<M: Material>(
                        pipeline: pipeline_id,
                        distance,
                        batch_range: 0..1,
-                        extra_index: PhaseItemExtraIndex::NONE,
+                        extra_index: PhaseItemExtraIndex::None,
                    });
                }
            }
--- a/crates/bevy_pbr/src/render/gpu_preprocess.rs
+++ b/crates/bevy_pbr/src/render/gpu_preprocess.rs
@ -137,9 +137,7 @@ impl Plugin for GpuMeshPreprocessPlugin {
        // This plugin does nothing if GPU instance buffer building isn't in
        // use.
        let gpu_preprocessing_support = render_app.world().resource::<GpuPreprocessingSupport>();
-        if !self.use_gpu_instance_buffer_builder
-            || *gpu_preprocessing_support == GpuPreprocessingSupport::None
-        {
+        if !self.use_gpu_instance_buffer_builder || !gpu_preprocessing_support.is_available() {
            return;
        }

--- a/crates/bevy_pbr/src/render/light.rs
+++ b/crates/bevy_pbr/src/render/light.rs
@ -11,8 +11,11 @@ use bevy_ecs::{
    system::lifetimeless::Read,
 };
 use bevy_math::{ops, Mat4, UVec4, Vec2, Vec3, Vec3Swizzles, Vec4, Vec4Swizzles};
-use bevy_render::camera::SortedCameras;
-use bevy_render::sync_world::{MainEntity, RenderEntity, TemporaryRenderEntity};
+use bevy_render::{
+    batching::gpu_preprocessing::{GpuPreprocessingMode, GpuPreprocessingSupport},
+    camera::SortedCameras,
+    mesh::allocator::MeshAllocator,
+};
 use bevy_render::{
    diagnostic::RecordDiagnostics,
    mesh::RenderMesh,
@ -26,6 +29,10 @@ use bevy_render::{
    view::{ExtractedView, RenderLayers, ViewVisibility},
    Extract,
 };
+use bevy_render::{
+    mesh::allocator::SlabId,
+    sync_world::{MainEntity, RenderEntity, TemporaryRenderEntity},
+};
 use bevy_transform::{components::GlobalTransform, prelude::Transform};
 #[cfg(feature = "trace")]
 use bevy_utils::tracing::info_span;
@ -673,8 +680,7 @@ pub(crate) fn spot_light_clip_from_view(angle: f32, near_z: f32) -> Mat4 {
 pub fn prepare_lights(
    mut commands: Commands,
    mut texture_cache: ResMut<TextureCache>,
-    render_device: Res<RenderDevice>,
-    render_queue: Res<RenderQueue>,
+    (render_device, render_queue): (Res<RenderDevice>, Res<RenderQueue>),
    mut global_light_meta: ResMut<GlobalClusterableObjectMeta>,
    mut light_meta: ResMut<LightMeta>,
    views: Query<
@ -703,6 +709,7 @@ pub fn prepare_lights(
    directional_lights: Query<(Entity, &ExtractedDirectionalLight)>,
    mut light_view_entities: Query<&mut LightViewEntities>,
    sorted_cameras: Res<SortedCameras>,
+    gpu_preprocessing_support: Res<GpuPreprocessingSupport>,
 ) {
    let views_iter = views.iter();
    let views_count = views_iter.len();
@ -1229,7 +1236,11 @@ pub fn prepare_lights(

                if first {
                    // Subsequent views with the same light entity will reuse the same shadow map
-                    shadow_render_phases.insert_or_clear(view_light_entity);
+                    // TODO: Implement GPU culling for shadow passes.
+                    shadow_render_phases.insert_or_clear(
+                        view_light_entity,
+                        gpu_preprocessing_support.min(GpuPreprocessingMode::PreprocessingOnly),
+                    );
                    live_shadow_mapping_lights.insert(view_light_entity);
                }
            }
@ -1317,7 +1328,10 @@ pub fn prepare_lights(

            if first {
                // Subsequent views with the same light entity will reuse the same shadow map
-                shadow_render_phases.insert_or_clear(view_light_entity);
+                shadow_render_phases.insert_or_clear(
+                    view_light_entity,
+                    gpu_preprocessing_support.min(GpuPreprocessingMode::PreprocessingOnly),
+                );
                live_shadow_mapping_lights.insert(view_light_entity);
            }
        }
@ -1447,7 +1461,11 @@ pub fn prepare_lights(

                // Subsequent views with the same light entity will **NOT** reuse the same shadow map
                // (Because the cascades are unique to each view)
-                shadow_render_phases.insert_or_clear(view_light_entity);
+                // TODO: Implement GPU culling for shadow passes.
+                shadow_render_phases.insert_or_clear(
+                    view_light_entity,
+                    gpu_preprocessing_support.min(GpuPreprocessingMode::PreprocessingOnly),
+                );
                live_shadow_mapping_lights.insert(view_light_entity);
            }
        }
@ -1498,15 +1516,20 @@ fn despawn_entities(commands: &mut Commands, entities: Vec<Entity>) {
 pub fn queue_shadows<M: Material>(
    shadow_draw_functions: Res<DrawFunctions<Shadow>>,
    prepass_pipeline: Res<PrepassPipeline<M>>,
-    render_meshes: Res<RenderAssets<RenderMesh>>,
-    render_mesh_instances: Res<RenderMeshInstances>,
-    render_materials: Res<RenderAssets<PreparedMaterial<M>>>,
-    render_material_instances: Res<RenderMaterialInstances<M>>,
+    (render_meshes, render_mesh_instances): (
+        Res<RenderAssets<RenderMesh>>,
+        Res<RenderMeshInstances>,
+    ),
+    (render_materials, render_material_instances): (
+        Res<RenderAssets<PreparedMaterial<M>>>,
+        Res<RenderMaterialInstances<M>>,
+    ),
    material_bind_group_allocator: Res<MaterialBindGroupAllocator<M>>,
    mut shadow_render_phases: ResMut<ViewBinnedRenderPhases<Shadow>>,
    mut pipelines: ResMut<SpecializedMeshPipelines<PrepassPipeline<M>>>,
    pipeline_cache: Res<PipelineCache>,
    render_lightmaps: Res<RenderLightmaps>,
+    mesh_allocator: Res<MeshAllocator>,
    view_lights: Query<(Entity, &ViewLightEntities)>,
    view_light_entities: Query<&LightEntity>,
    point_light_entities: Query<&RenderCubemapVisibleEntities, With<ExtractedPointLight>>,
@ -1624,10 +1647,17 @@ pub fn queue_shadows<M: Material>(
                    }
                };

+                let (vertex_slab, index_slab) =
+                    mesh_allocator.mesh_slabs(&mesh_instance.mesh_asset_id);
+
                shadow_phase.add(
                    ShadowBinKey {
-                        draw_function: draw_shadow_mesh,
-                        pipeline: pipeline_id,
+                        batch_set_key: ShadowBatchSetKey {
+                            pipeline: pipeline_id,
+                            draw_function: draw_shadow_mesh,
+                            vertex_slab: vertex_slab.unwrap_or_default(),
+                            index_slab,
+                        },
                        asset_id: mesh_instance.mesh_asset_id.into(),
                    },
                    (entity, main_entity),
@ -1645,19 +1675,52 @@ pub struct Shadow {
    pub extra_index: PhaseItemExtraIndex,
 }

-/// Data used to bin each object in the shadow map phase.
+/// Information that must be identical in order to place shadow-casting meshes
+/// in the same *batch set*.
+///
+/// A batch set is a set of batches that can be multi-drawn together, if
+/// multi-draw is in use.
 #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub struct ShadowBinKey {
+pub struct ShadowBatchSetKey {
    /// The identifier of the render pipeline.
    pub pipeline: CachedRenderPipelineId,

    /// The function used to draw.
    pub draw_function: DrawFunctionId,

+    /// The ID of the slab of GPU memory that contains vertex data.
+    ///
+    /// For non-mesh items, you can fill this with 0 if your items can be
+    /// multi-drawn, or with a unique value if they can't.
+    pub vertex_slab: SlabId,
+
+    /// The ID of the slab of GPU memory that contains index data, if present.
+    ///
+    /// For non-mesh items, you can safely fill this with `None`.
+    pub index_slab: Option<SlabId>,
+}
+
+/// Data used to bin each object in the shadow map phase.
+#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct ShadowBinKey {
+    /// The key of the *batch set*.
+    ///
+    /// Because every batch belongs to a batch set, all meshes in a batch must
+    /// share the same batch set key.
+    pub batch_set_key: ShadowBatchSetKey,
+
    /// The object.
    pub asset_id: UntypedAssetId,
 }

+impl PhaseItemBinKey for ShadowBinKey {
+    type BatchSetKey = ShadowBatchSetKey;
+
+    fn get_batch_set_key(&self) -> Option<Self::BatchSetKey> {
+        Some(self.batch_set_key.clone())
+    }
+}
+
 impl PhaseItem for Shadow {
    #[inline]
    fn entity(&self) -> Entity {
@ -1670,7 +1733,7 @@ impl PhaseItem for Shadow {

    #[inline]
    fn draw_function(&self) -> DrawFunctionId {
-        self.key.draw_function
+        self.key.batch_set_key.draw_function
    }

    #[inline]
@ -1685,7 +1748,7 @@ impl PhaseItem for Shadow {

    #[inline]
    fn extra_index(&self) -> PhaseItemExtraIndex {
-        self.extra_index
+        self.extra_index.clone()
    }

    #[inline]
@ -1716,7 +1779,7 @@ impl BinnedPhaseItem for Shadow {
 impl CachedRenderPipelinePhaseItem for Shadow {
    #[inline]
    fn cached_pipeline(&self) -> CachedRenderPipelineId {
-        self.key.pipeline
+        self.key.batch_set_key.pipeline
    }
 }

--- a/crates/bevy_pbr/src/render/mesh.rs
+++ b/crates/bevy_pbr/src/render/mesh.rs
@ -30,8 +30,8 @@ use bevy_render::{
    primitives::Aabb,
    render_asset::{ExtractAssetsSet, RenderAssets},
    render_phase::{
-        BinnedRenderPhasePlugin, PhaseItem, RenderCommand, RenderCommandResult,
-        SortedRenderPhasePlugin, TrackedRenderPass,
+        BinnedRenderPhasePlugin, PhaseItem, PhaseItemExtraIndex, RenderCommand,
+        RenderCommandResult, SortedRenderPhasePlugin, TrackedRenderPass,
    },
    render_resource::*,
    renderer::{RenderDevice, RenderQueue},
@ -193,8 +193,8 @@ impl Plugin for MeshRenderPlugin {

            let gpu_preprocessing_support =
                render_app.world().resource::<GpuPreprocessingSupport>();
-            let use_gpu_instance_buffer_builder = self.use_gpu_instance_buffer_builder
-                && *gpu_preprocessing_support != GpuPreprocessingSupport::None;
+            let use_gpu_instance_buffer_builder =
+                self.use_gpu_instance_buffer_builder && gpu_preprocessing_support.is_available();

            let render_mesh_instances = RenderMeshInstances::new(use_gpu_instance_buffer_builder);
            render_app.insert_resource(render_mesh_instances);
@ -2608,8 +2608,8 @@ impl<P: PhaseItem, const I: usize> RenderCommand<P> for SetMeshBindGroup<I> {

        let mut dynamic_offsets: [u32; 3] = Default::default();
        let mut offset_count = 0;
-        if let Some(dynamic_offset) = item.extra_index().as_dynamic_offset() {
-            dynamic_offsets[offset_count] = dynamic_offset.get();
+        if let PhaseItemExtraIndex::DynamicOffset(dynamic_offset) = item.extra_index() {
+            dynamic_offsets[offset_count] = dynamic_offset;
            offset_count += 1;
        }
        if let Some(current_skin_index) = current_skin_index {
@ -2706,25 +2706,32 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
        };

        // Calculate the indirect offset, and look up the buffer.
-        let indirect_parameters = match item.extra_index().as_indirect_parameters_index() {
-            None => None,
-            Some(index) => match indirect_parameters_buffer.buffer() {
-                None => {
-                    warn!("Not rendering mesh because indirect parameters buffer wasn't present");
-                    return RenderCommandResult::Skip;
+        let indirect_parameters = match item.extra_index() {
+            PhaseItemExtraIndex::None | PhaseItemExtraIndex::DynamicOffset(_) => None,
+            PhaseItemExtraIndex::IndirectParametersIndex(indices) => {
+                match indirect_parameters_buffer.buffer() {
+                    None => {
+                        warn!(
+                            "Not rendering mesh because indirect parameters buffer wasn't present"
+                        );
+                        return RenderCommandResult::Skip;
+                    }
+                    Some(buffer) => Some((
+                        indices.start as u64 * size_of::<IndirectParameters>() as u64,
+                        indices.end - indices.start,
+                        buffer,
+                    )),
                }
-                Some(buffer) => Some((
-                    index as u64 * size_of::<IndirectParameters>() as u64,
-                    buffer,
-                )),
-            },
+            }
        };

        pass.set_vertex_buffer(0, vertex_buffer_slice.buffer.slice(..));

        let batch_range = item.batch_range();

-        // Draw either directly or indirectly, as appropriate.
+        // Draw either directly or indirectly, as appropriate. If we're in
+        // indirect mode, we can additionally multi-draw. (We can't multi-draw
+        // in direct mode because `wgpu` doesn't expose that functionality.)
        match &gpu_mesh.buffer_info {
            RenderMeshBufferInfo::Indexed {
                index_format,
@ -2746,19 +2753,33 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
                            batch_range.clone(),
                        );
                    }
-                    Some((indirect_parameters_offset, indirect_parameters_buffer)) => pass
-                        .draw_indexed_indirect(
+                    Some((
+                        indirect_parameters_offset,
+                        indirect_parameters_count,
+                        indirect_parameters_buffer,
+                    )) => {
+                        pass.multi_draw_indexed_indirect(
                            indirect_parameters_buffer,
                            indirect_parameters_offset,
-                        ),
+                            indirect_parameters_count,
+                        );
+                    }
                }
            }
            RenderMeshBufferInfo::NonIndexed => match indirect_parameters {
                None => {
                    pass.draw(vertex_buffer_slice.range, batch_range.clone());
                }
-                Some((indirect_parameters_offset, indirect_parameters_buffer)) => {
-                    pass.draw_indirect(indirect_parameters_buffer, indirect_parameters_offset);
+                Some((
+                    indirect_parameters_offset,
+                    indirect_parameters_count,
+                    indirect_parameters_buffer,
+                )) => {
+                    pass.multi_draw_indirect(
+                        indirect_parameters_buffer,
+                        indirect_parameters_offset,
+                        indirect_parameters_count,
+                    );
                }
            },
        }
--- a/crates/bevy_render/src/batching/gpu_preprocessing.rs
+++ b/crates/bevy_render/src/batching/gpu_preprocessing.rs
@ -10,16 +10,17 @@ use bevy_ecs::{
    world::{FromWorld, World},
 };
 use bevy_encase_derive::ShaderType;
+use bevy_utils::tracing::error;
 use bytemuck::{Pod, Zeroable};
 use nonmax::NonMaxU32;
-use smallvec::smallvec;
 use wgpu::{BindingResource, BufferUsages, DownlevelFlags, Features};

 use crate::{
    render_phase::{
-        BinnedPhaseItem, BinnedRenderPhaseBatch, CachedRenderPipelinePhaseItem,
-        PhaseItemExtraIndex, SortedPhaseItem, SortedRenderPhase, UnbatchableBinnedEntityIndices,
-        ViewBinnedRenderPhases, ViewSortedRenderPhases,
+        BinnedPhaseItem, BinnedRenderPhaseBatch, BinnedRenderPhaseBatchSets,
+        CachedRenderPipelinePhaseItem, PhaseItemBinKey as _, PhaseItemExtraIndex, SortedPhaseItem,
+        SortedRenderPhase, UnbatchableBinnedEntityIndices, ViewBinnedRenderPhases,
+        ViewSortedRenderPhases,
    },
    render_resource::{BufferVec, GpuArrayBufferable, RawBufferVec, UninitBufferVec},
    renderer::{RenderAdapter, RenderDevice, RenderQueue},
@ -64,12 +65,49 @@ impl Plugin for BatchingPlugin {
 ///
 /// [a `wgpu` limitation]: https://github.com/gfx-rs/wgpu/issues/2471
 #[derive(Clone, Copy, PartialEq, Resource)]
-pub enum GpuPreprocessingSupport {
-    /// No GPU preprocessing support is available at all.
+pub struct GpuPreprocessingSupport {
+    /// The maximum amount of GPU preprocessing available on this platform.
+    pub max_supported_mode: GpuPreprocessingMode,
+}
+
+impl GpuPreprocessingSupport {
+    /// Returns true if this GPU preprocessing support level isn't `None`.
+    #[inline]
+    pub fn is_available(&self) -> bool {
+        self.max_supported_mode != GpuPreprocessingMode::None
+    }
+
+    /// Returns the given GPU preprocessing mode, capped to the maximum mode
+    /// that this platform supports.
+    pub fn min(&self, mode: GpuPreprocessingMode) -> GpuPreprocessingMode {
+        match (self.max_supported_mode, mode) {
+            (GpuPreprocessingMode::None, _) | (_, GpuPreprocessingMode::None) => {
+                GpuPreprocessingMode::None
+            }
+            (mode, GpuPreprocessingMode::Culling) | (GpuPreprocessingMode::Culling, mode) => mode,
+            (GpuPreprocessingMode::PreprocessingOnly, GpuPreprocessingMode::PreprocessingOnly) => {
+                GpuPreprocessingMode::PreprocessingOnly
+            }
+        }
+    }
+}
+
+/// The amount of GPU preprocessing (compute and indirect draw) that we do.
+#[derive(Clone, Copy, PartialEq)]
+pub enum GpuPreprocessingMode {
+    /// No GPU preprocessing is in use at all.
+    ///
+    /// This is used when GPU compute isn't available.
    None,
-    /// GPU preprocessing is available, but GPU culling isn't.
+
+    /// GPU preprocessing is in use, but GPU culling isn't.
+    ///
+    /// This is used by default.
    PreprocessingOnly,
-    /// Both GPU preprocessing and GPU culling are available.
+
+    /// Both GPU preprocessing and GPU culling are in use.
+    ///
+    /// This is used when the [`GpuCulling`] component is present on the camera.
    Culling,
 }

@ -301,19 +339,21 @@ impl FromWorld for GpuPreprocessingSupport {
            }
        }

-        if device.limits().max_compute_workgroup_size_x == 0 || is_non_supported_android_device(adapter)
+        let max_supported_mode = if device.limits().max_compute_workgroup_size_x == 0 || is_non_supported_android_device(adapter)
        {
-            GpuPreprocessingSupport::None
+            GpuPreprocessingMode::None
        } else if !device
            .features()
            .contains(Features::INDIRECT_FIRST_INSTANCE) ||
            !adapter.get_downlevel_capabilities().flags.contains(
        DownlevelFlags::VERTEX_AND_INSTANCE_INDEX_RESPECTS_RESPECTIVE_FIRST_VALUE_IN_INDIRECT_DRAW)
        {
-            GpuPreprocessingSupport::PreprocessingOnly
+            GpuPreprocessingMode::PreprocessingOnly
        } else {
-            GpuPreprocessingSupport::Culling
-        }
+            GpuPreprocessingMode::Culling
+        };
+
+        GpuPreprocessingSupport { max_supported_mode }
    }
 }

@ -600,6 +640,13 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(

        // Prepare batchables.

+        // If multi-draw is in use, as we step through the list of batchables,
+        // we gather adjacent batches that have the same *batch set* key into
+        // batch sets. This variable stores the last batch set key that we've
+        // seen. If our current batch set key is identical to this one, we can
+        // merge the current batch into the last batch set.
+        let mut last_multidraw_key = None;
+
        for key in &phase.batchable_mesh_keys {
            let mut batch: Option<BinnedRenderPhaseBatch> = None;
            for &(entity, main_entity) in &phase.batchable_mesh_values[key] {
@ -615,10 +662,13 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
                        batch.instance_range.end = output_index + 1;
                        work_item_buffer.buffer.push(PreprocessWorkItem {
                            input_index: input_index.into(),
-                            output_index: batch
-                                .extra_index
-                                .as_indirect_parameters_index()
-                                .unwrap_or(output_index),
+                            output_index: match batch.extra_index {
+                                PhaseItemExtraIndex::IndirectParametersIndex(ref range) => {
+                                    range.start
+                                }
+                                PhaseItemExtraIndex::DynamicOffset(_)
+                                | PhaseItemExtraIndex::None => output_index,
+                            },
                        });
                    }

@ -650,14 +700,33 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
                        batch = Some(BinnedRenderPhaseBatch {
                            representative_entity: (entity, main_entity),
                            instance_range: output_index..output_index + 1,
-                            extra_index: PhaseItemExtraIndex::NONE,
+                            extra_index: PhaseItemExtraIndex::None,
                        });
                    }
                }
            }

            if let Some(batch) = batch {
-                phase.batch_sets.push(smallvec![batch]);
+                match phase.batch_sets {
+                    BinnedRenderPhaseBatchSets::DynamicUniforms(_) => {
+                        error!("Dynamic uniform batch sets shouldn't be used here");
+                    }
+                    BinnedRenderPhaseBatchSets::Direct(ref mut vec) => {
+                        vec.push(batch);
+                    }
+                    BinnedRenderPhaseBatchSets::MultidrawIndirect(ref mut batch_sets) => {
+                        // We're in multi-draw mode. Check to see whether our
+                        // batch set key is the same as the last one. If so,
+                        // merge this batch into the preceding batch set.
+                        let this_multidraw_key = key.get_batch_set_key();
+                        if last_multidraw_key.as_ref() == Some(&this_multidraw_key) {
+                            batch_sets.last_mut().unwrap().push(batch);
+                        } else {
+                            last_multidraw_key = Some(this_multidraw_key);
+                            batch_sets.push(vec![batch]);
+                        }
+                    }
+                }
            }
        }

@ -688,8 +757,9 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
                        .buffer_indices
                        .add(UnbatchableBinnedEntityIndices {
                            instance_index: indirect_parameters_index.into(),
-                            extra_index: PhaseItemExtraIndex::indirect_parameters_index(
-                                indirect_parameters_index.into(),
+                            extra_index: PhaseItemExtraIndex::IndirectParametersIndex(
+                                u32::from(indirect_parameters_index)
+                                    ..(u32::from(indirect_parameters_index) + 1),
                            ),
                        });
                } else {
@ -701,7 +771,7 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
                        .buffer_indices
                        .add(UnbatchableBinnedEntityIndices {
                            instance_index: output_index,
-                            extra_index: PhaseItemExtraIndex::NONE,
+                            extra_index: PhaseItemExtraIndex::None,
                        });
                }
            }
--- a/crates/bevy_render/src/batching/mod.rs
+++ b/crates/bevy_render/src/batching/mod.rs
@ -7,7 +7,7 @@ use bytemuck::Pod;
 use nonmax::NonMaxU32;

 use self::gpu_preprocessing::IndirectParametersBuffer;
-use crate::sync_world::MainEntity;
+use crate::{render_phase::PhaseItemExtraIndex, sync_world::MainEntity};
 use crate::{
    render_phase::{
        BinnedPhaseItem, CachedRenderPipelinePhaseItem, DrawFunctionId, SortedPhaseItem,
@ -54,7 +54,12 @@ impl<T: PartialEq> BatchMeta<T> {
        BatchMeta {
            pipeline_id: item.cached_pipeline(),
            draw_function_id: item.draw_function(),
-            dynamic_offset: item.extra_index().as_dynamic_offset(),
+            dynamic_offset: match item.extra_index() {
+                PhaseItemExtraIndex::DynamicOffset(dynamic_offset) => {
+                    NonMaxU32::new(dynamic_offset)
+                }
+                PhaseItemExtraIndex::None | PhaseItemExtraIndex::IndirectParametersIndex(_) => None,
+            },
            user_data,
        }
    }
--- a/crates/bevy_render/src/batching/no_gpu_preprocessing.rs
+++ b/crates/bevy_render/src/batching/no_gpu_preprocessing.rs
@ -2,13 +2,15 @@

 use bevy_derive::{Deref, DerefMut};
 use bevy_ecs::system::{Res, ResMut, Resource, StaticSystemParam};
+use bevy_utils::tracing::error;
 use smallvec::{smallvec, SmallVec};
 use wgpu::BindingResource;

 use crate::{
    render_phase::{
-        BinnedPhaseItem, BinnedRenderPhaseBatch, CachedRenderPipelinePhaseItem,
-        PhaseItemExtraIndex, SortedPhaseItem, ViewBinnedRenderPhases, ViewSortedRenderPhases,
+        BinnedPhaseItem, BinnedRenderPhaseBatch, BinnedRenderPhaseBatchSets,
+        CachedRenderPipelinePhaseItem, PhaseItemExtraIndex, SortedPhaseItem,
+        ViewBinnedRenderPhases, ViewSortedRenderPhases,
    },
    render_resource::{GpuArrayBuffer, GpuArrayBufferable},
    renderer::{RenderDevice, RenderQueue},
@ -138,7 +140,17 @@ pub fn batch_and_prepare_binned_render_phase<BPI, GFBD>(
                }
            }

-            phase.batch_sets.push(batch_set);
+            match phase.batch_sets {
+                BinnedRenderPhaseBatchSets::DynamicUniforms(ref mut batch_sets) => {
+                    batch_sets.push(batch_set);
+                }
+                BinnedRenderPhaseBatchSets::Direct(_)
+                | BinnedRenderPhaseBatchSets::MultidrawIndirect(_) => {
+                    error!(
+                        "Dynamic uniform batch sets should be used when GPU preprocessing is off"
+                    );
+                }
+            }
        }

        // Prepare unbatchables.
--- a/crates/bevy_render/src/camera/camera.rs
+++ b/crates/bevy_render/src/camera/camera.rs
@ -1,13 +1,12 @@
 use super::{ClearColorConfig, Projection};
 use crate::{
-    batching::gpu_preprocessing::GpuPreprocessingSupport,
+    batching::gpu_preprocessing::{GpuPreprocessingMode, GpuPreprocessingSupport},
    camera::{CameraProjection, ManualTextureViewHandle, ManualTextureViews},
    primitives::Frustum,
    render_asset::RenderAssets,
    render_graph::{InternedRenderSubGraph, RenderSubGraph},
    render_resource::TextureView,
-    sync_world::TemporaryRenderEntity,
-    sync_world::{RenderEntity, SyncToRenderWorld},
+    sync_world::{RenderEntity, SyncToRenderWorld, TemporaryRenderEntity},
    texture::GpuImage,
    view::{
        ColorGrading, ExtractedView, ExtractedWindows, GpuCulling, Msaa, RenderLayers,
@ -1156,8 +1155,9 @@ pub fn extract_cameras(
            if let Some(perspective) = projection {
                commands.insert(perspective.clone());
            }
+
            if gpu_culling {
-                if *gpu_preprocessing_support == GpuPreprocessingSupport::Culling {
+                if gpu_preprocessing_support.max_supported_mode == GpuPreprocessingMode::Culling {
                    commands.insert(GpuCulling);
                } else {
                    warn_once!(
--- a/crates/bevy_render/src/mesh/allocator.rs
+++ b/crates/bevy_render/src/mesh/allocator.rs
@ -5,6 +5,7 @@ use core::{
    fmt::{self, Display, Formatter},
    ops::Range,
 };
+use nonmax::NonMaxU32;

 use bevy_app::{App, Plugin};
 use bevy_asset::AssetId;
@ -15,6 +16,7 @@ use bevy_ecs::{
    world::{FromWorld, World},
 };
 use bevy_utils::{
+    default,
    hashbrown::{HashMap, HashSet},
    tracing::error,
 };
@ -152,9 +154,9 @@ pub struct MeshBufferSlice<'a> {
 }

 /// The index of a single slab.
-#[derive(Clone, Copy, Default, PartialEq, Eq, Hash, Debug)]
+#[derive(Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)]
 #[repr(transparent)]
-struct SlabId(u32);
+pub struct SlabId(pub NonMaxU32);

 /// Data for a single slab.
 #[allow(clippy::large_enum_variant)]
@ -331,7 +333,7 @@ impl FromWorld for MeshAllocator {
            slab_layouts: HashMap::new(),
            mesh_id_to_vertex_slab: HashMap::new(),
            mesh_id_to_index_slab: HashMap::new(),
-            next_slab_id: SlabId(0),
+            next_slab_id: default(),
            general_vertex_slabs_supported,
        }
    }
@ -377,6 +379,19 @@ impl MeshAllocator {
        self.mesh_slice_in_slab(mesh_id, *self.mesh_id_to_index_slab.get(mesh_id)?)
    }

+    /// Returns the IDs of the slabs that hold the vertex data and index data
+    /// respectively for the mesh with the given ID.
+    ///
+    /// If the mesh wasn't allocated, or has no index data in the case of the
+    /// index slab, the corresponding element in the returned tuple will be
+    /// `None`.
+    pub fn mesh_slabs(&self, mesh_id: &AssetId<Mesh>) -> (Option<SlabId>, Option<SlabId>) {
+        (
+            self.mesh_id_to_vertex_slab.get(mesh_id).cloned(),
+            self.mesh_id_to_index_slab.get(mesh_id).cloned(),
+        )
+    }
+
    /// Given a slab and a mesh with data located with it, returns the buffer
    /// and range of that mesh data within the slab.
    fn mesh_slice_in_slab(
@ -713,7 +728,7 @@ impl MeshAllocator {
        // If we still have no allocation, make a new slab.
        if mesh_allocation.is_none() {
            let new_slab_id = self.next_slab_id;
-            self.next_slab_id.0 += 1;
+            self.next_slab_id.0 = NonMaxU32::new(self.next_slab_id.0.get() + 1).unwrap_or_default();

            let new_slab = GeneralSlab::new(
                new_slab_id,
@ -747,7 +762,7 @@ impl MeshAllocator {
    /// Allocates an object into its own dedicated slab.
    fn allocate_large(&mut self, mesh_id: &AssetId<Mesh>, layout: ElementLayout) {
        let new_slab_id = self.next_slab_id;
-        self.next_slab_id.0 += 1;
+        self.next_slab_id.0 = NonMaxU32::new(self.next_slab_id.0.get() + 1).unwrap_or_default();

        self.record_allocation(mesh_id, new_slab_id, layout.class);

--- a/crates/bevy_render/src/render_phase/mod.rs
+++ b/crates/bevy_render/src/render_phase/mod.rs
@ -37,6 +37,7 @@ use encase::{internal::WriteInto, ShaderSize};
 use nonmax::NonMaxU32;
 pub use rangefinder::*;

+use crate::batching::gpu_preprocessing::GpuPreprocessingMode;
 use crate::sync_world::MainEntity;
 use crate::{
    batching::{
@ -53,14 +54,7 @@ use bevy_ecs::{
    prelude::*,
    system::{lifetimeless::SRes, SystemParamItem},
 };
-use core::{
-    fmt::{self, Debug, Formatter},
-    hash::Hash,
-    iter,
-    marker::PhantomData,
-    ops::Range,
-    slice::SliceIndex,
-};
+use core::{fmt::Debug, hash::Hash, iter, marker::PhantomData, ops::Range, slice::SliceIndex};
 use smallvec::SmallVec;

 /// Stores the rendering instructions for a single phase that uses bins in all
@ -133,7 +127,38 @@ where
    ///
    /// The unbatchable entities immediately follow the batches in the storage
    /// buffers.
-    pub(crate) batch_sets: Vec<SmallVec<[BinnedRenderPhaseBatch; 1]>>,
+    pub(crate) batch_sets: BinnedRenderPhaseBatchSets,
+}
+
+/// How we store and render the batch sets.
+///
+/// Each one of these corresponds to a [`GpuPreprocessingMode`].
+pub enum BinnedRenderPhaseBatchSets {
+    /// Batches are grouped into batch sets based on dynamic uniforms.
+    ///
+    /// This corresponds to [`GpuPreprocessingMode::None`].
+    DynamicUniforms(Vec<SmallVec<[BinnedRenderPhaseBatch; 1]>>),
+
+    /// Batches are never grouped into batch sets.
+    ///
+    /// This corresponds to [`GpuPreprocessingMode::PreprocessingOnly`].
+    Direct(Vec<BinnedRenderPhaseBatch>),
+
+    /// Batches are grouped together into batch sets based on their ability to
+    /// be multi-drawn together.
+    ///
+    /// This corresponds to [`GpuPreprocessingMode::Culling`].
+    MultidrawIndirect(Vec<Vec<BinnedRenderPhaseBatch>>),
+}
+
+impl BinnedRenderPhaseBatchSets {
+    fn clear(&mut self) {
+        match *self {
+            BinnedRenderPhaseBatchSets::DynamicUniforms(ref mut vec) => vec.clear(),
+            BinnedRenderPhaseBatchSets::Direct(ref mut vec) => vec.clear(),
+            BinnedRenderPhaseBatchSets::MultidrawIndirect(ref mut vec) => vec.clear(),
+        }
+    }
 }

 /// Information about a single batch of entities rendered using binned phase
@ -200,7 +225,7 @@ pub(crate) enum UnbatchableBinnedEntityIndexSet {
 /// The instance index and dynamic offset (if present) for an unbatchable entity.
 ///
 /// This is only useful on platforms that don't support storage buffers.
-#[derive(Clone, Copy)]
+#[derive(Clone)]
 pub(crate) struct UnbatchableBinnedEntityIndices {
    /// The instance index.
    pub(crate) instance_index: u32,
@ -257,11 +282,11 @@ impl<BPI> ViewBinnedRenderPhases<BPI>
 where
    BPI: BinnedPhaseItem,
 {
-    pub fn insert_or_clear(&mut self, entity: Entity) {
+    pub fn insert_or_clear(&mut self, entity: Entity, gpu_preprocessing: GpuPreprocessingMode) {
        match self.entry(entity) {
            Entry::Occupied(mut entry) => entry.get_mut().clear(),
            Entry::Vacant(entry) => {
-                entry.insert(default());
+                entry.insert(BinnedRenderPhase::<BPI>::new(gpu_preprocessing));
            }
        }
    }
@ -345,24 +370,87 @@ where
        let draw_functions = world.resource::<DrawFunctions<BPI>>();
        let mut draw_functions = draw_functions.write();

-        debug_assert_eq!(self.batchable_mesh_keys.len(), self.batch_sets.len());
+        match self.batch_sets {
+            BinnedRenderPhaseBatchSets::DynamicUniforms(ref batch_sets) => {
+                debug_assert_eq!(self.batchable_mesh_keys.len(), batch_sets.len());

-        for (key, batch_set) in self.batchable_mesh_keys.iter().zip(self.batch_sets.iter()) {
-            for batch in batch_set {
-                let binned_phase_item = BPI::new(
-                    key.clone(),
-                    batch.representative_entity,
-                    batch.instance_range.clone(),
-                    batch.extra_index,
-                );
+                for (key, batch_set) in self.batchable_mesh_keys.iter().zip(batch_sets.iter()) {
+                    for batch in batch_set {
+                        let binned_phase_item = BPI::new(
+                            key.clone(),
+                            batch.representative_entity,
+                            batch.instance_range.clone(),
+                            batch.extra_index.clone(),
+                        );

-                // Fetch the draw function.
-                let Some(draw_function) = draw_functions.get_mut(binned_phase_item.draw_function())
-                else {
-                    continue;
-                };
+                        // Fetch the draw function.
+                        let Some(draw_function) =
+                            draw_functions.get_mut(binned_phase_item.draw_function())
+                        else {
+                            continue;
+                        };

-                draw_function.draw(world, render_pass, view, &binned_phase_item)?;
+                        draw_function.draw(world, render_pass, view, &binned_phase_item)?;
+                    }
+                }
+            }
+
+            BinnedRenderPhaseBatchSets::Direct(ref batch_set) => {
+                for (batch, key) in batch_set.iter().zip(self.batchable_mesh_keys.iter()) {
+                    let binned_phase_item = BPI::new(
+                        key.clone(),
+                        batch.representative_entity,
+                        batch.instance_range.clone(),
+                        batch.extra_index.clone(),
+                    );
+
+                    // Fetch the draw function.
+                    let Some(draw_function) =
+                        draw_functions.get_mut(binned_phase_item.draw_function())
+                    else {
+                        continue;
+                    };
+
+                    draw_function.draw(world, render_pass, view, &binned_phase_item)?;
+                }
+            }
+
+            BinnedRenderPhaseBatchSets::MultidrawIndirect(ref batch_sets) => {
+                let mut batchable_mesh_key_index = 0;
+                for batch_set in batch_sets.iter() {
+                    let Some(batch) = batch_set.first() else {
+                        continue;
+                    };
+
+                    let key = &self.batchable_mesh_keys[batchable_mesh_key_index];
+                    batchable_mesh_key_index += batch_set.len();
+
+                    let binned_phase_item = BPI::new(
+                        key.clone(),
+                        batch.representative_entity,
+                        batch.instance_range.clone(),
+                        match batch.extra_index {
+                            PhaseItemExtraIndex::None => PhaseItemExtraIndex::None,
+                            PhaseItemExtraIndex::DynamicOffset(ref dynamic_offset) => {
+                                PhaseItemExtraIndex::DynamicOffset(*dynamic_offset)
+                            }
+                            PhaseItemExtraIndex::IndirectParametersIndex(ref range) => {
+                                PhaseItemExtraIndex::IndirectParametersIndex(
+                                    range.start..(range.start + batch_set.len() as u32),
+                                )
+                            }
+                        },
+                    );
+
+                    // Fetch the draw function.
+                    let Some(draw_function) =
+                        draw_functions.get_mut(binned_phase_item.draw_function())
+                    else {
+                        continue;
+                    };
+
+                    draw_function.draw(world, render_pass, view, &binned_phase_item)?;
+                }
            }
        }

@ -393,17 +481,20 @@ where
                    } => UnbatchableBinnedEntityIndices {
                        instance_index: instance_range.start + entity_index as u32,
                        extra_index: match first_indirect_parameters_index {
-                            None => PhaseItemExtraIndex::NONE,
+                            None => PhaseItemExtraIndex::None,
                            Some(first_indirect_parameters_index) => {
-                                PhaseItemExtraIndex::indirect_parameters_index(
+                                let first_indirect_parameters_index_for_entity =
                                    u32::from(*first_indirect_parameters_index)
-                                        + entity_index as u32,
+                                        + entity_index as u32;
+                                PhaseItemExtraIndex::IndirectParametersIndex(
+                                    first_indirect_parameters_index_for_entity
+                                        ..(first_indirect_parameters_index_for_entity + 1),
                                )
                            }
                        },
                    },
                    UnbatchableBinnedEntityIndexSet::Dense(ref dynamic_offsets) => {
-                        dynamic_offsets[entity_index]
+                        dynamic_offsets[entity_index].clone()
                    }
                };

@ -442,7 +533,7 @@ where
        for &(ref key, entity) in &self.non_mesh_items {
            // Come up with a fake batch range and extra index. The draw
            // function is expected to manage any sort of batching logic itself.
-            let binned_phase_item = BPI::new(key.clone(), entity, 0..1, PhaseItemExtraIndex(0));
+            let binned_phase_item = BPI::new(key.clone(), entity, 0..1, PhaseItemExtraIndex::None);

            let Some(draw_function) = draw_functions.get_mut(binned_phase_item.draw_function())
            else {
@ -471,18 +562,26 @@ where
    }
 }

-impl<BPI> Default for BinnedRenderPhase<BPI>
+impl<BPI> BinnedRenderPhase<BPI>
 where
    BPI: BinnedPhaseItem,
 {
-    fn default() -> Self {
+    fn new(gpu_preprocessing: GpuPreprocessingMode) -> Self {
        Self {
            batchable_mesh_keys: vec![],
            batchable_mesh_values: HashMap::default(),
            unbatchable_mesh_keys: vec![],
            unbatchable_mesh_values: HashMap::default(),
            non_mesh_items: vec![],
-            batch_sets: vec![],
+            batch_sets: match gpu_preprocessing {
+                GpuPreprocessingMode::Culling => {
+                    BinnedRenderPhaseBatchSets::MultidrawIndirect(vec![])
+                }
+                GpuPreprocessingMode::PreprocessingOnly => {
+                    BinnedRenderPhaseBatchSets::Direct(vec![])
+                }
+                GpuPreprocessingMode::None => BinnedRenderPhaseBatchSets::DynamicUniforms(vec![]),
+            },
        }
    }
 }
@ -505,19 +604,24 @@ impl UnbatchableBinnedEntityIndexSet {
                first_indirect_parameters_index: None,
            } => Some(UnbatchableBinnedEntityIndices {
                instance_index: instance_range.start + entity_index,
-                extra_index: PhaseItemExtraIndex::NONE,
+                extra_index: PhaseItemExtraIndex::None,
            }),
            UnbatchableBinnedEntityIndexSet::Sparse {
                instance_range,
                first_indirect_parameters_index: Some(first_indirect_parameters_index),
-            } => Some(UnbatchableBinnedEntityIndices {
-                instance_index: instance_range.start + entity_index,
-                extra_index: PhaseItemExtraIndex::indirect_parameters_index(
-                    u32::from(*first_indirect_parameters_index) + entity_index,
-                ),
-            }),
+            } => {
+                let first_indirect_parameters_index_for_this_batch =
+                    u32::from(*first_indirect_parameters_index) + entity_index;
+                Some(UnbatchableBinnedEntityIndices {
+                    instance_index: instance_range.start + entity_index,
+                    extra_index: PhaseItemExtraIndex::IndirectParametersIndex(
+                        first_indirect_parameters_index_for_this_batch
+                            ..(first_indirect_parameters_index_for_this_batch + 1),
+                    ),
+                })
+            }
            UnbatchableBinnedEntityIndexSet::Dense(ref indices) => {
-                indices.get(entity_index as usize).copied()
+                indices.get(entity_index as usize).cloned()
            }
        }
    }
@ -661,19 +765,27 @@ impl UnbatchableBinnedEntityIndexSet {
    pub fn add(&mut self, indices: UnbatchableBinnedEntityIndices) {
        match self {
            UnbatchableBinnedEntityIndexSet::NoEntities => {
-                if indices.extra_index.is_dynamic_offset() {
-                    // This is the first entity we've seen, and we don't have
-                    // compute shaders. Initialize an array.
-                    *self = UnbatchableBinnedEntityIndexSet::Dense(vec![indices]);
-                } else {
-                    // This is the first entity we've seen, and we have compute
-                    // shaders. Initialize the fast path.
-                    *self = UnbatchableBinnedEntityIndexSet::Sparse {
-                        instance_range: indices.instance_index..indices.instance_index + 1,
-                        first_indirect_parameters_index: indices
-                            .extra_index
-                            .as_indirect_parameters_index()
-                            .and_then(|index| NonMaxU32::try_from(index).ok()),
+                match indices.extra_index {
+                    PhaseItemExtraIndex::DynamicOffset(_) => {
+                        // This is the first entity we've seen, and we don't have
+                        // compute shaders. Initialize an array.
+                        *self = UnbatchableBinnedEntityIndexSet::Dense(vec![indices]);
+                    }
+                    PhaseItemExtraIndex::None => {
+                        // This is the first entity we've seen, and we have compute
+                        // shaders. Initialize the fast path.
+                        *self = UnbatchableBinnedEntityIndexSet::Sparse {
+                            instance_range: indices.instance_index..indices.instance_index + 1,
+                            first_indirect_parameters_index: None,
+                        }
+                    }
+                    PhaseItemExtraIndex::IndirectParametersIndex(ref range) => {
+                        // This is the first entity we've seen, and we have compute
+                        // shaders. Initialize the fast path.
+                        *self = UnbatchableBinnedEntityIndexSet::Sparse {
+                            instance_range: indices.instance_index..indices.instance_index + 1,
+                            first_indirect_parameters_index: NonMaxU32::new(range.start),
+                        }
                    }
                }
            }
@ -683,13 +795,17 @@ impl UnbatchableBinnedEntityIndexSet {
                first_indirect_parameters_index,
            } if instance_range.end == indices.instance_index
                && ((first_indirect_parameters_index.is_none()
-                    && indices.extra_index == PhaseItemExtraIndex::NONE)
+                    && indices.extra_index == PhaseItemExtraIndex::None)
                    || first_indirect_parameters_index.is_some_and(
-                        |first_indirect_parameters_index| {
-                            Some(
+                        |first_indirect_parameters_index| match indices.extra_index {
+                            PhaseItemExtraIndex::IndirectParametersIndex(ref this_range) => {
                                u32::from(first_indirect_parameters_index) + instance_range.end
-                                    - instance_range.start,
-                            ) == indices.extra_index.as_indirect_parameters_index()
+                                    - instance_range.start
+                                    == this_range.start
+                            }
+                            PhaseItemExtraIndex::DynamicOffset(_) | PhaseItemExtraIndex::None => {
+                                false
+                            }
                        },
                    )) =>
            {
@ -891,114 +1007,40 @@ pub trait PhaseItem: Sized + Send + Sync + 'static {
 /// Note that our indirect draw functionality requires storage buffers, so it's
 /// impossible to have both a dynamic offset and an indirect parameters index.
-/// This convenient fact allows us to pack both indices into a single `u32`.
+/// This is why the two are mutually exclusive variants of a single enum.
-#[derive(Clone, Copy, PartialEq, Eq, Hash)]
-pub struct PhaseItemExtraIndex(pub u32);
-
-impl Debug for PhaseItemExtraIndex {
-    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
-        if self.is_dynamic_offset() {
-            write!(f, "DynamicOffset({})", self.offset())
-        } else if self.is_indirect_parameters_index() {
-            write!(f, "IndirectParametersIndex({})", self.offset())
-        } else {
-            write!(f, "None")
-        }
-    }
+#[derive(Clone, PartialEq, Eq, Hash, Debug)]
+pub enum PhaseItemExtraIndex {
+    /// No extra index is present.
+    None,
+    /// A `wgpu` dynamic offset into the uniform buffer of instance data. This
+    /// is used on platforms that don't support storage buffers, to work around
+    /// uniform buffer size limitations.
+    DynamicOffset(u32),
+    /// An index into the buffer that specifies the indirect parameters for this
+    /// [`PhaseItem`]'s drawcall. This is used when indirect mode is on (as used
+    /// for GPU culling).
+    IndirectParametersIndex(Range<u32>),
 }

 impl PhaseItemExtraIndex {
-    /// The flag that indicates that this index is an indirect parameter. If not
-    /// set, this is a dynamic offset.
-    pub const INDIRECT_PARAMETER_INDEX: u32 = 1 << 31;
-    /// To extract the index from a packed [`PhaseItemExtraIndex`], bitwise-and
-    /// the contents with this value.
-    pub const OFFSET_MASK: u32 = Self::INDIRECT_PARAMETER_INDEX - 1;
-    /// To extract the flag from a packed [`PhaseItemExtraIndex`], bitwise-and
-    /// the contents with this value.
-    pub const FLAGS_MASK: u32 = !Self::OFFSET_MASK;
-
-    /// The special value that indicates that no extra index is present.
-    pub const NONE: PhaseItemExtraIndex = PhaseItemExtraIndex(u32::MAX);
-
-    /// Returns either the indirect parameters index or the dynamic offset,
-    /// depending on which is in use.
-    #[inline]
-    fn offset(&self) -> u32 {
-        self.0 & Self::OFFSET_MASK
-    }
-
-    /// Determines whether this extra index is a dynamic offset.
-    #[inline]
-    fn is_dynamic_offset(&self) -> bool {
-        *self != Self::NONE && (self.0 & Self::INDIRECT_PARAMETER_INDEX) == 0
-    }
-
-    /// Determines whether this extra index is an indirect parameters index.
-    #[inline]
-    fn is_indirect_parameters_index(&self) -> bool {
-        *self != Self::NONE && (self.0 & Self::INDIRECT_PARAMETER_INDEX) != 0
-    }
-
-    /// Packs a indirect parameters index into this extra index.
-    #[inline]
-    pub fn indirect_parameters_index(indirect_parameter_index: u32) -> PhaseItemExtraIndex {
-        // Make sure we didn't overflow.
-        debug_assert_eq!(indirect_parameter_index & Self::FLAGS_MASK, 0);
-        PhaseItemExtraIndex(indirect_parameter_index | Self::INDIRECT_PARAMETER_INDEX)
-    }
-
    /// Returns either an indirect parameters index or
-    /// [`PhaseItemExtraIndex::NONE`], as appropriate.
-    #[inline]
+    /// [`PhaseItemExtraIndex::None`], as appropriate.
    pub fn maybe_indirect_parameters_index(
-        maybe_indirect_parameters_index: Option<NonMaxU32>,
+        indirect_parameters_index: Option<NonMaxU32>,
    ) -> PhaseItemExtraIndex {
-        match maybe_indirect_parameters_index {
-            Some(indirect_parameters_index) => {
-                Self::indirect_parameters_index(indirect_parameters_index.into())
-            }
-            None => PhaseItemExtraIndex::NONE,
+        match indirect_parameters_index {
+            Some(indirect_parameters_index) => PhaseItemExtraIndex::IndirectParametersIndex(
+                u32::from(indirect_parameters_index)..(u32::from(indirect_parameters_index) + 1),
+            ),
+            None => PhaseItemExtraIndex::None,
        }
    }

-    /// Packs a dynamic offset into this extra index.
-    #[inline]
-    pub fn dynamic_offset(dynamic_offset: u32) -> PhaseItemExtraIndex {
-        // Make sure we didn't overflow.
-        debug_assert_eq!(dynamic_offset & Self::FLAGS_MASK, 0);
-
-        PhaseItemExtraIndex(dynamic_offset)
-    }
-
-    /// Returns either a dynamic offset or [`PhaseItemExtraIndex::NONE`], as
-    /// appropriate.
-    #[inline]
-    pub fn maybe_dynamic_offset(maybe_dynamic_offset: Option<NonMaxU32>) -> PhaseItemExtraIndex {
-        match maybe_dynamic_offset {
-            Some(dynamic_offset) => Self::dynamic_offset(dynamic_offset.into()),
-            None => PhaseItemExtraIndex::NONE,
-        }
-    }
-
-    /// If this extra index describes a dynamic offset, returns it; otherwise,
-    /// returns `None`.
-    #[inline]
-    pub fn as_dynamic_offset(&self) -> Option<NonMaxU32> {
-        if self.is_dynamic_offset() {
-            NonMaxU32::try_from(self.0 & Self::OFFSET_MASK).ok()
-        } else {
-            None
-        }
-    }
-
-    /// If this extra index describes an indirect parameters index, returns it;
-    /// otherwise, returns `None`.
-    #[inline]
-    pub fn as_indirect_parameters_index(&self) -> Option<u32> {
-        if self.is_indirect_parameters_index() {
-            Some(self.0 & Self::OFFSET_MASK)
-        } else {
-            None
+    /// Returns either a dynamic offset index or [`PhaseItemExtraIndex::None`],
+    /// as appropriate.
+    pub fn maybe_dynamic_offset(dynamic_offset: Option<NonMaxU32>) -> PhaseItemExtraIndex {
+        match dynamic_offset {
+            Some(dynamic_offset) => PhaseItemExtraIndex::DynamicOffset(dynamic_offset.into()),
+            None => PhaseItemExtraIndex::None,
        }
    }
 }
@ -1017,7 +1059,7 @@ pub trait BinnedPhaseItem: PhaseItem {
    /// lowest variable bind group id such as the material bind group id, and
    /// its dynamic offsets if any, next bind group and offsets, etc. This
    /// reduces the need for rebinding between bins and improves performance.
-    type BinKey: Clone + Send + Sync + Eq + Ord + Hash;
+    type BinKey: PhaseItemBinKey;

    /// Creates a new binned phase item from the key and per-entity data.
    ///
@ -1032,6 +1074,26 @@ pub trait BinnedPhaseItem: PhaseItem {
    ) -> Self;
 }

+/// A trait that allows fetching the *batch set key* from a bin key.
+///
+/// A *batch set* is a set of mesh batches that will be rendered with multi-draw
+/// if multi-draw is in use. The *batch set key* is the data that has to be
+/// identical between meshes in order to place them in the same batch set. A
+/// batch set can therefore span multiple bins.
+///
+/// The batch set key should be at the beginning of the bin key structure so
+/// that batches in the same batch set will be adjacent to one another in the
+/// sorted list of bins.
+pub trait PhaseItemBinKey: Clone + Send + Sync + PartialEq + Eq + Ord + Hash {
+    type BatchSetKey: Clone + PartialEq;
+
+    /// Returns the batch set key, if applicable.
+    ///
+    /// If this returns `None`, no batches in this phase item can be grouped
+    /// together into batch sets.
+    fn get_batch_set_key(&self) -> Option<Self::BatchSetKey>;
+}
+
 /// Represents phase items that must be sorted. The `SortKey` specifies the
 /// order that these items are drawn in. These are placed into a single array,
 /// and the array as a whole is then sorted.
--- a/crates/bevy_sprite/src/mesh2d/material.rs
+++ b/crates/bevy_sprite/src/mesh2d/material.rs
@ -602,7 +602,7 @@ pub fn queue_material2d_meshes<M: Material2d>(
                        sort_key: FloatOrd(mesh_z + material_2d.properties.depth_bias),
                        // Batching is done in batch_and_prepare_render_phase
                        batch_range: 0..1,
-                        extra_index: PhaseItemExtraIndex::NONE,
+                        extra_index: PhaseItemExtraIndex::None,
                    });
                }
            }
--- a/crates/bevy_sprite/src/mesh2d/mesh.rs
+++ b/crates/bevy_sprite/src/mesh2d/mesh.rs
@ -31,7 +31,9 @@ use bevy_render::{
        RenderMeshBufferInfo,
    },
    render_asset::RenderAssets,
-    render_phase::{PhaseItem, RenderCommand, RenderCommandResult, TrackedRenderPass},
+    render_phase::{
+        PhaseItem, PhaseItemExtraIndex, RenderCommand, RenderCommandResult, TrackedRenderPass,
+    },
    render_resource::{binding_types::uniform_buffer, *},
    renderer::{RenderDevice, RenderQueue},
    sync_world::{MainEntity, MainEntityHashMap},
@ -779,8 +781,8 @@ impl<P: PhaseItem, const I: usize> RenderCommand<P> for SetMesh2dBindGroup<I> {
    ) -> RenderCommandResult {
        let mut dynamic_offsets: [u32; 1] = Default::default();
        let mut offset_count = 0;
-        if let Some(dynamic_offset) = item.extra_index().as_dynamic_offset() {
-            dynamic_offsets[offset_count] = dynamic_offset.get();
+        if let PhaseItemExtraIndex::DynamicOffset(dynamic_offset) = item.extra_index() {
+            dynamic_offsets[offset_count] = dynamic_offset;
            offset_count += 1;
        }
        pass.set_bind_group(
--- a/crates/bevy_sprite/src/render/mod.rs
+++ b/crates/bevy_sprite/src/render/mod.rs
@ -577,7 +577,7 @@ pub fn queue_sprites(
                sort_key,
                // batch_range and dynamic_offset will be calculated in prepare_sprites
                batch_range: 0..0,
-                extra_index: PhaseItemExtraIndex::NONE,
+                extra_index: PhaseItemExtraIndex::None,
            });
        }
    }
--- a/crates/bevy_ui/src/render/box_shadow.rs
+++ b/crates/bevy_ui/src/render/box_shadow.rs
@ -377,7 +377,7 @@ pub fn queue_shadows(
                entity.index(),
            ),
            batch_range: 0..0,
-            extra_index: PhaseItemExtraIndex::NONE,
+            extra_index: PhaseItemExtraIndex::None,
        });
    }
 }
--- a/crates/bevy_ui/src/render/mod.rs
+++ b/crates/bevy_ui/src/render/mod.rs
@ -835,7 +835,7 @@ pub fn queue_uinodes(
            ),
            // batch_range will be calculated in prepare_uinodes
            batch_range: 0..0,
-            extra_index: PhaseItemExtraIndex::NONE,
+            extra_index: PhaseItemExtraIndex::None,
        });
    }
 }
--- a/crates/bevy_ui/src/render/render_pass.rs
+++ b/crates/bevy_ui/src/render/render_pass.rs
@ -126,7 +126,7 @@ impl PhaseItem for TransparentUi {

    #[inline]
    fn extra_index(&self) -> PhaseItemExtraIndex {
-        self.extra_index
+        self.extra_index.clone()
    }

    #[inline]
--- a/crates/bevy_ui/src/render/ui_material_pipeline.rs
+++ b/crates/bevy_ui/src/render/ui_material_pipeline.rs
@ -655,7 +655,7 @@ pub fn queue_ui_material_nodes<M: UiMaterial>(
                entity.index(),
            ),
            batch_range: 0..0,
-            extra_index: PhaseItemExtraIndex::NONE,
+            extra_index: PhaseItemExtraIndex::None,
        });
    }
 }
--- a/crates/bevy_ui/src/render/ui_texture_slice_pipeline.rs
+++ b/crates/bevy_ui/src/render/ui_texture_slice_pipeline.rs
@ -373,7 +373,7 @@ pub fn queue_ui_slices(
                entity.index(),
            ),
            batch_range: 0..0,
-            extra_index: PhaseItemExtraIndex::NONE,
+            extra_index: PhaseItemExtraIndex::None,
        });
    }
 }
--- a/examples/2d/mesh2d_manual.rs
+++ b/examples/2d/mesh2d_manual.rs
@ -410,7 +410,7 @@ pub fn queue_colored_mesh2d(
                    sort_key: FloatOrd(mesh_z),
                    // This material is not batched
                    batch_range: 0..1,
-                    extra_index: PhaseItemExtraIndex::NONE,
+                    extra_index: PhaseItemExtraIndex::None,
                });
            }
        }
--- a/examples/shader/custom_phase_item.rs
+++ b/examples/shader/custom_phase_item.rs
@ -8,7 +8,7 @@
 //! for better reuse of parts of Bevy's built-in mesh rendering logic.

 use bevy::{
-    core_pipeline::core_3d::{Opaque3d, Opaque3dBinKey, CORE_3D_DEPTH_FORMAT},
+    core_pipeline::core_3d::{Opaque3d, Opaque3dBatchSetKey, Opaque3dBinKey, CORE_3D_DEPTH_FORMAT},
    ecs::{
        query::ROQueryItem,
        system::{lifetimeless::SRes, SystemParamItem},
@ -270,11 +270,15 @@ fn queue_custom_phase_item(
            // not be the ID of a [`Mesh`].
            opaque_phase.add(
                Opaque3dBinKey {
-                    draw_function: draw_custom_phase_item,
-                    pipeline: pipeline_id,
+                    batch_set_key: Opaque3dBatchSetKey {
+                        draw_function: draw_custom_phase_item,
+                        pipeline: pipeline_id,
+                        material_bind_group_index: None,
+                        lightmap_image: None,
+                        vertex_slab: default(),
+                        index_slab: None,
+                    },
                    asset_id: AssetId::<Mesh>::invalid().untyped(),
-                    material_bind_group_index: None,
-                    lightmap_image: None,
                },
                entity,
                BinnedRenderPhaseType::NonMesh,
--- a/examples/shader/custom_shader_instancing.rs
+++ b/examples/shader/custom_shader_instancing.rs
@ -161,7 +161,7 @@ fn queue_custom(
                draw_function: draw_custom,
                distance: rangefinder.distance_translation(&mesh_instance.translation),
                batch_range: 0..1,
-                extra_index: PhaseItemExtraIndex::NONE,
+                extra_index: PhaseItemExtraIndex::None,
            });
        }
    }
--- a/examples/shader/specialized_mesh_pipeline.rs
+++ b/examples/shader/specialized_mesh_pipeline.rs
@ -7,7 +7,7 @@
 //! [`SpecializedMeshPipeline`] lets you customize the entire pipeline used when rendering a mesh.

 use bevy::{
-    core_pipeline::core_3d::{Opaque3d, Opaque3dBinKey, CORE_3D_DEPTH_FORMAT},
+    core_pipeline::core_3d::{Opaque3d, Opaque3dBatchSetKey, Opaque3dBinKey, CORE_3D_DEPTH_FORMAT},
    math::{vec3, vec4},
    pbr::{
        DrawMesh, MeshPipeline, MeshPipelineKey, MeshPipelineViewLayoutKey, RenderMeshInstances,
@ -335,14 +335,18 @@ fn queue_custom_mesh_pipeline(
            // Add the mesh with our specialized pipeline
            opaque_phase.add(
                Opaque3dBinKey {
-                    draw_function: draw_function_id,
-                    pipeline: pipeline_id,
+                    batch_set_key: Opaque3dBatchSetKey {
+                        draw_function: draw_function_id,
+                        pipeline: pipeline_id,
+                        material_bind_group_index: None,
+                        vertex_slab: default(),
+                        index_slab: None,
+                        lightmap_image: None,
+                    },
                    // The asset ID is arbitrary; we simply use [`AssetId::invalid`],
                    // but you can use anything you like. Note that the asset ID need
                    // not be the ID of a [`Mesh`].
                    asset_id: AssetId::<Mesh>::invalid().untyped(),
-                    material_bind_group_index: None,
-                    lightmap_image: None,
                },
                (render_entity, visible_entity),
                // This example supports batching, but if your pipeline doesn't