Batch skinned meshes on platforms where storage buffers are available. (#16599)

This commit makes skinned meshes batchable on platforms other than WebGL 2. On supported platforms, it replaces the two uniform buffers used for joint matrices with a pair of storage buffers containing the matrices for all skinned meshes packed together. The indices into these buffers are stored in the mesh uniform and the mesh input uniform; when GPU mesh preprocessing is enabled, that step copies the indices from the mesh input uniform into the mesh uniform. On the `many_foxes` demo, I observed a frame time decrease from 15.470ms to 11.935ms. This is the result of reducing the `submit_graph_commands` time from an average of 5.45ms to 0.489ms, an 11x speedup in that portion of rendering. ![Screenshot 2024-12-01 192838](https://github.com/user-attachments/assets/7d2db997-8939-466e-8b9e-050d4a6a78ee) This is what the profile looks like for `many_foxes` after these changes. ![Screenshot 2024-12-01 193026](https://github.com/user-attachments/assets/68983fc3-01b8-41fd-835e-3d93cb65d0fa) --------- Co-authored-by: François Mockers <mockersf@gmail.com>
2024-12-18 17:13:10 +00:00 · 2024-12-10 09:50:03 -08:00 · 2024-12-10 09:50:03 -08:00 · 3188e5af61
commit 3188e5af61
parent 7ed1f327d9
11 changed files with 236 additions and 72 deletions
--- a/crates/bevy_pbr/src/meshlet/instance_manager.rs
+++ b/crates/bevy_pbr/src/meshlet/instance_manager.rs
@ -120,7 +120,14 @@ impl InstanceManager {
            return;
        };

-        let mesh_uniform = MeshUniform::new(&transforms, 0, mesh_material_binding_id.slot, None);
+        let mesh_uniform = MeshUniform::new(
+            &transforms,
+            0,
+            mesh_material_binding_id.slot,
+            None,
+            None,
+            None,
+        );

        // Append instance data
        self.instances.push((
--- a/crates/bevy_pbr/src/prepass/mod.rs
+++ b/crates/bevy_pbr/src/prepass/mod.rs
@ -253,6 +253,11 @@ pub struct PrepassPipeline<M: Material> {
    pub deferred_material_vertex_shader: Option<Handle<Shader>>,
    pub deferred_material_fragment_shader: Option<Handle<Shader>>,
    pub material_pipeline: MaterialPipeline<M>,
+
+    /// Whether skins will use uniform buffers on account of storage buffers
+    /// being unavailable on this platform.
+    pub skins_use_uniform_buffers: bool,
+
    pub depth_clip_control_supported: bool,
    _marker: PhantomData<M>,
 }
@ -345,6 +350,7 @@ impl<M: Material> FromWorld for PrepassPipeline<M> {
            },
            material_layout: M::bind_group_layout(render_device),
            material_pipeline: world.resource::<MaterialPipeline<M>>().clone(),
+            skins_use_uniform_buffers: skin::skins_use_uniform_buffers(render_device),
            depth_clip_control_supported,
            _marker: PhantomData,
        }
@ -521,6 +527,7 @@ where
            &key.mesh_key,
            &mut shader_defs,
            &mut vertex_attributes,
+            self.skins_use_uniform_buffers,
        );
        bind_group_layouts.insert(1, bind_group);

--- a/crates/bevy_pbr/src/prepass/prepass.wgsl
+++ b/crates/bevy_pbr/src/prepass/prepass.wgsl
@ -69,7 +69,11 @@ fn vertex(vertex_no_morph: Vertex) -> VertexOutput {
    let mesh_world_from_local = mesh_functions::get_world_from_local(vertex_no_morph.instance_index);

 #ifdef SKINNED
-    var world_from_local = skinning::skin_model(vertex.joint_indices, vertex.joint_weights);
+    var world_from_local = skinning::skin_model(
+        vertex.joint_indices,
+        vertex.joint_weights,
+        vertex_no_morph.instance_index
+    );
 #else // SKINNED
    // Use vertex_no_morph.instance_index instead of vertex.instance_index to work around a wgpu dx12 bug.
    // See https://github.com/gfx-rs/naga/issues/2416
@ -142,6 +146,7 @@ fn vertex(vertex_no_morph: Vertex) -> VertexOutput {
    let prev_model = skinning::skin_prev_model(
        prev_vertex.joint_indices,
        prev_vertex.joint_weights,
+        vertex_no_morph.instance_index
    );
 #else   // HAS_PREVIOUS_SKIN
    let prev_model = mesh_functions::get_previous_world_from_local(prev_vertex.instance_index);
--- a/crates/bevy_pbr/src/render/mesh.rs
+++ b/crates/bevy_pbr/src/render/mesh.rs
@ -49,6 +49,7 @@ use bevy_utils::{
    HashMap, Parallel,
 };
 use material_bind_groups::MaterialBindingId;
+use render::skin::{self, SkinIndex};

 use crate::{
    render::{
@ -152,7 +153,6 @@ impl Plugin for MeshRenderPlugin {
        if let Some(render_app) = app.get_sub_app_mut(RenderApp) {
            render_app
                .init_resource::<MeshBindGroups>()
-                .init_resource::<SkinUniforms>()
                .init_resource::<SkinIndices>()
                .init_resource::<MorphUniforms>()
                .init_resource::<MorphIndices>()
@ -189,7 +189,9 @@ impl Plugin for MeshRenderPlugin {
        let mut mesh_bindings_shader_defs = Vec::with_capacity(1);

        if let Some(render_app) = app.get_sub_app_mut(RenderApp) {
-            render_app.init_resource::<GpuPreprocessingSupport>();
+            render_app
+                .init_resource::<GpuPreprocessingSupport>()
+                .init_resource::<SkinUniforms>();

            let gpu_preprocessing_support =
                render_app.world().resource::<GpuPreprocessingSupport>();
@ -220,6 +222,7 @@ impl Plugin for MeshRenderPlugin {
                            collect_meshes_for_gpu_building
                                .in_set(RenderSet::PrepareAssets)
                                .after(allocator::allocate_and_free_meshes)
+                                .after(extract_skins)
                                // This must be before
                                // `set_mesh_motion_vector_flags` so it doesn't
                                // overwrite those flags.
@ -307,12 +310,12 @@ pub struct MeshUniform {
    /// [`MeshAllocator`]). This value stores the offset of the first vertex in
    /// this mesh in that buffer.
    pub first_vertex_index: u32,
+    /// The current skin index, or `u32::MAX` if there's no skin.
+    pub current_skin_index: u32,
+    /// The previous skin index, or `u32::MAX` if there's no previous skin.
+    pub previous_skin_index: u32,
    /// Index of the material inside the bind group data.
    pub material_bind_group_slot: u32,
-    /// Padding.
-    pub pad_a: u32,
-    /// Padding.
-    pub pad_b: u32,
 }

 /// Information that has to be transferred from CPU to GPU in order to produce
@ -349,12 +352,12 @@ pub struct MeshInputUniform {
    /// [`MeshAllocator`]). This value stores the offset of the first vertex in
    /// this mesh in that buffer.
    pub first_vertex_index: u32,
+    /// The current skin index, or `u32::MAX` if there's no skin.
+    pub current_skin_index: u32,
+    /// The previous skin index, or `u32::MAX` if there's no previous skin.
+    pub previous_skin_index: u32,
    /// Index of the material inside the bind group data.
    pub material_bind_group_slot: u32,
-    /// Padding.
-    pub pad_a: u32,
-    /// Padding.
-    pub pad_b: u32,
 }

 /// Information about each mesh instance needed to cull it on GPU.
@ -386,6 +389,8 @@ impl MeshUniform {
        first_vertex_index: u32,
        material_bind_group_slot: MaterialBindGroupSlot,
        maybe_lightmap_uv_rect: Option<Rect>,
+        current_skin_index: Option<u32>,
+        previous_skin_index: Option<u32>,
    ) -> Self {
        let (local_from_world_transpose_a, local_from_world_transpose_b) =
            mesh_transforms.world_from_local.inverse_transpose_3x3();
@ -397,9 +402,9 @@ impl MeshUniform {
            local_from_world_transpose_b,
            flags: mesh_transforms.flags,
            first_vertex_index,
+            current_skin_index: current_skin_index.unwrap_or(u32::MAX),
+            previous_skin_index: previous_skin_index.unwrap_or(u32::MAX),
            material_bind_group_slot: *material_bind_group_slot,
-            pad_a: 0,
-            pad_b: 0,
        }
    }
 }
@ -880,6 +885,7 @@ impl RenderMeshInstanceGpuBuilder {
        current_input_buffer: &mut InstanceInputUniformBuffer<MeshInputUniform>,
        previous_input_buffer: &mut InstanceInputUniformBuffer<MeshInputUniform>,
        mesh_allocator: &MeshAllocator,
+        skin_indices: &SkinIndices,
    ) -> u32 {
        let first_vertex_index = match mesh_allocator.mesh_vertex_slice(&self.shared.mesh_asset_id)
        {
@ -887,6 +893,15 @@ impl RenderMeshInstanceGpuBuilder {
            None => 0,
        };

+        let current_skin_index = match skin_indices.current.get(&entity) {
+            Some(skin_indices) => skin_indices.index(),
+            None => u32::MAX,
+        };
+        let previous_skin_index = match skin_indices.prev.get(&entity) {
+            Some(skin_indices) => skin_indices.index(),
+            None => u32::MAX,
+        };
+
        // Create the mesh input uniform.
        let mut mesh_input_uniform = MeshInputUniform {
            world_from_local: self.world_from_local.to_transpose(),
@ -894,9 +909,9 @@ impl RenderMeshInstanceGpuBuilder {
            flags: self.mesh_flags.bits(),
            previous_input_index: u32::MAX,
            first_vertex_index,
+            current_skin_index,
+            previous_skin_index,
            material_bind_group_slot: *self.shared.material_bindings_index.slot,
-            pad_a: 0,
-            pad_b: 0,
        };

        // Did the last frame contain this entity as well?
@ -1312,6 +1327,7 @@ pub fn collect_meshes_for_gpu_building(
    mut mesh_culling_data_buffer: ResMut<MeshCullingDataBuffer>,
    mut render_mesh_instance_queues: ResMut<RenderMeshInstanceGpuQueues>,
    mesh_allocator: Res<MeshAllocator>,
+    skin_indices: Res<SkinIndices>,
 ) {
    let RenderMeshInstances::GpuBuilding(ref mut render_mesh_instances) =
        render_mesh_instances.into_inner()
@ -1347,6 +1363,7 @@ pub fn collect_meshes_for_gpu_building(
                        current_input_buffer,
                        previous_input_buffer,
                        &mesh_allocator,
+                        &skin_indices,
                    );
                }

@ -1370,6 +1387,7 @@ pub fn collect_meshes_for_gpu_building(
                        current_input_buffer,
                        previous_input_buffer,
                        &mesh_allocator,
+                        &skin_indices,
                    );
                    mesh_culling_builder
                        .update(&mut mesh_culling_data_buffer, instance_data_index as usize);
@ -1417,6 +1435,10 @@ pub struct MeshPipeline {
    ///
    /// This affects whether reflection probes can be used.
    pub binding_arrays_are_usable: bool,
+
+    /// Whether skins will use uniform buffers on account of storage buffers
+    /// being unavailable on this platform.
+    pub skins_use_uniform_buffers: bool,
 }

 impl FromWorld for MeshPipeline {
@ -1474,6 +1496,7 @@ impl FromWorld for MeshPipeline {
            mesh_layouts: MeshLayouts::new(&render_device),
            per_object_buffer_batch_size: GpuArrayBuffer::<MeshUniform>::batch_size(&render_device),
            binding_arrays_are_usable: binding_arrays_are_usable(&render_device),
+            skins_use_uniform_buffers: skin::skins_use_uniform_buffers(&render_device),
        }
    }
 }
@ -1506,6 +1529,7 @@ impl GetBatchData for MeshPipeline {
        SRes<RenderLightmaps>,
        SRes<RenderAssets<RenderMesh>>,
        SRes<MeshAllocator>,
+        SRes<SkinIndices>,
    );
    // The material bind group ID, the mesh ID, and the lightmap ID,
    // respectively.
@ -1518,7 +1542,7 @@ impl GetBatchData for MeshPipeline {
    type BufferData = MeshUniform;

    fn get_batch_data(
-        (mesh_instances, lightmaps, _, mesh_allocator): &SystemParamItem<Self::Param>,
+        (mesh_instances, lightmaps, _, mesh_allocator, skin_indices): &SystemParamItem<Self::Param>,
        (_entity, main_entity): (Entity, MainEntity),
    ) -> Option<(Self::BufferData, Option<Self::CompareData>)> {
        let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else {
@ -1536,6 +1560,9 @@ impl GetBatchData for MeshPipeline {
            };
        let maybe_lightmap = lightmaps.render_lightmaps.get(&main_entity);

+        let current_skin_index = skin_indices.current.get(&main_entity).map(SkinIndex::index);
+        let previous_skin_index = skin_indices.prev.get(&main_entity).map(SkinIndex::index);
+
        let material_bind_group_index = mesh_instance.material_bindings_index;

        Some((
@ -1544,6 +1571,8 @@ impl GetBatchData for MeshPipeline {
                first_vertex_index,
                material_bind_group_index.slot,
                maybe_lightmap.map(|lightmap| lightmap.uv_rect),
+                current_skin_index,
+                previous_skin_index,
            ),
            mesh_instance.should_batch().then_some((
                material_bind_group_index.group,
@ -1558,7 +1587,7 @@ impl GetFullBatchData for MeshPipeline {
    type BufferInputData = MeshInputUniform;

    fn get_index_and_compare_data(
-        (mesh_instances, lightmaps, _, _): &SystemParamItem<Self::Param>,
+        (mesh_instances, lightmaps, _, _, _): &SystemParamItem<Self::Param>,
        (_entity, main_entity): (Entity, MainEntity),
    ) -> Option<(NonMaxU32, Option<Self::CompareData>)> {
        // This should only be called during GPU building.
@ -1584,7 +1613,7 @@ impl GetFullBatchData for MeshPipeline {
    }

    fn get_binned_batch_data(
-        (mesh_instances, lightmaps, _, mesh_allocator): &SystemParamItem<Self::Param>,
+        (mesh_instances, lightmaps, _, mesh_allocator, skin_indices): &SystemParamItem<Self::Param>,
        (_entity, main_entity): (Entity, MainEntity),
    ) -> Option<Self::BufferData> {
        let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else {
@ -1601,16 +1630,21 @@ impl GetFullBatchData for MeshPipeline {
            };
        let maybe_lightmap = lightmaps.render_lightmaps.get(&main_entity);

+        let current_skin_index = skin_indices.current.get(&main_entity).map(SkinIndex::index);
+        let previous_skin_index = skin_indices.prev.get(&main_entity).map(SkinIndex::index);
+
        Some(MeshUniform::new(
            &mesh_instance.transforms,
            first_vertex_index,
            mesh_instance.material_bindings_index.slot,
            maybe_lightmap.map(|lightmap| lightmap.uv_rect),
+            current_skin_index,
+            previous_skin_index,
        ))
    }

    fn get_binned_index(
-        (mesh_instances, _, _, _): &SystemParamItem<Self::Param>,
+        (mesh_instances, _, _, _, _): &SystemParamItem<Self::Param>,
        (_entity, main_entity): (Entity, MainEntity),
    ) -> Option<NonMaxU32> {
        // This should only be called during GPU building.
@ -1628,7 +1662,7 @@ impl GetFullBatchData for MeshPipeline {
    }

    fn get_batch_indirect_parameters_index(
-        (mesh_instances, _, meshes, mesh_allocator): &SystemParamItem<Self::Param>,
+        (mesh_instances, _, meshes, mesh_allocator, _): &SystemParamItem<Self::Param>,
        indirect_parameters_buffer: &mut IndirectParametersBuffer,
        entity: (Entity, MainEntity),
        instance_index: u32,
@ -1868,15 +1902,22 @@ pub fn setup_morph_and_skinning_defs(
    key: &MeshPipelineKey,
    shader_defs: &mut Vec<ShaderDefVal>,
    vertex_attributes: &mut Vec<VertexAttributeDescriptor>,
+    skins_use_uniform_buffers: bool,
 ) -> BindGroupLayout {
+    let is_morphed = key.intersects(MeshPipelineKey::MORPH_TARGETS);
+    let is_lightmapped = key.intersects(MeshPipelineKey::LIGHTMAPPED);
+    let motion_vector_prepass = key.intersects(MeshPipelineKey::MOTION_VECTOR_PREPASS);
+
+    if skins_use_uniform_buffers {
+        shader_defs.push("SKINS_USE_UNIFORM_BUFFERS".into());
+    }
+
    let mut add_skin_data = || {
        shader_defs.push("SKINNED".into());
        vertex_attributes.push(Mesh::ATTRIBUTE_JOINT_INDEX.at_shader_location(offset));
        vertex_attributes.push(Mesh::ATTRIBUTE_JOINT_WEIGHT.at_shader_location(offset + 1));
    };
-    let is_morphed = key.intersects(MeshPipelineKey::MORPH_TARGETS);
-    let is_lightmapped = key.intersects(MeshPipelineKey::LIGHTMAPPED);
-    let motion_vector_prepass = key.intersects(MeshPipelineKey::MOTION_VECTOR_PREPASS);
+
    match (
        is_skinned(layout),
        is_morphed,
@ -1985,6 +2026,7 @@ impl SpecializedMeshPipeline for MeshPipeline {
            &key,
            &mut shader_defs,
            &mut vertex_attributes,
+            self.skins_use_uniform_buffers,
        ));

        if key.contains(MeshPipelineKey::SCREEN_SPACE_AMBIENT_OCCLUSION) {
@ -2477,6 +2519,7 @@ impl<P: PhaseItem, const I: usize> RenderCommand<P> for SetMeshViewBindGroup<I>
 pub struct SetMeshBindGroup<const I: usize>;
 impl<P: PhaseItem, const I: usize> RenderCommand<P> for SetMeshBindGroup<I> {
    type Param = (
+        SRes<RenderDevice>,
        SRes<MeshBindGroups>,
        SRes<RenderMeshInstances>,
        SRes<SkinIndices>,
@ -2491,11 +2534,14 @@ impl<P: PhaseItem, const I: usize> RenderCommand<P> for SetMeshBindGroup<I> {
        item: &P,
        has_motion_vector_prepass: bool,
        _item_query: Option<()>,
-        (bind_groups, mesh_instances, skin_indices, morph_indices, lightmaps): SystemParamItem<
-            'w,
-            '_,
-            Self::Param,
-        >,
+        (
+            render_device,
+            bind_groups,
+            mesh_instances,
+            skin_indices,
+            morph_indices,
+            lightmaps,
+        ): SystemParamItem<'w, '_, Self::Param>,
        pass: &mut TrackedRenderPass<'w>,
    ) -> RenderCommandResult {
        let bind_groups = bind_groups.into_inner();
@ -2508,6 +2554,7 @@ impl<P: PhaseItem, const I: usize> RenderCommand<P> for SetMeshBindGroup<I> {
        let Some(mesh_asset_id) = mesh_instances.mesh_asset_id(*entity) else {
            return RenderCommandResult::Success;
        };
+
        let current_skin_index = skin_indices.current.get(entity);
        let prev_skin_index = skin_indices.prev.get(entity);
        let current_morph_index = morph_indices.current.get(entity);
@ -2542,8 +2589,10 @@ impl<P: PhaseItem, const I: usize> RenderCommand<P> for SetMeshBindGroup<I> {
            offset_count += 1;
        }
        if let Some(current_skin_index) = current_skin_index {
-            dynamic_offsets[offset_count] = current_skin_index.index;
-            offset_count += 1;
+            if skin::skins_use_uniform_buffers(&render_device) {
+                dynamic_offsets[offset_count] = current_skin_index.byte_offset;
+                offset_count += 1;
+            }
        }
        if let Some(current_morph_index) = current_morph_index {
            dynamic_offsets[offset_count] = current_morph_index.index;
@ -2554,9 +2603,11 @@ impl<P: PhaseItem, const I: usize> RenderCommand<P> for SetMeshBindGroup<I> {
        if has_motion_vector_prepass {
            // Attach the previous skin index for motion vector computation. If
            // there isn't one, just use zero as the shader will ignore it.
-            if current_skin_index.is_some() {
+            if current_skin_index.is_some() && skin::skins_use_uniform_buffers(&render_device) {
                match prev_skin_index {
-                    Some(prev_skin_index) => dynamic_offsets[offset_count] = prev_skin_index.index,
+                    Some(prev_skin_index) => {
+                        dynamic_offsets[offset_count] = prev_skin_index.byte_offset;
+                    }
                    None => dynamic_offsets[offset_count] = 0,
                }
                offset_count += 1;
--- a/crates/bevy_pbr/src/render/mesh.wgsl
+++ b/crates/bevy_pbr/src/render/mesh.wgsl
@ -44,7 +44,11 @@ fn vertex(vertex_no_morph: Vertex) -> VertexOutput {
    let mesh_world_from_local = mesh_functions::get_world_from_local(vertex_no_morph.instance_index);

 #ifdef SKINNED
-    var world_from_local = skinning::skin_model(vertex.joint_indices, vertex.joint_weights);
+    var world_from_local = skinning::skin_model(
+        vertex.joint_indices,
+        vertex.joint_weights,
+        vertex_no_morph.instance_index
+    );
 #else
    // Use vertex_no_morph.instance_index instead of vertex.instance_index to work around a wgpu dx12 bug.
    // See https://github.com/gfx-rs/naga/issues/2416 .
--- a/crates/bevy_pbr/src/render/mesh_bindings.rs
+++ b/crates/bevy_pbr/src/render/mesh_bindings.rs
@ -22,10 +22,13 @@ pub(crate) const JOINT_BUFFER_SIZE: usize = MAX_JOINTS * JOINT_SIZE;
 /// Individual layout entries.
 mod layout_entry {
    use super::{JOINT_BUFFER_SIZE, MORPH_BUFFER_SIZE};
-    use crate::MeshUniform;
+    use crate::{render::skin, MeshUniform};
    use bevy_render::{
        render_resource::{
-            binding_types::{sampler, texture_2d, texture_3d, uniform_buffer_sized},
+            binding_types::{
+                sampler, storage_buffer_read_only_sized, texture_2d, texture_3d,
+                uniform_buffer_sized,
+            },
            BindGroupLayoutEntryBuilder, BufferSize, GpuArrayBuffer, SamplerBindingType,
            ShaderStages, TextureSampleType,
        },
@ -36,8 +39,15 @@ mod layout_entry {
        GpuArrayBuffer::<MeshUniform>::binding_layout(render_device)
            .visibility(ShaderStages::VERTEX_FRAGMENT)
    }
-    pub(super) fn skinning() -> BindGroupLayoutEntryBuilder {
-        uniform_buffer_sized(true, BufferSize::new(JOINT_BUFFER_SIZE as u64))
+    pub(super) fn skinning(render_device: &RenderDevice) -> BindGroupLayoutEntryBuilder {
+        // If we can use storage buffers, do so. Otherwise, fall back to uniform
+        // buffers.
+        let size = BufferSize::new(JOINT_BUFFER_SIZE as u64);
+        if skin::skins_use_uniform_buffers(render_device) {
+            uniform_buffer_sized(true, size)
+        } else {
+            storage_buffer_read_only_sized(false, size)
+        }
    }
    pub(super) fn weights() -> BindGroupLayoutEntryBuilder {
        uniform_buffer_sized(true, BufferSize::new(MORPH_BUFFER_SIZE as u64))
@ -56,29 +66,44 @@ mod layout_entry {
 /// Individual [`BindGroupEntry`]
 /// for bind groups.
 mod entry {
+    use crate::render::skin;
+
    use super::{JOINT_BUFFER_SIZE, MORPH_BUFFER_SIZE};
-    use bevy_render::render_resource::{
-        BindGroupEntry, BindingResource, Buffer, BufferBinding, BufferSize, Sampler, TextureView,
+    use bevy_render::{
+        render_resource::{
+            BindGroupEntry, BindingResource, Buffer, BufferBinding, BufferSize, Sampler,
+            TextureView,
+        },
+        renderer::RenderDevice,
    };

-    fn entry(binding: u32, size: u64, buffer: &Buffer) -> BindGroupEntry {
+    fn entry(binding: u32, size: Option<u64>, buffer: &Buffer) -> BindGroupEntry {
        BindGroupEntry {
            binding,
            resource: BindingResource::Buffer(BufferBinding {
                buffer,
                offset: 0,
-                size: Some(BufferSize::new(size).unwrap()),
+                size: size.map(|size| BufferSize::new(size).unwrap()),
            }),
        }
    }
    pub(super) fn model(binding: u32, resource: BindingResource) -> BindGroupEntry {
        BindGroupEntry { binding, resource }
    }
-    pub(super) fn skinning(binding: u32, buffer: &Buffer) -> BindGroupEntry {
-        entry(binding, JOINT_BUFFER_SIZE as u64, buffer)
+    pub(super) fn skinning<'a>(
+        render_device: &RenderDevice,
+        binding: u32,
+        buffer: &'a Buffer,
+    ) -> BindGroupEntry<'a> {
+        let size = if skin::skins_use_uniform_buffers(render_device) {
+            Some(JOINT_BUFFER_SIZE as u64)
+        } else {
+            None
+        };
+        entry(binding, size, buffer)
    }
    pub(super) fn weights(binding: u32, buffer: &Buffer) -> BindGroupEntry {
-        entry(binding, MORPH_BUFFER_SIZE as u64, buffer)
+        entry(binding, Some(MORPH_BUFFER_SIZE as u64), buffer)
    }
    pub(super) fn targets(binding: u32, texture: &TextureView) -> BindGroupEntry {
        BindGroupEntry {
@ -175,7 +200,7 @@ impl MeshLayouts {
                (
                    (0, layout_entry::model(render_device)),
                    // The current frame's joint matrix buffer.
-                    (1, layout_entry::skinning()),
+                    (1, layout_entry::skinning(render_device)),
                ),
            ),
        )
@ -191,9 +216,9 @@ impl MeshLayouts {
                (
                    (0, layout_entry::model(render_device)),
                    // The current frame's joint matrix buffer.
-                    (1, layout_entry::skinning()),
+                    (1, layout_entry::skinning(render_device)),
                    // The previous frame's joint matrix buffer.
-                    (6, layout_entry::skinning()),
+                    (6, layout_entry::skinning(render_device)),
                ),
            ),
        )
@ -244,7 +269,7 @@ impl MeshLayouts {
                (
                    (0, layout_entry::model(render_device)),
                    // The current frame's joint matrix buffer.
-                    (1, layout_entry::skinning()),
+                    (1, layout_entry::skinning(render_device)),
                    // The current frame's morph weight buffer.
                    (2, layout_entry::weights()),
                    (3, layout_entry::targets()),
@ -263,12 +288,12 @@ impl MeshLayouts {
                (
                    (0, layout_entry::model(render_device)),
                    // The current frame's joint matrix buffer.
-                    (1, layout_entry::skinning()),
+                    (1, layout_entry::skinning(render_device)),
                    // The current frame's morph weight buffer.
                    (2, layout_entry::weights()),
                    (3, layout_entry::targets()),
                    // The previous frame's joint matrix buffer.
-                    (6, layout_entry::skinning()),
+                    (6, layout_entry::skinning(render_device)),
                    // The previous frame's morph weight buffer.
                    (7, layout_entry::weights()),
                ),
@ -329,7 +354,7 @@ impl MeshLayouts {
            &self.skinned,
            &[
                entry::model(0, model.clone()),
-                entry::skinning(1, current_skin),
+                entry::skinning(render_device, 1, current_skin),
            ],
        )
    }
@ -353,8 +378,8 @@ impl MeshLayouts {
            &self.skinned_motion,
            &[
                entry::model(0, model.clone()),
-                entry::skinning(1, current_skin),
-                entry::skinning(6, prev_skin),
+                entry::skinning(render_device, 1, current_skin),
+                entry::skinning(render_device, 6, prev_skin),
            ],
        )
    }
@ -420,7 +445,7 @@ impl MeshLayouts {
            &self.morphed_skinned,
            &[
                entry::model(0, model.clone()),
-                entry::skinning(1, current_skin),
+                entry::skinning(render_device, 1, current_skin),
                entry::weights(2, current_weights),
                entry::targets(3, targets),
            ],
@ -450,10 +475,10 @@ impl MeshLayouts {
            &self.morphed_skinned_motion,
            &[
                entry::model(0, model.clone()),
-                entry::skinning(1, current_skin),
+                entry::skinning(render_device, 1, current_skin),
                entry::weights(2, current_weights),
                entry::targets(3, targets),
-                entry::skinning(6, prev_skin),
+                entry::skinning(render_device, 6, prev_skin),
                entry::weights(7, prev_weights),
            ],
        )
--- a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl
+++ b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl
@ -23,10 +23,10 @@ struct MeshInput {
    // applicable. If not present, this is `u32::MAX`.
    previous_input_index: u32,
    first_vertex_index: u32,
+    current_skin_index: u32,
+    previous_skin_index: u32,
    // Index of the material inside the bind group data.
    material_bind_group_slot: u32,
-    pad_a: u32,
-    pad_b: u32,
 }

 // Information about each mesh instance needed to cull it on GPU.
@ -192,6 +192,8 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {
    output[mesh_output_index].flags = current_input[input_index].flags;
    output[mesh_output_index].lightmap_uv_rect = current_input[input_index].lightmap_uv_rect;
    output[mesh_output_index].first_vertex_index = current_input[input_index].first_vertex_index;
+    output[mesh_output_index].current_skin_index = current_input[input_index].current_skin_index;
+    output[mesh_output_index].previous_skin_index = current_input[input_index].previous_skin_index;
    output[mesh_output_index].material_bind_group_slot =
        current_input[input_index].material_bind_group_slot;
 }
--- a/crates/bevy_pbr/src/render/mesh_types.wgsl
+++ b/crates/bevy_pbr/src/render/mesh_types.wgsl
@ -17,10 +17,10 @@ struct Mesh {
    lightmap_uv_rect: vec2<u32>,
    // The index of the mesh's first vertex in the vertex buffer.
    first_vertex_index: u32,
+    current_skin_index: u32,
+    previous_skin_index: u32,
    // Index of the material inside the bind group data.
    material_bind_group_slot: u32,
-    pad_a: u32,
-    pad_b: u32,
 };

 #ifdef SKINNED
--- a/crates/bevy_pbr/src/render/mod.rs
+++ b/crates/bevy_pbr/src/render/mod.rs
@ -5,7 +5,7 @@ pub(crate) mod mesh;
 mod mesh_bindings;
 mod mesh_view_bindings;
 mod morph;
-mod skin;
+pub(crate) mod skin;

 pub use fog::*;
 pub use gpu_preprocess::*;
--- a/crates/bevy_pbr/src/render/skin.rs
+++ b/crates/bevy_pbr/src/render/skin.rs
@ -1,4 +1,5 @@
 use core::mem::{self, size_of};
+use std::sync::OnceLock;

 use bevy_asset::Assets;
 use bevy_ecs::prelude::*;
@ -23,18 +24,27 @@ use bevy_transform::prelude::GlobalTransform;
 /// of the GPU at runtime, which would mean not using consts anymore.
 pub const MAX_JOINTS: usize = 256;

+/// The location of the first joint matrix in the skin uniform buffer.
 #[derive(Component)]
 pub struct SkinIndex {
-    pub index: u32,
+    /// The byte offset of the first joint matrix.
+    pub byte_offset: u32,
 }

 impl SkinIndex {
    /// Converts a joint matrix element index into a byte offset by scaling it by the size of a `Mat4`.
    const fn new(start: usize) -> Self {
        SkinIndex {
-            index: (start * size_of::<Mat4>()) as u32,
+            byte_offset: (start * size_of::<Mat4>()) as u32,
        }
    }
+
+    /// Returns this skin index in elements (not bytes).
+    ///
+    /// Each element is a 4x4 matrix.
+    pub fn index(&self) -> u32 {
+        self.byte_offset / size_of::<Mat4>() as u32
+    }
 }

 /// Maps each skinned mesh to the applicable offset within the [`SkinUniforms`]
@ -70,15 +80,30 @@ pub struct SkinUniforms {
    pub prev_buffer: RawBufferVec<Mat4>,
 }

-impl Default for SkinUniforms {
-    fn default() -> Self {
+impl FromWorld for SkinUniforms {
+    fn from_world(world: &mut World) -> Self {
+        let device = world.resource::<RenderDevice>();
+        let buffer_usages = if skins_use_uniform_buffers(device) {
+            BufferUsages::UNIFORM
+        } else {
+            BufferUsages::STORAGE
+        };
+
        Self {
-            current_buffer: RawBufferVec::new(BufferUsages::UNIFORM),
-            prev_buffer: RawBufferVec::new(BufferUsages::UNIFORM),
+            current_buffer: RawBufferVec::new(buffer_usages),
+            prev_buffer: RawBufferVec::new(buffer_usages),
        }
    }
 }

+/// Returns true if skinning must use uniforms (and dynamic offsets) because
+/// storage buffers aren't supported on the current platform.
+pub fn skins_use_uniform_buffers(render_device: &RenderDevice) -> bool {
+    static SKINS_USE_UNIFORM_BUFFERS: OnceLock<bool> = OnceLock::new();
+    *SKINS_USE_UNIFORM_BUFFERS
+        .get_or_init(|| render_device.limits().max_storage_buffers_per_shader_stage == 0)
+}
+
 pub fn prepare_skins(
    render_device: Res<RenderDevice>,
    render_queue: Res<RenderQueue>,
@ -130,7 +155,10 @@ pub fn extract_skins(
    query: Extract<Query<(Entity, &ViewVisibility, &SkinnedMesh)>>,
    inverse_bindposes: Extract<Res<Assets<SkinnedMeshInverseBindposes>>>,
    joints: Extract<Query<&GlobalTransform>>,
+    render_device: Res<RenderDevice>,
 ) {
+    let skins_use_uniform_buffers = skins_use_uniform_buffers(&render_device);
+
    // Borrow check workaround.
    let (skin_indices, uniform) = (skin_indices.into_inner(), uniform.into_inner());

@ -170,9 +198,12 @@ pub fn extract_skins(
        }
        last_start = last_start.max(start);

-        // Pad to 256 byte alignment
-        while buffer.len() % 4 != 0 {
-            buffer.push(Mat4::ZERO);
+        // Pad to 256 byte alignment if we're using a uniform buffer.
+        // There's no need to do this if we're using storage buffers, though.
+        if skins_use_uniform_buffers {
+            while buffer.len() % 4 != 0 {
+                buffer.push(Mat4::ZERO);
+            }
        }

        skin_indices
@ -187,11 +218,16 @@ pub fn extract_skins(
 }

 // NOTE: The skinned joints uniform buffer has to be bound at a dynamic offset per
-// entity and so cannot currently be batched.
+// entity and so cannot be batched where storage buffers are unavailable (e.g. WebGL 2).
 pub fn no_automatic_skin_batching(
    mut commands: Commands,
    query: Query<Entity, (With<SkinnedMesh>, Without<NoAutomaticBatching>)>,
+    render_device: Res<RenderDevice>,
 ) {
+    if !skins_use_uniform_buffers(&render_device) {
+        return;
+    }
+
    for entity in &query {
        commands.entity(entity).try_insert(NoAutomaticBatching);
    }
--- a/crates/bevy_pbr/src/render/skinning.wgsl
+++ b/crates/bevy_pbr/src/render/skinning.wgsl
@ -1,10 +1,15 @@
 #define_import_path bevy_pbr::skinning

 #import bevy_pbr::mesh_types::SkinnedMesh
+#import bevy_pbr::mesh_bindings::mesh

 #ifdef SKINNED

+#ifdef SKINS_USE_UNIFORM_BUFFERS
@group(1) @binding(1) var<uniform> joint_matrices: SkinnedMesh;
+#else   // SKINS_USE_UNIFORM_BUFFERS
+@group(1) @binding(1) var<storage> joint_matrices: array<mat4x4<f32>>;
+#endif  // SKINS_USE_UNIFORM_BUFFERS

 // An array of matrices specifying the joint positions from the previous frame.
 //
@ -12,16 +17,29 @@
 //
 // If this is the first frame, or we're otherwise prevented from using data from
 // the previous frame, this is simply the same as `joint_matrices` above.
+#ifdef SKINS_USE_UNIFORM_BUFFERS
@group(1) @binding(6) var<uniform> prev_joint_matrices: SkinnedMesh;
+#else   // SKINS_USE_UNIFORM_BUFFERS
+@group(1) @binding(6) var<storage> prev_joint_matrices: array<mat4x4<f32>>;
+#endif  // SKINS_USE_UNIFORM_BUFFERS

 fn skin_model(
    indexes: vec4<u32>,
    weights: vec4<f32>,
+    instance_index: u32,
 ) -> mat4x4<f32> {
+#ifdef SKINS_USE_UNIFORM_BUFFERS
    return weights.x * joint_matrices.data[indexes.x]
        + weights.y * joint_matrices.data[indexes.y]
        + weights.z * joint_matrices.data[indexes.z]
        + weights.w * joint_matrices.data[indexes.w];
+#else   // SKINS_USE_UNIFORM_BUFFERS
+    let skin_index = mesh[instance_index].current_skin_index;
+    return weights.x * joint_matrices[skin_index + indexes.x]
+        + weights.y * joint_matrices[skin_index + indexes.y]
+        + weights.z * joint_matrices[skin_index + indexes.z]
+        + weights.w * joint_matrices[skin_index + indexes.w];
+#endif  // SKINS_USE_UNIFORM_BUFFERS
 }

 // Returns the skinned position of a vertex with the given weights from the
@ -31,11 +49,20 @@ fn skin_model(
 fn skin_prev_model(
    indexes: vec4<u32>,
    weights: vec4<f32>,
+    instance_index: u32,
 ) -> mat4x4<f32> {
+#ifdef SKINS_USE_UNIFORM_BUFFERS
    return weights.x * prev_joint_matrices.data[indexes.x]
        + weights.y * prev_joint_matrices.data[indexes.y]
        + weights.z * prev_joint_matrices.data[indexes.z]
        + weights.w * prev_joint_matrices.data[indexes.w];
+#else   // SKINS_USE_UNIFORM_BUFFERS
+    let skin_index = mesh[instance_index].previous_skin_index;
+    return weights.x * prev_joint_matrices[skin_index + indexes.x]
+        + weights.y * prev_joint_matrices[skin_index + indexes.y]
+        + weights.z * prev_joint_matrices[skin_index + indexes.z]
+        + weights.w * prev_joint_matrices[skin_index + indexes.w];
+#endif  // SKINS_USE_UNIFORM_BUFFERS
 }

 fn inverse_transpose_3x3m(in: mat3x3<f32>) -> mat3x3<f32> {