From d235d41af1fad553d3026c37034c8b6c3898476e Mon Sep 17 00:00:00 2001 From: Patrick Walton Date: Mon, 22 Jul 2024 12:55:51 -0600 Subject: [PATCH] Fix the example regressions from packed growable buffers. (#14375) The "uberbuffers" PR #14257 caused some examples to fail intermittently for different reasons: 1. `morph_targets` could fail because vertex displacements for morph targets are keyed off the vertex index. With buffer packing, the vertex index can vary based on the position in the buffer, which caused the morph targets to be potentially incorrect. The solution is to include the first vertex index with the `MeshUniform` (and `MeshInputUniform` if GPU preprocessing is in use), so that the shader can calculate the true vertex index before performing the morph operation. This results in wasted space in `MeshUniform`, which is unfortunate, but we'll soon be filling in the padding with the ID of the material when bindless textures land, so this had to happen sooner or later anyhow. Including the vertex index in the `MeshInputUniform` caused an ordering problem. The `MeshInputUniform` was created during the extraction phase, before the allocations occurred, so the extraction logic didn't know where the mesh vertex data was going to end up. The solution is to move the `MeshInputUniform` creation (the `collect_meshes_for_gpu_building` system) to after the allocations phase. This should be better for parallelism anyhow, because it allows the extraction phase to finish quicker. It's also something we'll have to do for bindless in any event. 2. The `lines` and `fog_volumes` examples could fail because their custom drawing nodes weren't updated to supply the vertex and index offsets in their `draw_indexed` and `draw` calls. This commit fixes this oversight. Fixes #14366. --- crates/bevy_pbr/src/meshlet/gpu_scene.rs | 2 +- crates/bevy_pbr/src/prepass/prepass.wgsl | 10 +- crates/bevy_pbr/src/render/mesh.rs | 122 +++++++++++++----- crates/bevy_pbr/src/render/mesh.wgsl | 10 +- .../bevy_pbr/src/render/mesh_preprocess.wgsl | 5 + crates/bevy_pbr/src/render/mesh_types.wgsl | 5 + crates/bevy_pbr/src/volumetric_fog/render.rs | 8 +- 7 files changed, 124 insertions(+), 38 deletions(-) diff --git a/crates/bevy_pbr/src/meshlet/gpu_scene.rs b/crates/bevy_pbr/src/meshlet/gpu_scene.rs index f13b44808d..1d4bf7ffe6 100644 --- a/crates/bevy_pbr/src/meshlet/gpu_scene.rs +++ b/crates/bevy_pbr/src/meshlet/gpu_scene.rs @@ -138,7 +138,7 @@ pub fn extract_meshlet_meshes( gpu_scene .instance_uniforms .get_mut() - .push(MeshUniform::new(&transforms, None)); + .push(MeshUniform::new(&transforms, 0, None)); } } diff --git a/crates/bevy_pbr/src/prepass/prepass.wgsl b/crates/bevy_pbr/src/prepass/prepass.wgsl index cb1328fae1..7a0fcb89ed 100644 --- a/crates/bevy_pbr/src/prepass/prepass.wgsl +++ b/crates/bevy_pbr/src/prepass/prepass.wgsl @@ -1,5 +1,6 @@ #import bevy_pbr::{ prepass_bindings, + mesh_bindings::mesh, mesh_functions, prepass_io::{Vertex, VertexOutput, FragmentOutput}, skinning, @@ -15,18 +16,21 @@ #ifdef MORPH_TARGETS fn morph_vertex(vertex_in: Vertex) -> Vertex { var vertex = vertex_in; + let first_vertex = mesh[vertex.instance_index].first_vertex_index; + let vertex_index = vertex.index - first_vertex; + let weight_count = morph::layer_count(); for (var i: u32 = 0u; i < weight_count; i ++) { let weight = morph::weight_at(i); if weight == 0.0 { continue; } - vertex.position += weight * morph::morph(vertex.index, morph::position_offset, i); + vertex.position += weight * morph::morph(vertex_index, morph::position_offset, i); #ifdef VERTEX_NORMALS - vertex.normal += weight * morph::morph(vertex.index, morph::normal_offset, i); + vertex.normal += weight * morph::morph(vertex_index, morph::normal_offset, i); #endif #ifdef VERTEX_TANGENTS - vertex.tangent += vec4(weight * morph::morph(vertex.index, morph::tangent_offset, i), 0.0); + vertex.tangent += vec4(weight * morph::morph(vertex_index, morph::tangent_offset, i), 0.0); #endif } return vertex; diff --git a/crates/bevy_pbr/src/render/mesh.rs b/crates/bevy_pbr/src/render/mesh.rs index 56f1cd1289..fcc9b6ba78 100644 --- a/crates/bevy_pbr/src/render/mesh.rs +++ b/crates/bevy_pbr/src/render/mesh.rs @@ -186,8 +186,8 @@ impl Plugin for MeshRenderPlugin { if use_gpu_instance_buffer_builder { render_app - .init_resource::>( - ) + .init_resource::>() + .init_resource::() .add_systems( ExtractSchedule, extract_meshes_for_gpu_building.in_set(ExtractMeshesSet), @@ -200,6 +200,9 @@ impl Plugin for MeshRenderPlugin { gpu_preprocessing::delete_old_work_item_buffers:: .in_set(RenderSet::ManageViews) .after(prepare_view_targets), + collect_meshes_for_gpu_building + .in_set(RenderSet::PrepareAssets) + .after(allocator::allocate_and_free_meshes), ), ); } else { @@ -275,6 +278,18 @@ pub struct MeshUniform { // // (MSB: most significant bit; LSB: least significant bit.) pub lightmap_uv_rect: UVec2, + /// The index of this mesh's first vertex in the vertex buffer. + /// + /// Multiple meshes can be packed into a single vertex buffer (see + /// [`MeshAllocator`]). This value stores the offset of the first vertex in + /// this mesh in that buffer. + pub first_vertex_index: u32, + /// Padding. + pub pad_a: u32, + /// Padding. + pub pad_b: u32, + /// Padding. + pub pad_c: u32, } /// Information that has to be transferred from CPU to GPU in order to produce @@ -305,6 +320,18 @@ pub struct MeshInputUniform { /// /// This is used for TAA. If not present, this will be `u32::MAX`. pub previous_input_index: u32, + /// The index of this mesh's first vertex in the vertex buffer. + /// + /// Multiple meshes can be packed into a single vertex buffer (see + /// [`MeshAllocator`]). This value stores the offset of the first vertex in + /// this mesh in that buffer. + pub first_vertex_index: u32, + /// Padding. + pub pad_a: u32, + /// Padding. + pub pad_b: u32, + /// Padding. + pub pad_c: u32, } /// Information about each mesh instance needed to cull it on GPU. @@ -331,7 +358,11 @@ pub struct MeshCullingData { pub struct MeshCullingDataBuffer(RawBufferVec); impl MeshUniform { - pub fn new(mesh_transforms: &MeshTransforms, maybe_lightmap_uv_rect: Option) -> Self { + pub fn new( + mesh_transforms: &MeshTransforms, + first_vertex_index: u32, + maybe_lightmap_uv_rect: Option, + ) -> Self { let (local_from_world_transpose_a, local_from_world_transpose_b) = mesh_transforms.world_from_local.inverse_transpose_3x3(); Self { @@ -341,6 +372,10 @@ impl MeshUniform { local_from_world_transpose_a, local_from_world_transpose_b, flags: mesh_transforms.flags, + first_vertex_index, + pad_a: 0, + pad_b: 0, + pad_c: 0, } } } @@ -515,6 +550,14 @@ pub enum RenderMeshInstanceGpuQueue { GpuCulling(Vec<(Entity, RenderMeshInstanceGpuBuilder, MeshCullingData)>), } +/// The per-thread queues containing mesh instances, populated during the +/// extract phase. +/// +/// These are filled in [`extract_meshes_for_gpu_building`] and consumed in +/// [`collect_meshes_for_gpu_building`]. +#[derive(Resource, Default, Deref, DerefMut)] +pub struct RenderMeshInstanceGpuQueues(Parallel); + impl RenderMeshInstanceShared { fn from_components( previous_transform: Option<&PreviousGlobalTransform>, @@ -719,7 +762,14 @@ impl RenderMeshInstanceGpuBuilder { entity: Entity, render_mesh_instances: &mut EntityHashMap, current_input_buffer: &mut RawBufferVec, + mesh_allocator: &MeshAllocator, ) -> usize { + let first_vertex_index = match mesh_allocator.mesh_vertex_slice(&self.shared.mesh_asset_id) + { + Some(mesh_vertex_slice) => mesh_vertex_slice.range.start, + None => 0, + }; + // Push the mesh input uniform. let current_uniform_index = current_input_buffer.push(MeshInputUniform { world_from_local: self.world_from_local.to_transpose(), @@ -729,6 +779,10 @@ impl RenderMeshInstanceGpuBuilder { Some(previous_input_index) => previous_input_index.into(), None => u32::MAX, }, + first_vertex_index, + pad_a: 0, + pad_b: 0, + pad_c: 0, }); // Record the [`RenderMeshInstance`]. @@ -900,11 +954,7 @@ pub fn extract_meshes_for_cpu_building( pub fn extract_meshes_for_gpu_building( mut render_mesh_instances: ResMut, render_visibility_ranges: Res, - mut batched_instance_buffers: ResMut< - gpu_preprocessing::BatchedInstanceBuffers, - >, - mut mesh_culling_data_buffer: ResMut, - mut render_mesh_instance_queues: Local>, + mut render_mesh_instance_queues: ResMut, meshes_query: Extract< Query<( Entity, @@ -1004,13 +1054,6 @@ pub fn extract_meshes_for_gpu_building( queue.push(entity, gpu_mesh_instance_builder, gpu_mesh_culling_data); }, ); - - collect_meshes_for_gpu_building( - render_mesh_instances, - &mut batched_instance_buffers, - &mut mesh_culling_data_buffer, - &mut render_mesh_instance_queues, - ); } /// A system that sets the [`RenderMeshInstanceFlags`] for each mesh based on @@ -1044,22 +1087,28 @@ fn set_mesh_motion_vector_flags( /// Creates the [`RenderMeshInstanceGpu`]s and [`MeshInputUniform`]s when GPU /// mesh uniforms are built. -fn collect_meshes_for_gpu_building( - render_mesh_instances: &mut RenderMeshInstancesGpu, - batched_instance_buffers: &mut gpu_preprocessing::BatchedInstanceBuffers< - MeshUniform, - MeshInputUniform, +pub fn collect_meshes_for_gpu_building( + render_mesh_instances: ResMut, + batched_instance_buffers: ResMut< + gpu_preprocessing::BatchedInstanceBuffers, >, - mesh_culling_data_buffer: &mut MeshCullingDataBuffer, - render_mesh_instance_queues: &mut Parallel, + mut mesh_culling_data_buffer: ResMut, + mut render_mesh_instance_queues: ResMut, + mesh_allocator: Res, ) { + let RenderMeshInstances::GpuBuilding(ref mut render_mesh_instances) = + render_mesh_instances.into_inner() + else { + return; + }; + // Collect render mesh instances. Build up the uniform buffer. let gpu_preprocessing::BatchedInstanceBuffers { ref mut current_input_buffer, ref mut previous_input_buffer, .. - } = batched_instance_buffers; + } = batched_instance_buffers.into_inner(); // Swap buffers. mem::swap(current_input_buffer, previous_input_buffer); @@ -1076,8 +1125,9 @@ fn collect_meshes_for_gpu_building( for (entity, mesh_instance_builder) in queue.drain(..) { mesh_instance_builder.add_to( entity, - render_mesh_instances, + &mut *render_mesh_instances, current_input_buffer, + &mesh_allocator, ); } } @@ -1085,10 +1135,12 @@ fn collect_meshes_for_gpu_building( for (entity, mesh_instance_builder, mesh_culling_builder) in queue.drain(..) { let instance_data_index = mesh_instance_builder.add_to( entity, - render_mesh_instances, + &mut *render_mesh_instances, current_input_buffer, + &mesh_allocator, ); - let culling_data_index = mesh_culling_builder.add_to(mesh_culling_data_buffer); + let culling_data_index = + mesh_culling_builder.add_to(&mut mesh_culling_data_buffer); debug_assert_eq!(instance_data_index, culling_data_index); } } @@ -1220,7 +1272,7 @@ impl GetBatchData for MeshPipeline { type BufferData = MeshUniform; fn get_batch_data( - (mesh_instances, lightmaps, _, _): &SystemParamItem, + (mesh_instances, lightmaps, _, mesh_allocator): &SystemParamItem, entity: Entity, ) -> Option<(Self::BufferData, Option)> { let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else { @@ -1231,11 +1283,17 @@ impl GetBatchData for MeshPipeline { return None; }; let mesh_instance = mesh_instances.get(&entity)?; + let first_vertex_index = + match mesh_allocator.mesh_vertex_slice(&mesh_instance.mesh_asset_id) { + Some(mesh_vertex_slice) => mesh_vertex_slice.range.start, + None => 0, + }; let maybe_lightmap = lightmaps.render_lightmaps.get(&entity); Some(( MeshUniform::new( &mesh_instance.transforms, + first_vertex_index, maybe_lightmap.map(|lightmap| lightmap.uv_rect), ), mesh_instance.should_batch().then_some(( @@ -1277,7 +1335,7 @@ impl GetFullBatchData for MeshPipeline { } fn get_binned_batch_data( - (mesh_instances, lightmaps, _, _): &SystemParamItem, + (mesh_instances, lightmaps, _, mesh_allocator): &SystemParamItem, entity: Entity, ) -> Option { let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else { @@ -1287,10 +1345,16 @@ impl GetFullBatchData for MeshPipeline { return None; }; let mesh_instance = mesh_instances.get(&entity)?; + let first_vertex_index = + match mesh_allocator.mesh_vertex_slice(&mesh_instance.mesh_asset_id) { + Some(mesh_vertex_slice) => mesh_vertex_slice.range.start, + None => 0, + }; let maybe_lightmap = lightmaps.render_lightmaps.get(&entity); Some(MeshUniform::new( &mesh_instance.transforms, + first_vertex_index, maybe_lightmap.map(|lightmap| lightmap.uv_rect), )) } @@ -2354,7 +2418,7 @@ impl RenderCommand

for DrawMesh { } RenderMeshBufferInfo::NonIndexed => match indirect_parameters { None => { - pass.draw(0..gpu_mesh.vertex_count, batch_range.clone()); + pass.draw(vertex_buffer_slice.range, batch_range.clone()); } Some((indirect_parameters_offset, indirect_parameters_buffer)) => { pass.draw_indirect(indirect_parameters_buffer, indirect_parameters_offset); diff --git a/crates/bevy_pbr/src/render/mesh.wgsl b/crates/bevy_pbr/src/render/mesh.wgsl index f4386d97ed..7d617755ad 100644 --- a/crates/bevy_pbr/src/render/mesh.wgsl +++ b/crates/bevy_pbr/src/render/mesh.wgsl @@ -1,4 +1,5 @@ #import bevy_pbr::{ + mesh_bindings::mesh, mesh_functions, skinning, morph::morph, @@ -9,18 +10,21 @@ #ifdef MORPH_TARGETS fn morph_vertex(vertex_in: Vertex) -> Vertex { var vertex = vertex_in; + let first_vertex = mesh[vertex.instance_index].first_vertex_index; + let vertex_index = vertex.index - first_vertex; + let weight_count = bevy_pbr::morph::layer_count(); for (var i: u32 = 0u; i < weight_count; i ++) { let weight = bevy_pbr::morph::weight_at(i); if weight == 0.0 { continue; } - vertex.position += weight * morph(vertex.index, bevy_pbr::morph::position_offset, i); + vertex.position += weight * morph(vertex_index, bevy_pbr::morph::position_offset, i); #ifdef VERTEX_NORMALS - vertex.normal += weight * morph(vertex.index, bevy_pbr::morph::normal_offset, i); + vertex.normal += weight * morph(vertex_index, bevy_pbr::morph::normal_offset, i); #endif #ifdef VERTEX_TANGENTS - vertex.tangent += vec4(weight * morph(vertex.index, bevy_pbr::morph::tangent_offset, i), 0.0); + vertex.tangent += vec4(weight * morph(vertex_index, bevy_pbr::morph::tangent_offset, i), 0.0); #endif } return vertex; diff --git a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl index 02d8042d5f..6a5a1fcf06 100644 --- a/crates/bevy_pbr/src/render/mesh_preprocess.wgsl +++ b/crates/bevy_pbr/src/render/mesh_preprocess.wgsl @@ -22,6 +22,10 @@ struct MeshInput { // The index of this mesh's `MeshInput` in the `previous_input` array, if // applicable. If not present, this is `u32::MAX`. previous_input_index: u32, + first_vertex_index: u32, + pad_a: u32, + pad_b: u32, + pad_c: u32, } // Information about each mesh instance needed to cull it on GPU. @@ -186,4 +190,5 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3) { output[mesh_output_index].local_from_world_transpose_b = local_from_world_transpose_b; output[mesh_output_index].flags = current_input[input_index].flags; output[mesh_output_index].lightmap_uv_rect = current_input[input_index].lightmap_uv_rect; + output[mesh_output_index].first_vertex_index = current_input[input_index].first_vertex_index; } diff --git a/crates/bevy_pbr/src/render/mesh_types.wgsl b/crates/bevy_pbr/src/render/mesh_types.wgsl index 7b45bac0ca..57576a3bb3 100644 --- a/crates/bevy_pbr/src/render/mesh_types.wgsl +++ b/crates/bevy_pbr/src/render/mesh_types.wgsl @@ -15,6 +15,11 @@ struct Mesh { // 'flags' is a bit field indicating various options. u32 is 32 bits so we have up to 32 options. flags: u32, lightmap_uv_rect: vec2, + // The index of the mesh's first vertex in the vertex buffer. + first_vertex_index: u32, + pad_a: u32, + pad_b: u32, + pad_c: u32, }; #ifdef SKINNED diff --git a/crates/bevy_pbr/src/volumetric_fog/render.rs b/crates/bevy_pbr/src/volumetric_fog/render.rs index c1420f32a2..de158762d5 100644 --- a/crates/bevy_pbr/src/volumetric_fog/render.rs +++ b/crates/bevy_pbr/src/volumetric_fog/render.rs @@ -472,10 +472,14 @@ impl ViewNode for VolumetricFogNode { render_pass .set_index_buffer(*index_buffer_slice.buffer.slice(..), *index_format); - render_pass.draw_indexed(0..*count, 0, 0..1); + render_pass.draw_indexed( + index_buffer_slice.range.start..(index_buffer_slice.range.start + count), + vertex_buffer_slice.range.start as i32, + 0..1, + ); } RenderMeshBufferInfo::NonIndexed => { - render_pass.draw(0..render_mesh.vertex_count, 0..1); + render_pass.draw(vertex_buffer_slice.range, 0..1); } } }