Fix the example regressions from packed growable buffers. (#14375)

The "uberbuffers" PR #14257 caused some examples to fail intermittently
for different reasons:

1. `morph_targets` could fail because vertex displacements for morph
targets are keyed off the vertex index. With buffer packing, the vertex
index can vary based on the position in the buffer, which caused the
morph targets to be potentially incorrect. The solution is to include
the first vertex index with the `MeshUniform` (and `MeshInputUniform` if
GPU preprocessing is in use), so that the shader can calculate the true
vertex index before performing the morph operation. This results in
wasted space in `MeshUniform`, which is unfortunate, but we'll soon be
filling in the padding with the ID of the material when bindless
textures land, so this had to happen sooner or later anyhow.

Including the vertex index in the `MeshInputUniform` caused an ordering
problem. The `MeshInputUniform` was created during the extraction phase,
before the allocations occurred, so the extraction logic didn't know
where the mesh vertex data was going to end up. The solution is to move
the `MeshInputUniform` creation (the `collect_meshes_for_gpu_building`
system) to after the allocations phase. This should be better for
parallelism anyhow, because it allows the extraction phase to finish
quicker. It's also something we'll have to do for bindless in any event.

2. The `lines` and `fog_volumes` examples could fail because their
custom drawing nodes weren't updated to supply the vertex and index
offsets in their `draw_indexed` and `draw` calls. This commit fixes this
oversight.

Fixes #14366.
This commit is contained in:
Patrick Walton 2024-07-22 12:55:51 -06:00 committed by GitHub
parent d30391b583
commit d235d41af1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 124 additions and 38 deletions

View file

@ -138,7 +138,7 @@ pub fn extract_meshlet_meshes(
gpu_scene
.instance_uniforms
.get_mut()
.push(MeshUniform::new(&transforms, None));
.push(MeshUniform::new(&transforms, 0, None));
}
}

View file

@ -1,5 +1,6 @@
#import bevy_pbr::{
prepass_bindings,
mesh_bindings::mesh,
mesh_functions,
prepass_io::{Vertex, VertexOutput, FragmentOutput},
skinning,
@ -15,18 +16,21 @@
#ifdef MORPH_TARGETS
fn morph_vertex(vertex_in: Vertex) -> Vertex {
var vertex = vertex_in;
let first_vertex = mesh[vertex.instance_index].first_vertex_index;
let vertex_index = vertex.index - first_vertex;
let weight_count = morph::layer_count();
for (var i: u32 = 0u; i < weight_count; i ++) {
let weight = morph::weight_at(i);
if weight == 0.0 {
continue;
}
vertex.position += weight * morph::morph(vertex.index, morph::position_offset, i);
vertex.position += weight * morph::morph(vertex_index, morph::position_offset, i);
#ifdef VERTEX_NORMALS
vertex.normal += weight * morph::morph(vertex.index, morph::normal_offset, i);
vertex.normal += weight * morph::morph(vertex_index, morph::normal_offset, i);
#endif
#ifdef VERTEX_TANGENTS
vertex.tangent += vec4(weight * morph::morph(vertex.index, morph::tangent_offset, i), 0.0);
vertex.tangent += vec4(weight * morph::morph(vertex_index, morph::tangent_offset, i), 0.0);
#endif
}
return vertex;

View file

@ -186,8 +186,8 @@ impl Plugin for MeshRenderPlugin {
if use_gpu_instance_buffer_builder {
render_app
.init_resource::<gpu_preprocessing::BatchedInstanceBuffers<MeshUniform, MeshInputUniform>>(
)
.init_resource::<gpu_preprocessing::BatchedInstanceBuffers<MeshUniform, MeshInputUniform>>()
.init_resource::<RenderMeshInstanceGpuQueues>()
.add_systems(
ExtractSchedule,
extract_meshes_for_gpu_building.in_set(ExtractMeshesSet),
@ -200,6 +200,9 @@ impl Plugin for MeshRenderPlugin {
gpu_preprocessing::delete_old_work_item_buffers::<MeshPipeline>
.in_set(RenderSet::ManageViews)
.after(prepare_view_targets),
collect_meshes_for_gpu_building
.in_set(RenderSet::PrepareAssets)
.after(allocator::allocate_and_free_meshes),
),
);
} else {
@ -275,6 +278,18 @@ pub struct MeshUniform {
//
// (MSB: most significant bit; LSB: least significant bit.)
pub lightmap_uv_rect: UVec2,
/// The index of this mesh's first vertex in the vertex buffer.
///
/// Multiple meshes can be packed into a single vertex buffer (see
/// [`MeshAllocator`]). This value stores the offset of the first vertex in
/// this mesh in that buffer.
pub first_vertex_index: u32,
/// Padding.
pub pad_a: u32,
/// Padding.
pub pad_b: u32,
/// Padding.
pub pad_c: u32,
}
/// Information that has to be transferred from CPU to GPU in order to produce
@ -305,6 +320,18 @@ pub struct MeshInputUniform {
///
/// This is used for TAA. If not present, this will be `u32::MAX`.
pub previous_input_index: u32,
/// The index of this mesh's first vertex in the vertex buffer.
///
/// Multiple meshes can be packed into a single vertex buffer (see
/// [`MeshAllocator`]). This value stores the offset of the first vertex in
/// this mesh in that buffer.
pub first_vertex_index: u32,
/// Padding.
pub pad_a: u32,
/// Padding.
pub pad_b: u32,
/// Padding.
pub pad_c: u32,
}
/// Information about each mesh instance needed to cull it on GPU.
@ -331,7 +358,11 @@ pub struct MeshCullingData {
pub struct MeshCullingDataBuffer(RawBufferVec<MeshCullingData>);
impl MeshUniform {
pub fn new(mesh_transforms: &MeshTransforms, maybe_lightmap_uv_rect: Option<Rect>) -> Self {
pub fn new(
mesh_transforms: &MeshTransforms,
first_vertex_index: u32,
maybe_lightmap_uv_rect: Option<Rect>,
) -> Self {
let (local_from_world_transpose_a, local_from_world_transpose_b) =
mesh_transforms.world_from_local.inverse_transpose_3x3();
Self {
@ -341,6 +372,10 @@ impl MeshUniform {
local_from_world_transpose_a,
local_from_world_transpose_b,
flags: mesh_transforms.flags,
first_vertex_index,
pad_a: 0,
pad_b: 0,
pad_c: 0,
}
}
}
@ -515,6 +550,14 @@ pub enum RenderMeshInstanceGpuQueue {
GpuCulling(Vec<(Entity, RenderMeshInstanceGpuBuilder, MeshCullingData)>),
}
/// The per-thread queues containing mesh instances, populated during the
/// extract phase.
///
/// These are filled in [`extract_meshes_for_gpu_building`] and consumed in
/// [`collect_meshes_for_gpu_building`].
#[derive(Resource, Default, Deref, DerefMut)]
pub struct RenderMeshInstanceGpuQueues(Parallel<RenderMeshInstanceGpuQueue>);
impl RenderMeshInstanceShared {
fn from_components(
previous_transform: Option<&PreviousGlobalTransform>,
@ -719,7 +762,14 @@ impl RenderMeshInstanceGpuBuilder {
entity: Entity,
render_mesh_instances: &mut EntityHashMap<RenderMeshInstanceGpu>,
current_input_buffer: &mut RawBufferVec<MeshInputUniform>,
mesh_allocator: &MeshAllocator,
) -> usize {
let first_vertex_index = match mesh_allocator.mesh_vertex_slice(&self.shared.mesh_asset_id)
{
Some(mesh_vertex_slice) => mesh_vertex_slice.range.start,
None => 0,
};
// Push the mesh input uniform.
let current_uniform_index = current_input_buffer.push(MeshInputUniform {
world_from_local: self.world_from_local.to_transpose(),
@ -729,6 +779,10 @@ impl RenderMeshInstanceGpuBuilder {
Some(previous_input_index) => previous_input_index.into(),
None => u32::MAX,
},
first_vertex_index,
pad_a: 0,
pad_b: 0,
pad_c: 0,
});
// Record the [`RenderMeshInstance`].
@ -900,11 +954,7 @@ pub fn extract_meshes_for_cpu_building(
pub fn extract_meshes_for_gpu_building(
mut render_mesh_instances: ResMut<RenderMeshInstances>,
render_visibility_ranges: Res<RenderVisibilityRanges>,
mut batched_instance_buffers: ResMut<
gpu_preprocessing::BatchedInstanceBuffers<MeshUniform, MeshInputUniform>,
>,
mut mesh_culling_data_buffer: ResMut<MeshCullingDataBuffer>,
mut render_mesh_instance_queues: Local<Parallel<RenderMeshInstanceGpuQueue>>,
mut render_mesh_instance_queues: ResMut<RenderMeshInstanceGpuQueues>,
meshes_query: Extract<
Query<(
Entity,
@ -1004,13 +1054,6 @@ pub fn extract_meshes_for_gpu_building(
queue.push(entity, gpu_mesh_instance_builder, gpu_mesh_culling_data);
},
);
collect_meshes_for_gpu_building(
render_mesh_instances,
&mut batched_instance_buffers,
&mut mesh_culling_data_buffer,
&mut render_mesh_instance_queues,
);
}
/// A system that sets the [`RenderMeshInstanceFlags`] for each mesh based on
@ -1044,22 +1087,28 @@ fn set_mesh_motion_vector_flags(
/// Creates the [`RenderMeshInstanceGpu`]s and [`MeshInputUniform`]s when GPU
/// mesh uniforms are built.
fn collect_meshes_for_gpu_building(
render_mesh_instances: &mut RenderMeshInstancesGpu,
batched_instance_buffers: &mut gpu_preprocessing::BatchedInstanceBuffers<
MeshUniform,
MeshInputUniform,
pub fn collect_meshes_for_gpu_building(
render_mesh_instances: ResMut<RenderMeshInstances>,
batched_instance_buffers: ResMut<
gpu_preprocessing::BatchedInstanceBuffers<MeshUniform, MeshInputUniform>,
>,
mesh_culling_data_buffer: &mut MeshCullingDataBuffer,
render_mesh_instance_queues: &mut Parallel<RenderMeshInstanceGpuQueue>,
mut mesh_culling_data_buffer: ResMut<MeshCullingDataBuffer>,
mut render_mesh_instance_queues: ResMut<RenderMeshInstanceGpuQueues>,
mesh_allocator: Res<MeshAllocator>,
) {
let RenderMeshInstances::GpuBuilding(ref mut render_mesh_instances) =
render_mesh_instances.into_inner()
else {
return;
};
// Collect render mesh instances. Build up the uniform buffer.
let gpu_preprocessing::BatchedInstanceBuffers {
ref mut current_input_buffer,
ref mut previous_input_buffer,
..
} = batched_instance_buffers;
} = batched_instance_buffers.into_inner();
// Swap buffers.
mem::swap(current_input_buffer, previous_input_buffer);
@ -1076,8 +1125,9 @@ fn collect_meshes_for_gpu_building(
for (entity, mesh_instance_builder) in queue.drain(..) {
mesh_instance_builder.add_to(
entity,
render_mesh_instances,
&mut *render_mesh_instances,
current_input_buffer,
&mesh_allocator,
);
}
}
@ -1085,10 +1135,12 @@ fn collect_meshes_for_gpu_building(
for (entity, mesh_instance_builder, mesh_culling_builder) in queue.drain(..) {
let instance_data_index = mesh_instance_builder.add_to(
entity,
render_mesh_instances,
&mut *render_mesh_instances,
current_input_buffer,
&mesh_allocator,
);
let culling_data_index = mesh_culling_builder.add_to(mesh_culling_data_buffer);
let culling_data_index =
mesh_culling_builder.add_to(&mut mesh_culling_data_buffer);
debug_assert_eq!(instance_data_index, culling_data_index);
}
}
@ -1220,7 +1272,7 @@ impl GetBatchData for MeshPipeline {
type BufferData = MeshUniform;
fn get_batch_data(
(mesh_instances, lightmaps, _, _): &SystemParamItem<Self::Param>,
(mesh_instances, lightmaps, _, mesh_allocator): &SystemParamItem<Self::Param>,
entity: Entity,
) -> Option<(Self::BufferData, Option<Self::CompareData>)> {
let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else {
@ -1231,11 +1283,17 @@ impl GetBatchData for MeshPipeline {
return None;
};
let mesh_instance = mesh_instances.get(&entity)?;
let first_vertex_index =
match mesh_allocator.mesh_vertex_slice(&mesh_instance.mesh_asset_id) {
Some(mesh_vertex_slice) => mesh_vertex_slice.range.start,
None => 0,
};
let maybe_lightmap = lightmaps.render_lightmaps.get(&entity);
Some((
MeshUniform::new(
&mesh_instance.transforms,
first_vertex_index,
maybe_lightmap.map(|lightmap| lightmap.uv_rect),
),
mesh_instance.should_batch().then_some((
@ -1277,7 +1335,7 @@ impl GetFullBatchData for MeshPipeline {
}
fn get_binned_batch_data(
(mesh_instances, lightmaps, _, _): &SystemParamItem<Self::Param>,
(mesh_instances, lightmaps, _, mesh_allocator): &SystemParamItem<Self::Param>,
entity: Entity,
) -> Option<Self::BufferData> {
let RenderMeshInstances::CpuBuilding(ref mesh_instances) = **mesh_instances else {
@ -1287,10 +1345,16 @@ impl GetFullBatchData for MeshPipeline {
return None;
};
let mesh_instance = mesh_instances.get(&entity)?;
let first_vertex_index =
match mesh_allocator.mesh_vertex_slice(&mesh_instance.mesh_asset_id) {
Some(mesh_vertex_slice) => mesh_vertex_slice.range.start,
None => 0,
};
let maybe_lightmap = lightmaps.render_lightmaps.get(&entity);
Some(MeshUniform::new(
&mesh_instance.transforms,
first_vertex_index,
maybe_lightmap.map(|lightmap| lightmap.uv_rect),
))
}
@ -2354,7 +2418,7 @@ impl<P: PhaseItem> RenderCommand<P> for DrawMesh {
}
RenderMeshBufferInfo::NonIndexed => match indirect_parameters {
None => {
pass.draw(0..gpu_mesh.vertex_count, batch_range.clone());
pass.draw(vertex_buffer_slice.range, batch_range.clone());
}
Some((indirect_parameters_offset, indirect_parameters_buffer)) => {
pass.draw_indirect(indirect_parameters_buffer, indirect_parameters_offset);

View file

@ -1,4 +1,5 @@
#import bevy_pbr::{
mesh_bindings::mesh,
mesh_functions,
skinning,
morph::morph,
@ -9,18 +10,21 @@
#ifdef MORPH_TARGETS
fn morph_vertex(vertex_in: Vertex) -> Vertex {
var vertex = vertex_in;
let first_vertex = mesh[vertex.instance_index].first_vertex_index;
let vertex_index = vertex.index - first_vertex;
let weight_count = bevy_pbr::morph::layer_count();
for (var i: u32 = 0u; i < weight_count; i ++) {
let weight = bevy_pbr::morph::weight_at(i);
if weight == 0.0 {
continue;
}
vertex.position += weight * morph(vertex.index, bevy_pbr::morph::position_offset, i);
vertex.position += weight * morph(vertex_index, bevy_pbr::morph::position_offset, i);
#ifdef VERTEX_NORMALS
vertex.normal += weight * morph(vertex.index, bevy_pbr::morph::normal_offset, i);
vertex.normal += weight * morph(vertex_index, bevy_pbr::morph::normal_offset, i);
#endif
#ifdef VERTEX_TANGENTS
vertex.tangent += vec4(weight * morph(vertex.index, bevy_pbr::morph::tangent_offset, i), 0.0);
vertex.tangent += vec4(weight * morph(vertex_index, bevy_pbr::morph::tangent_offset, i), 0.0);
#endif
}
return vertex;

View file

@ -22,6 +22,10 @@ struct MeshInput {
// The index of this mesh's `MeshInput` in the `previous_input` array, if
// applicable. If not present, this is `u32::MAX`.
previous_input_index: u32,
first_vertex_index: u32,
pad_a: u32,
pad_b: u32,
pad_c: u32,
}
// Information about each mesh instance needed to cull it on GPU.
@ -186,4 +190,5 @@ fn main(@builtin(global_invocation_id) global_invocation_id: vec3<u32>) {
output[mesh_output_index].local_from_world_transpose_b = local_from_world_transpose_b;
output[mesh_output_index].flags = current_input[input_index].flags;
output[mesh_output_index].lightmap_uv_rect = current_input[input_index].lightmap_uv_rect;
output[mesh_output_index].first_vertex_index = current_input[input_index].first_vertex_index;
}

View file

@ -15,6 +15,11 @@ struct Mesh {
// 'flags' is a bit field indicating various options. u32 is 32 bits so we have up to 32 options.
flags: u32,
lightmap_uv_rect: vec2<u32>,
// The index of the mesh's first vertex in the vertex buffer.
first_vertex_index: u32,
pad_a: u32,
pad_b: u32,
pad_c: u32,
};
#ifdef SKINNED

View file

@ -472,10 +472,14 @@ impl ViewNode for VolumetricFogNode {
render_pass
.set_index_buffer(*index_buffer_slice.buffer.slice(..), *index_format);
render_pass.draw_indexed(0..*count, 0, 0..1);
render_pass.draw_indexed(
index_buffer_slice.range.start..(index_buffer_slice.range.start + count),
vertex_buffer_slice.range.start as i32,
0..1,
);
}
RenderMeshBufferInfo::NonIndexed => {
render_pass.draw(0..render_mesh.vertex_count, 0..1);
render_pass.draw(vertex_buffer_slice.range, 0..1);
}
}
}