mirror of
https://github.com/bevyengine/bevy
synced 2024-12-26 21:13:09 +00:00
a0faf9cd01
### Builder changes
- Increased meshlet max vertices/triangles from 64v/64t to 255v/128t (meshoptimizer won't allow 256v, sadly). This gives us a much greater percentage of meshlets with the max triangle count (128). Still not perfect: we still end up with some tiny <=10 triangle meshlets that never really get simplified, but it's progress.
- Removed the error target limit. Now we allow meshoptimizer to simplify as much as possible. There is no reason to cap this, as the cluster culling code will choose a good LOD level anyway. Again, this leads to higher quality LOD trees.
- After some discussion and consulting the Nanite slides again, changed meshlet group error from _adding_ the max child's error to the group error, to doing `group_error = max(group_error, max_child_error)`. Error is already cumulative between LODs, as the edges we're collapsing during simplification get longer each time.
- Bumped the 65% simplification threshold to allow up to 95% of the original geometry (e.g. accept simplification as valid even if we only simplified 5% of the triangles). This gives us closer to log2(initial_meshlet_count) LOD levels, and fewer meshlet roots in the DAG.

Still more work to be done in the future here. Maybe trying METIS for meshlet building instead of meshoptimizer. Using ~8 clusters per group instead of ~4 might also make a big difference. The Nanite slides say that they have 8-32 meshlets per group, suggesting some kind of heuristic. Unfortunately, meshopt's compute_cluster_bounds won't work with large groups at the moment (https://github.com/zeux/meshoptimizer/discussions/750#discussioncomment-10562641), so this is hard to test.

Based on discussion from https://github.com/bevyengine/bevy/discussions/14998, https://github.com/zeux/meshoptimizer/discussions/750, and Discord.
### Runtime changes
- cluster:triangle packed IDs are now stored as 25:7 bits instead of 26:6, as the max triangles per cluster is now 128 instead of 64.
- Hardware raster now spawns 128 * 3 vertices instead of 64 * 3 vertices to account for the new max triangles limit.
- Hardware raster now outputs NaN triangles (0 / 0) instead of zero-positioned triangles for extra vertex invocations over the cluster triangle count. I don't think this really makes a difference, but I did it anyway.
- Software raster now uses 128 threads per workgroup instead of 64. Each thread now loads, projects, and caches a vertex (vertices 0-127), and then, if needed, does so again (vertices 128-254). Each thread then rasterizes one of 128 triangles.
- Fixed a bug with `needs_dispatch_remap`. I had the condition backwards in my last PR; I probably committed it by accident after testing the non-default code path on my GPU.
203 lines
8.4 KiB
WebGPU Shading Language
203 lines
8.4 KiB
WebGPU Shading Language
#define_import_path bevy_pbr::meshlet_visibility_buffer_resolve
|
|
|
|
#import bevy_pbr::{
|
|
meshlet_bindings::{
|
|
meshlet_visibility_buffer,
|
|
meshlet_cluster_meshlet_ids,
|
|
meshlets,
|
|
meshlet_vertex_ids,
|
|
meshlet_vertex_data,
|
|
meshlet_cluster_instance_ids,
|
|
meshlet_instance_uniforms,
|
|
get_meshlet_index,
|
|
unpack_meshlet_vertex,
|
|
},
|
|
mesh_view_bindings::view,
|
|
mesh_functions::{mesh_position_local_to_world, sign_determinant_model_3x3m},
|
|
mesh_types::{Mesh, MESH_FLAGS_SIGN_DETERMINANT_MODEL_3X3_BIT},
|
|
view_transformations::{position_world_to_clip, frag_coord_to_ndc},
|
|
}
|
|
#import bevy_render::maths::{affine3_to_square, mat2x4_f32_to_mat3x3_unpack}
|
|
|
|
#ifdef PREPASS_FRAGMENT
|
|
#ifdef MOTION_VECTOR_PREPASS
|
|
#import bevy_pbr::{
|
|
prepass_bindings::previous_view_uniforms,
|
|
pbr_prepass_functions::calculate_motion_vector,
|
|
}
|
|
#endif
|
|
#endif
|
|
|
|
/// Functions to be used by materials for reading from a meshlet visibility buffer texture.
|
|
|
|
#ifdef MESHLET_MESH_MATERIAL_PASS
|
|
// Result of `compute_partial_derivatives`: the barycentric weights of a pixel inside a
// triangle, plus how those weights change when stepping one pixel along each screen axis.
struct PartialDerivatives {
|
|
// Perspective-corrected barycentric weights of the pixel, one per triangle vertex.
barycentrics: vec3<f32>,
|
|
// Change in `barycentrics` when moving one pixel along screen-space x.
ddx: vec3<f32>,
|
|
// Change in `barycentrics` when moving one pixel along screen-space y.
ddy: vec3<f32>,
|
|
}
|
|
|
|
// https://github.com/ConfettiFX/The-Forge/blob/2d453f376ef278f66f97cbaf36c0d12e4361e275/Examples_3/Visibility_Buffer/src/Shaders/FSL/visibilityBuffer_shade.frag.fsl#L83-L139
|
|
// Computes perspective-corrected barycentric weights for a pixel inside a triangle, together
// with the per-pixel change of those weights along screen x and y.
//
// - `vertex_clip_positions`: clip-space positions of the three triangle vertices.
// - `ndc_uv`: xy position of the pixel, in the same NDC space as `clip.xy / clip.w`.
// - `screen_size`: width and height used to convert per-NDC-unit rates into per-pixel steps.
//
// NOTE(review): there is no guard against a zero `w` or a zero-area triangle; both would
// produce a division by zero below. Presumably callers only resolve pixels covered by a
// rasterized triangle - confirm.
fn compute_partial_derivatives(vertex_clip_positions: array<vec4<f32>, 3>, ndc_uv: vec2<f32>, screen_size: vec2<f32>) -> PartialDerivatives {
|
|
var result: PartialDerivatives;
|
|
|
|
// Reciprocal clip-space w of each vertex, then the perspective divide of each vertex's xy.
let inv_w = 1.0 / vec3(vertex_clip_positions[0].w, vertex_clip_positions[1].w, vertex_clip_positions[2].w);
|
|
let ndc_0 = vertex_clip_positions[0].xy * inv_w[0];
|
|
let ndc_1 = vertex_clip_positions[1].xy * inv_w[1];
|
|
let ndc_2 = vertex_clip_positions[2].xy * inv_w[2];
|
|
|
|
// Reciprocal determinant of the 2x2 matrix formed by two triangle edges in NDC.
let inv_det = 1.0 / determinant(mat2x2(ndc_2 - ndc_1, ndc_0 - ndc_1));
|
|
// Per-vertex rates of change along NDC x and y, each weighted by that vertex's 1/w.
// At this point these are rates of (barycentric / w), not of the final barycentrics.
result.ddx = vec3(ndc_1.y - ndc_2.y, ndc_2.y - ndc_0.y, ndc_0.y - ndc_1.y) * inv_det * inv_w;
|
|
result.ddy = vec3(ndc_2.x - ndc_1.x, ndc_0.x - ndc_2.x, ndc_1.x - ndc_0.x) * inv_det * inv_w;
|
|
|
|
// Sum of the three per-vertex rates; used below as the rate of change of the interpolated 1/w.
var ddx_sum = dot(result.ddx, vec3(1.0));
|
|
var ddy_sum = dot(result.ddy, vec3(1.0));
|
|
|
|
// Interpolate 1/w at the pixel by stepping linearly from vertex 0, then invert to get w.
let delta_v = ndc_uv - ndc_0;
|
|
let interp_inv_w = inv_w.x + delta_v.x * ddx_sum + delta_v.y * ddy_sum;
|
|
let interp_w = 1.0 / interp_inv_w;
|
|
|
|
// Step each (barycentric / w) term linearly from vertex 0 and multiply by the interpolated w.
// Only the first component carries a starting value (`inv_w.x`); the other two start at zero.
result.barycentrics = vec3(
|
|
interp_w * (delta_v.x * result.ddx.x + delta_v.y * result.ddy.x + inv_w.x),
|
|
interp_w * (delta_v.x * result.ddx.y + delta_v.y * result.ddy.y),
|
|
interp_w * (delta_v.x * result.ddx.z + delta_v.y * result.ddy.z),
|
|
);
|
|
|
|
// Rescale the rates from "per NDC unit" to "per pixel".
// Presumably the factor 2.0 reflects NDC spanning 2 units across the screen - confirm.
result.ddx *= 2.0 / screen_size.x;
|
|
result.ddy *= 2.0 / screen_size.y;
|
|
ddx_sum *= 2.0 / screen_size.x;
|
|
ddy_sum *= 2.0 / screen_size.y;
|
|
|
|
// Interpolated w at the neighbouring pixel one step along x, and one step along y.
let interp_ddx_w = 1.0 / (interp_inv_w + ddx_sum);
|
|
let interp_ddy_w = 1.0 / (interp_inv_w + ddy_sum);
|
|
|
|
// Finite difference: barycentrics at the neighbouring pixel minus barycentrics at this pixel.
// This overwrites the intermediate rates stored in `result.ddx` / `result.ddy`.
result.ddx = interp_ddx_w * (result.barycentrics * interp_inv_w + result.ddx) - result.barycentrics;
|
|
result.ddy = interp_ddy_w * (result.barycentrics * interp_inv_w + result.ddy) - result.barycentrics;
|
|
return result;
|
|
}
|
|
|
|
// Interpolated per-pixel attributes produced by `resolve_vertex_output`.
// `resolve_vertex_output` builds this struct positionally, so the field order here must
// match the argument order of that constructor call.
struct VertexOutput {
|
|
// The fragment coordinate that was passed to `resolve_vertex_output`, unchanged.
position: vec4<f32>,
|
|
// Barycentric blend of the three vertices' world-space positions.
world_position: vec4<f32>,
|
|
// Barycentric blend of the three world-space vertex normals (not re-normalized after blending).
world_normal: vec3<f32>,
|
|
// Barycentric blend of the three vertex UVs.
uv: vec2<f32>,
|
|
// Change in `uv` for a one-pixel step along screen x.
ddx_uv: vec2<f32>,
|
|
// Change in `uv` for a one-pixel step along screen y.
ddy_uv: vec2<f32>,
|
|
// Barycentric blend of the three world-space vertex tangents.
world_tangent: vec4<f32>,
|
|
// Copy of the owning instance uniform's `flags` field.
mesh_flags: u32,
|
|
// Cluster ID decoded from the upper bits of the packed visibility buffer value.
cluster_id: u32,
|
|
// The motion vector field only exists when both shader defs below are set.
#ifdef PREPASS_FRAGMENT
|
|
#ifdef MOTION_VECTOR_PREPASS
|
|
// Result of `calculate_motion_vector` for the current and previous world positions.
motion_vector: vec2<f32>,
|
|
#endif
|
|
#endif
|
|
}
|
|
|
|
/// Resolve the visibility buffer entry under `frag_coord` into a `VertexOutput`.
///
/// Decodes the packed cluster/triangle ID stored for the pixel, fetches the three vertices of
/// that triangle, transforms them with the owning instance's uniform, and blends the vertex
/// attributes using the barycentrics returned by `compute_partial_derivatives`.
fn resolve_vertex_output(frag_coord: vec4<f32>) -> VertexOutput {
    // Flatten the 2D pixel coordinate into a 1D index into the visibility buffer.
    let pixel_index = u32(frag_coord.y) * u32(view.viewport.z) + u32(frag_coord.x);
    let visibility = u32(meshlet_visibility_buffer[pixel_index]); // TODO: Might be faster to load the correct u32 directly

    // The low 7 bits select the triangle within the cluster; the remaining bits are the cluster ID.
    let triangle_id = extractBits(visibility, 0u, 7u);
    let cluster_id = visibility >> 7u;

    let meshlet_id = meshlet_cluster_meshlet_ids[cluster_id];
    let meshlet = meshlets[meshlet_id];

    // Walk index -> meshlet-local vertex slot -> vertex ID -> unpacked vertex for each corner.
    let first_index_id = meshlet.start_index_id + (triangle_id * 3u);
    let index_ids = first_index_id + vec3(0u, 1u, 2u);
    let local_indices = vec3(
        get_meshlet_index(index_ids.x),
        get_meshlet_index(index_ids.y),
        get_meshlet_index(index_ids.z),
    );
    let indices = meshlet.start_vertex_id + local_indices;
    let vertex_ids = vec3(
        meshlet_vertex_ids[indices.x],
        meshlet_vertex_ids[indices.y],
        meshlet_vertex_ids[indices.z],
    );
    let vertex_a = unpack_meshlet_vertex(meshlet_vertex_data[vertex_ids.x]);
    let vertex_b = unpack_meshlet_vertex(meshlet_vertex_data[vertex_ids.y]);
    let vertex_c = unpack_meshlet_vertex(meshlet_vertex_data[vertex_ids.z]);

    // Declared as `var` because `normal_local_to_world` takes a pointer to it.
    let instance_id = meshlet_cluster_instance_ids[cluster_id];
    var instance_uniform = meshlet_instance_uniforms[instance_id];
    let mesh_flags = instance_uniform.flags;

    let world_from_local = affine3_to_square(instance_uniform.world_from_local);
    let world_a = mesh_position_local_to_world(world_from_local, vec4(vertex_a.position, 1.0));
    let world_b = mesh_position_local_to_world(world_from_local, vec4(vertex_b.position, 1.0));
    let world_c = mesh_position_local_to_world(world_from_local, vec4(vertex_c.position, 1.0));

    // Barycentric weights (and their per-pixel deltas) of this pixel inside the triangle.
    let clip_positions = array(
        position_world_to_clip(world_a.xyz),
        position_world_to_clip(world_b.xyz),
        position_world_to_clip(world_c.xyz),
    );
    let pixel_ndc = frag_coord_to_ndc(frag_coord).xy;
    let derivatives = compute_partial_derivatives(clip_positions, pixel_ndc, view.viewport.zw);
    let weights = derivatives.barycentrics;

    // Each attribute is a matrix of the three per-vertex values multiplied by the weights.
    var resolved: VertexOutput;
    resolved.position = frag_coord;
    resolved.world_position = mat3x4(world_a, world_b, world_c) * weights;
    resolved.world_normal = mat3x3(
        normal_local_to_world(vertex_a.normal, &instance_uniform),
        normal_local_to_world(vertex_b.normal, &instance_uniform),
        normal_local_to_world(vertex_c.normal, &instance_uniform),
    ) * weights;

    let corner_uvs = mat3x2(vertex_a.uv, vertex_b.uv, vertex_c.uv);
    resolved.uv = corner_uvs * weights;
    resolved.ddx_uv = corner_uvs * derivatives.ddx;
    resolved.ddy_uv = corner_uvs * derivatives.ddy;

    resolved.world_tangent = mat3x4(
        tangent_local_to_world(vertex_a.tangent, world_from_local, mesh_flags),
        tangent_local_to_world(vertex_b.tangent, world_from_local, mesh_flags),
        tangent_local_to_world(vertex_c.tangent, world_from_local, mesh_flags),
    ) * weights;
    resolved.mesh_flags = mesh_flags;
    resolved.cluster_id = cluster_id;

#ifdef PREPASS_FRAGMENT
#ifdef MOTION_VECTOR_PREPASS
    // Re-transform the same vertices with the previous transform to get the previous position.
    let previous_world_from_local = affine3_to_square(instance_uniform.previous_world_from_local);
    let previous_world_a = mesh_position_local_to_world(previous_world_from_local, vec4(vertex_a.position, 1.0));
    let previous_world_b = mesh_position_local_to_world(previous_world_from_local, vec4(vertex_b.position, 1.0));
    let previous_world_c = mesh_position_local_to_world(previous_world_from_local, vec4(vertex_c.position, 1.0));
    let previous_world_position = mat3x4(previous_world_a, previous_world_b, previous_world_c) * weights;
    resolved.motion_vector = calculate_motion_vector(resolved.world_position, previous_world_position);
#endif
#endif

    return resolved;
}
|
|
|
|
/// Transform a local-space vertex normal into a normalized world-space normal.
///
/// The matrix applied is unpacked from the instance uniform's `local_from_world_transpose_a`
/// and `local_from_world_transpose_b` fields. An all-zero normal is returned unchanged,
/// presumably so that a zero-length vector is never passed to `normalize`.
fn normal_local_to_world(vertex_normal: vec3<f32>, instance_uniform: ptr<function, Mesh>) -> vec3<f32> {
    // Guard: nothing to transform when every component is exactly zero.
    if all(vertex_normal == vec3<f32>(0.0)) {
        return vertex_normal;
    }

    let normal_matrix = mat2x4_f32_to_mat3x3_unpack(
        (*instance_uniform).local_from_world_transpose_a,
        (*instance_uniform).local_from_world_transpose_b,
    );
    return normalize(normal_matrix * vertex_normal);
}
|
|
|
|
/// Transform a local-space vertex tangent into world space.
///
/// The xyz direction is multiplied by the upper-left 3x3 of `world_from_local` and normalized.
/// The w component is multiplied by `sign_determinant_model_3x3m(mesh_flags)`. An all-zero
/// tangent is returned unchanged.
fn tangent_local_to_world(vertex_tangent: vec4<f32>, world_from_local: mat4x4<f32>, mesh_flags: u32) -> vec4<f32> {
    // Guard: nothing to transform when every component is exactly zero.
    if all(vertex_tangent == vec4<f32>(0.0)) {
        return vertex_tangent;
    }

    // Take only the xyz of the first three columns, dropping the fourth column and row.
    let linear_part = mat3x3<f32>(
        world_from_local[0].xyz,
        world_from_local[1].xyz,
        world_from_local[2].xyz,
    );
    let world_direction = normalize(linear_part * vertex_tangent.xyz);
    let signed_w = vertex_tangent.w * sign_determinant_model_3x3m(mesh_flags);
    return vec4<f32>(world_direction, signed_w);
}
|
|
#endif
|