diff --git a/Cargo.toml b/Cargo.toml index 4112429b3e..0a0ea25aab 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1075,7 +1075,7 @@ setup = [ "curl", "-o", "assets/models/bunny.meshlet_mesh", - "https://raw.githubusercontent.com/JMS55/bevy_meshlet_asset/b6c712cfc87c65de419f856845401aba336a7bcd/bunny.meshlet_mesh", + "https://raw.githubusercontent.com/JMS55/bevy_meshlet_asset/e3da1533b4c69fb967f233c817e9b0921134d317/bunny.meshlet_mesh", ], ] diff --git a/crates/bevy_pbr/src/meshlet/from_mesh.rs b/crates/bevy_pbr/src/meshlet/from_mesh.rs index 047b1a492e..660a5bdeea 100644 --- a/crates/bevy_pbr/src/meshlet/from_mesh.rs +++ b/crates/bevy_pbr/src/meshlet/from_mesh.rs @@ -7,7 +7,7 @@ use bevy_utils::HashMap; use itertools::Itertools; use meshopt::{ build_meshlets, compute_cluster_bounds, compute_meshlet_bounds, ffi::meshopt_Bounds, simplify, - simplify_scale, Meshlets, SimplifyOptions, VertexDataAdapter, + Meshlets, SimplifyOptions, VertexDataAdapter, }; use metis::Graph; use smallvec::SmallVec; @@ -49,11 +49,9 @@ impl MeshletMesh { }, }) .collect::>(); - let mesh_scale = simplify_scale(&vertices); // Build further LODs let mut simplification_queue = 0..meshlets.len(); - let mut lod_level = 1; while simplification_queue.len() > 1 { // For each meshlet build a list of connected meshlets (meshlets that share a triangle edge) let connected_meshlets_per_meshlet = @@ -70,19 +68,14 @@ impl MeshletMesh { for group_meshlets in groups.into_iter().filter(|group| group.len() > 1) { // Simplify the group to ~50% triangle count - let Some((simplified_group_indices, mut group_error)) = simplify_meshlet_groups( - &group_meshlets, - &meshlets, - &vertices, - lod_level, - mesh_scale, - ) else { + let Some((simplified_group_indices, mut group_error)) = + simplify_meshlet_group(&group_meshlets, &meshlets, &vertices) + else { continue; }; - // Add the maximum child error to the parent error to make parent error cumulative from LOD 0 - // (we're currently building the parent from its children) - group_error += group_meshlets.iter().fold(0.0f32, |acc, meshlet_id| { + // Force parent error to be >= child error (we're currently building the parent from its children) + group_error = group_meshlets.iter().fold(group_error, |acc, meshlet_id| { acc.max(bounding_spheres[*meshlet_id].self_lod.radius) }); @@ -99,7 +92,7 @@ impl MeshletMesh { } // Build new meshlets using the simplified group - let new_meshlets_count = split_simplified_groups_into_new_meshlets( + let new_meshlets_count = split_simplified_group_into_new_meshlets( &simplified_group_indices, &vertices, &mut meshlets, @@ -125,7 +118,6 @@ impl MeshletMesh { } simplification_queue = next_lod_start..meshlets.len(); - lod_level += 1; } // Convert meshopt_Meshlet data to a custom format @@ -172,7 +164,7 @@ fn validate_input_mesh(mesh: &Mesh) -> Result, MeshToMeshletMeshC } fn compute_meshlets(indices: &[u32], vertices: &VertexDataAdapter) -> Meshlets { - build_meshlets(indices, vertices, 64, 64, 0.0) + build_meshlets(indices, vertices, 255, 128, 0.0) // Meshoptimizer won't currently let us do 256 vertices } fn find_connected_meshlets( @@ -252,7 +244,7 @@ fn group_meshlets( xadj.push(adjncy.len() as i32); let mut group_per_meshlet = vec![0; simplification_queue.len()]; - let partition_count = simplification_queue.len().div_ceil(4); + let partition_count = simplification_queue.len().div_ceil(4); // TODO: Nanite uses groups of 8-32, probably based on some kind of heuristic Graph::new(1, partition_count as i32, &xadj, &adjncy) .unwrap() .set_adjwgt(&adjwgt) @@ -267,12 +259,10 @@ fn group_meshlets( groups } -fn simplify_meshlet_groups( +fn simplify_meshlet_group( group_meshlets: &[usize], meshlets: &Meshlets, vertices: &VertexDataAdapter<'_>, - lod_level: u32, - mesh_scale: f32, ) -> Option<(Vec, f32)> { // Build a new index buffer into the mesh vertex data by combining all meshlet data in the group let mut group_indices = Vec::new(); @@ -283,11 +273,6 @@ fn simplify_meshlet_groups( } } - // Allow more deformation for high LOD levels (1% at LOD 1, 10% at LOD 20+) - let t = (lod_level - 1) as f32 / 19.0; - let target_error_relative = 0.1 * t + 0.01 * (1.0 - t); - let target_error = target_error_relative * mesh_scale; - // Simplify the group to ~50% triangle count // TODO: Simplify using vertex attributes let mut error = 0.0; @@ -295,13 +280,13 @@ fn simplify_meshlet_groups( &group_indices, vertices, group_indices.len() / 2, - target_error, - SimplifyOptions::LockBorder | SimplifyOptions::Sparse | SimplifyOptions::ErrorAbsolute, + f32::MAX, + SimplifyOptions::LockBorder | SimplifyOptions::Sparse | SimplifyOptions::ErrorAbsolute, // TODO: Specify manual vertex locks instead of meshopt's overly-strict locks Some(&mut error), ); - // Check if we were able to simplify to at least 65% triangle count - if simplified_group_indices.len() as f32 / group_indices.len() as f32 > 0.65 { + // Check if we were able to simplify at least a little (95% of the original triangle count) + if simplified_group_indices.len() as f32 / group_indices.len() as f32 > 0.95 { return None; } @@ -311,7 +296,7 @@ fn simplify_meshlet_groups( Some((simplified_group_indices, error)) } -fn split_simplified_groups_into_new_meshlets( +fn split_simplified_group_into_new_meshlets( simplified_group_indices: &[u32], vertices: &VertexDataAdapter<'_>, meshlets: &mut Meshlets, diff --git a/crates/bevy_pbr/src/meshlet/resolve_render_targets.wgsl b/crates/bevy_pbr/src/meshlet/resolve_render_targets.wgsl index b54dbaec53..3c0cfcf943 100644 --- a/crates/bevy_pbr/src/meshlet/resolve_render_targets.wgsl +++ b/crates/bevy_pbr/src/meshlet/resolve_render_targets.wgsl @@ -31,7 +31,7 @@ fn resolve_material_depth(in: FullscreenVertexOutput) -> @builtin(frag_depth) f3 let depth = visibility >> 32u; if depth == 0lu { return 0.0; } - let cluster_id = u32(visibility) >> 6u; + let cluster_id = u32(visibility) >> 7u; let instance_id = meshlet_cluster_instance_ids[cluster_id]; let material_id = meshlet_instance_material_ids[instance_id]; return f32(material_id) / 65535.0; diff --git a/crates/bevy_pbr/src/meshlet/resource_manager.rs b/crates/bevy_pbr/src/meshlet/resource_manager.rs index 1d6ada2a2a..297987ef84 100644 --- a/crates/bevy_pbr/src/meshlet/resource_manager.rs +++ b/crates/bevy_pbr/src/meshlet/resource_manager.rs @@ -63,7 +63,7 @@ pub struct ResourceManager { impl ResourceManager { pub fn new(cluster_buffer_slots: u32, render_device: &RenderDevice) -> Self { let needs_dispatch_remap = - cluster_buffer_slots < render_device.limits().max_compute_workgroups_per_dimension; + cluster_buffer_slots > render_device.limits().max_compute_workgroups_per_dimension; Self { visibility_buffer_raster_clusters: render_device.create_buffer(&BufferDescriptor { @@ -472,7 +472,7 @@ pub fn prepare_meshlet_per_frame_resources( .create_buffer_with_data(&BufferInitDescriptor { label: Some("meshlet_visibility_buffer_hardware_raster_indirect_args_first"), contents: DrawIndirectArgs { - vertex_count: 64 * 3, + vertex_count: 128 * 3, instance_count: 0, first_vertex: 0, first_instance: 0, @@ -484,7 +484,7 @@ pub fn prepare_meshlet_per_frame_resources( .create_buffer_with_data(&BufferInitDescriptor { label: Some("visibility_buffer_hardware_raster_indirect_args_second"), contents: DrawIndirectArgs { - vertex_count: 64 * 3, + vertex_count: 128 * 3, instance_count: 0, first_vertex: 0, first_instance: 0, diff --git a/crates/bevy_pbr/src/meshlet/visibility_buffer_hardware_raster.wgsl b/crates/bevy_pbr/src/meshlet/visibility_buffer_hardware_raster.wgsl index e3cf7a6fb3..eb0947f967 100644 --- a/crates/bevy_pbr/src/meshlet/visibility_buffer_hardware_raster.wgsl +++ b/crates/bevy_pbr/src/meshlet/visibility_buffer_hardware_raster.wgsl @@ -56,7 +56,7 @@ fn vertex(@builtin(instance_index) instance_index: u32, @builtin(vertex_index) v return VertexOutput( clip_position, #ifdef MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT - (cluster_id << 6u) | triangle_id, + (cluster_id << 7u) | triangle_id, #endif #ifdef DEPTH_CLAMP_ORTHO unclamped_clip_depth, @@ -83,7 +83,7 @@ fn fragment(vertex_output: VertexOutput) { fn dummy_vertex() -> VertexOutput { return VertexOutput( - vec4(0.0), + vec4(divide(0.0, 0.0)), // NaN vertex position #ifdef MESHLET_VISIBILITY_BUFFER_RASTER_PASS_OUTPUT 0u, #endif @@ -92,3 +92,8 @@ fn dummy_vertex() -> VertexOutput { #endif ); } + +// Naga doesn't allow divide by zero literals, but this lets us work around it +fn divide(a: f32, b: f32) -> f32 { + return a / b; +} diff --git a/crates/bevy_pbr/src/meshlet/visibility_buffer_resolve.wgsl b/crates/bevy_pbr/src/meshlet/visibility_buffer_resolve.wgsl index bb35c16497..365fb2e7c2 100644 --- a/crates/bevy_pbr/src/meshlet/visibility_buffer_resolve.wgsl +++ b/crates/bevy_pbr/src/meshlet/visibility_buffer_resolve.wgsl @@ -97,11 +97,11 @@ struct VertexOutput { fn resolve_vertex_output(frag_coord: vec4) -> VertexOutput { let frag_coord_1d = u32(frag_coord.y) * u32(view.viewport.z) + u32(frag_coord.x); let packed_ids = u32(meshlet_visibility_buffer[frag_coord_1d]); // TODO: Might be faster to load the correct u32 directly - let cluster_id = packed_ids >> 6u; + let cluster_id = packed_ids >> 7u; let meshlet_id = meshlet_cluster_meshlet_ids[cluster_id]; let meshlet = meshlets[meshlet_id]; - let triangle_id = extractBits(packed_ids, 0u, 6u); + let triangle_id = extractBits(packed_ids, 0u, 7u); let index_ids = meshlet.start_index_id + (triangle_id * 3u) + vec3(0u, 1u, 2u); let indices = meshlet.start_vertex_id + vec3(get_meshlet_index(index_ids.x), get_meshlet_index(index_ids.y), get_meshlet_index(index_ids.z)); let vertex_ids = vec3(meshlet_vertex_ids[indices.x], meshlet_vertex_ids[indices.y], meshlet_vertex_ids[indices.z]); diff --git a/crates/bevy_pbr/src/meshlet/visibility_buffer_software_raster.wgsl b/crates/bevy_pbr/src/meshlet/visibility_buffer_software_raster.wgsl index 02feaaeaed..853eb853b7 100644 --- a/crates/bevy_pbr/src/meshlet/visibility_buffer_software_raster.wgsl +++ b/crates/bevy_pbr/src/meshlet/visibility_buffer_software_raster.wgsl @@ -22,10 +22,10 @@ // TODO: Subpixel precision and top-left rule -var viewport_vertices: array; +var viewport_vertices: array; @compute -@workgroup_size(64, 1, 1) // 64 threads per workgroup, 1 vertex/triangle per thread, 1 cluster per workgroup +@workgroup_size(128, 1, 1) // 128 threads per workgroup, 1-2 vertices per thread, 1 triangle per thread, 1 cluster per workgroup fn rasterize_cluster( @builtin(workgroup_id) workgroup_id: vec3, @builtin(local_invocation_index) local_invocation_index: u32, @@ -44,28 +44,30 @@ fn rasterize_cluster( let meshlet_id = meshlet_cluster_meshlet_ids[cluster_id]; let meshlet = meshlets[meshlet_id]; - // Load and project 1 vertex per thread - let vertex_id = local_invocation_index; - if vertex_id < meshlet.vertex_count { - let meshlet_vertex_id = meshlet_vertex_ids[meshlet.start_vertex_id + vertex_id]; - let vertex = unpack_meshlet_vertex(meshlet_vertex_data[meshlet_vertex_id]); + let instance_id = meshlet_cluster_instance_ids[cluster_id]; + let instance_uniform = meshlet_instance_uniforms[instance_id]; + let world_from_local = affine3_to_square(instance_uniform.world_from_local); - // Project vertex to viewport space - let instance_id = meshlet_cluster_instance_ids[cluster_id]; - let instance_uniform = meshlet_instance_uniforms[instance_id]; - let world_from_local = affine3_to_square(instance_uniform.world_from_local); - let world_position = mesh_position_local_to_world(world_from_local, vec4(vertex.position, 1.0)); - var clip_position = view.clip_from_world * vec4(world_position.xyz, 1.0); - var ndc_position = clip_position.xyz / clip_position.w; + // Load and project 1 vertex per thread, and then again if there are more than 128 vertices in the meshlet + for (var i = 0u; i <= 128u; i += 128u) { + let vertex_id = local_invocation_index + i; + if vertex_id < meshlet.vertex_count { + let meshlet_vertex_id = meshlet_vertex_ids[meshlet.start_vertex_id + vertex_id]; + let vertex = unpack_meshlet_vertex(meshlet_vertex_data[meshlet_vertex_id]); + + // Project vertex to viewport space + let world_position = mesh_position_local_to_world(world_from_local, vec4(vertex.position, 1.0)); + let clip_position = view.clip_from_world * vec4(world_position.xyz, 1.0); + var ndc_position = clip_position.xyz / clip_position.w; #ifdef DEPTH_CLAMP_ORTHO - ndc_position.z = 1.0 / clip_position.z; + ndc_position.z = 1.0 / clip_position.z; #endif - let viewport_position_xy = ndc_to_uv(ndc_position.xy) * view.viewport.zw; + let viewport_position_xy = ndc_to_uv(ndc_position.xy) * view.viewport.zw; - // Write vertex to workgroup shared memory - viewport_vertices[vertex_id] = vec3(viewport_position_xy, ndc_position.z); + // Write vertex to workgroup shared memory + viewport_vertices[vertex_id] = vec3(viewport_position_xy, ndc_position.z); + } } - workgroupBarrier(); // Load 1 triangle's worth of vertex data per thread @@ -76,7 +78,7 @@ fn rasterize_cluster( let vertex_0 = viewport_vertices[vertex_ids[2]]; let vertex_1 = viewport_vertices[vertex_ids[1]]; let vertex_2 = viewport_vertices[vertex_ids[0]]; - let packed_ids = (cluster_id << 6u) | triangle_id; + let packed_ids = (cluster_id << 7u) | triangle_id; // Compute triangle bounding box let min_x = u32(min3(vertex_0.x, vertex_1.x, vertex_2.x)); diff --git a/examples/3d/meshlet.rs b/examples/3d/meshlet.rs index 20264b43a1..8cb503ba3a 100644 --- a/examples/3d/meshlet.rs +++ b/examples/3d/meshlet.rs @@ -17,7 +17,7 @@ use camera_controller::{CameraController, CameraControllerPlugin}; use std::{f32::consts::PI, path::Path, process::ExitCode}; const ASSET_URL: &str = - "https://raw.githubusercontent.com/JMS55/bevy_meshlet_asset/10bb5471c7beedfe63ad1cf269599c92b0f10aa2/bunny.meshlet_mesh"; + "https://raw.githubusercontent.com/JMS55/bevy_meshlet_asset/e3da1533b4c69fb967f233c817e9b0921134d317/bunny.meshlet_mesh"; fn main() -> ExitCode { if !Path::new("./assets/models/bunny.meshlet_mesh").exists() {