Add pipeline statistics (#9135)

# Objective

It's useful to have access to render pipeline statistics, since they
provide more information than FPS alone. For example, the number of
drawn triangles can be used to debug culling and LODs. The number of
fragment shader invocations can provide a more stable alternative metric
than GPU elapsed time.

See also: Render node GPU timing overlay #8067, which doesn't provide
pipeline statistics, but adds a nice overlay.

## Solution

Add `RenderDiagnosticsPlugin`, which enables collecting pipeline
statistics and CPU & GPU timings.

---

## Changelog

- Add `RenderDiagnosticsPlugin`
- Add `RenderContext::diagnostic_recorder` method

---------

Co-authored-by: Alice Cecile <alice.i.cecile@gmail.com>
This commit is contained in:
LeshaInc 2024-03-17 23:29:35 +03:00 committed by GitHub
parent 68f4f59ee6
commit 737b719dda
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
13 changed files with 995 additions and 22 deletions

View file

@ -15,6 +15,7 @@ use bevy_ecs::{prelude::*, query::QueryItem};
use bevy_math::UVec2;
use bevy_render::{
camera::ExtractedCamera,
diagnostic::RecordDiagnostics,
extract_component::{
ComponentUniforms, DynamicUniformIndex, ExtractComponentPlugin, UniformComponentPlugin,
},
@ -156,6 +157,9 @@ impl ViewNode for BloomNode {
render_context.command_encoder().push_debug_group("bloom");
let diagnostics = render_context.diagnostic_recorder();
let time_span = diagnostics.time_span(render_context.command_encoder(), "bloom");
// First downsample pass
{
let downsampling_first_bind_group = render_context.render_device().create_bind_group(
@ -275,6 +279,7 @@ impl ViewNode for BloomNode {
upsampling_final_pass.draw(0..3, 0..1);
}
time_span.end(render_context.command_encoder());
render_context.command_encoder().pop_debug_group();
Ok(())

View file

@ -2,6 +2,7 @@ use crate::core_2d::Transparent2d;
use bevy_ecs::prelude::*;
use bevy_render::{
camera::ExtractedCamera,
diagnostic::RecordDiagnostics,
render_graph::{Node, NodeRunError, RenderGraphContext},
render_phase::RenderPhase,
render_resource::RenderPassDescriptor,
@ -52,6 +53,8 @@ impl Node for MainPass2dNode {
#[cfg(feature = "trace")]
let _main_pass_2d = info_span!("main_pass_2d").entered();
let diagnostics = render_context.diagnostic_recorder();
let mut render_pass = render_context.begin_tracked_render_pass(RenderPassDescriptor {
label: Some("main_pass_2d"),
color_attachments: &[Some(target.get_color_attachment())],
@ -60,11 +63,15 @@ impl Node for MainPass2dNode {
occlusion_query_set: None,
});
let pass_span = diagnostics.pass_span(&mut render_pass, "main_pass_2d");
if let Some(viewport) = camera.viewport.as_ref() {
render_pass.set_camera_viewport(viewport);
}
transparent_phase.render(&mut render_pass, world, view_entity);
pass_span.end(&mut render_pass);
}
// WebGL2 quirk: if ending with a render pass with a custom viewport, the viewport isn't

View file

@ -5,6 +5,7 @@ use crate::{
use bevy_ecs::{prelude::World, query::QueryItem};
use bevy_render::{
camera::ExtractedCamera,
diagnostic::RecordDiagnostics,
render_graph::{NodeRunError, RenderGraphContext, ViewNode},
render_phase::{RenderPhase, TrackedRenderPass},
render_resource::{CommandEncoderDescriptor, PipelineCache, RenderPassDescriptor, StoreOp},
@ -47,6 +48,8 @@ impl ViewNode for MainOpaquePass3dNode {
): QueryItem<'w, Self::ViewQuery>,
world: &'w World,
) -> Result<(), NodeRunError> {
let diagnostics = render_context.diagnostic_recorder();
let color_attachments = [Some(target.get_color_attachment())];
let depth_stencil_attachment = Some(depth.get_attachment(StoreOp::Store));
@ -70,6 +73,8 @@ impl ViewNode for MainOpaquePass3dNode {
occlusion_query_set: None,
});
let mut render_pass = TrackedRenderPass::new(&render_device, render_pass);
let pass_span = diagnostics.pass_span(&mut render_pass, "main_opaque_pass_3d");
if let Some(viewport) = camera.viewport.as_ref() {
render_pass.set_camera_viewport(viewport);
}
@ -104,6 +109,7 @@ impl ViewNode for MainOpaquePass3dNode {
}
}
pass_span.end(&mut render_pass);
drop(render_pass);
command_encoder.finish()
});

View file

@ -2,6 +2,7 @@ use crate::core_3d::Transparent3d;
use bevy_ecs::{prelude::*, query::QueryItem};
use bevy_render::{
camera::ExtractedCamera,
diagnostic::RecordDiagnostics,
render_graph::{NodeRunError, RenderGraphContext, ViewNode},
render_phase::RenderPhase,
render_resource::{RenderPassDescriptor, StoreOp},
@ -37,6 +38,8 @@ impl ViewNode for MainTransparentPass3dNode {
#[cfg(feature = "trace")]
let _main_transparent_pass_3d_span = info_span!("main_transparent_pass_3d").entered();
let diagnostics = render_context.diagnostic_recorder();
let mut render_pass = render_context.begin_tracked_render_pass(RenderPassDescriptor {
label: Some("main_transparent_pass_3d"),
color_attachments: &[Some(target.get_color_attachment())],
@ -51,11 +54,15 @@ impl ViewNode for MainTransparentPass3dNode {
occlusion_query_set: None,
});
let pass_span = diagnostics.pass_span(&mut render_pass, "main_transparent_pass_3d");
if let Some(viewport) = camera.viewport.as_ref() {
render_pass.set_camera_viewport(viewport);
}
transparent_phase.render(&mut render_pass, world, view_entity);
pass_span.end(&mut render_pass);
}
// WebGL2 quirk: if ending with a render pass with a custom viewport, the viewport isn't

View file

@ -2,6 +2,7 @@ use bevy_ecs::prelude::*;
use bevy_ecs::query::QueryItem;
use bevy_render::{
camera::ExtractedCamera,
diagnostic::RecordDiagnostics,
render_graph::{NodeRunError, RenderGraphContext, ViewNode},
render_phase::{RenderPhase, TrackedRenderPass},
render_resource::{CommandEncoderDescriptor, RenderPassDescriptor, StoreOp},
@ -43,6 +44,8 @@ impl ViewNode for PrepassNode {
): QueryItem<'w, Self::ViewQuery>,
world: &'w World,
) -> Result<(), NodeRunError> {
let diagnostics = render_context.diagnostic_recorder();
let mut color_attachments = vec![
view_prepass_textures
.normal
@ -83,7 +86,10 @@ impl ViewNode for PrepassNode {
timestamp_writes: None,
occlusion_query_set: None,
});
let mut render_pass = TrackedRenderPass::new(&render_device, render_pass);
let pass_span = diagnostics.pass_span(&mut render_pass, "prepass");
if let Some(viewport) = camera.viewport.as_ref() {
render_pass.set_camera_viewport(viewport);
}
@ -102,6 +108,7 @@ impl ViewNode for PrepassNode {
alpha_mask_prepass_phase.render(&mut render_pass, world, view_entity);
}
pass_span.end(&mut render_pass);
drop(render_pass);
// Copy prepass depth to the main depth texture if deferred isn't going to

View file

@ -4,6 +4,7 @@ use bevy_ecs::prelude::*;
use bevy_math::{Mat4, UVec3, UVec4, Vec2, Vec3, Vec3Swizzles, Vec4, Vec4Swizzles};
use bevy_render::{
camera::Camera,
diagnostic::RecordDiagnostics,
mesh::Mesh,
primitives::{CascadesFrusta, CubemapFrusta, Frustum, HalfSpace},
render_asset::RenderAssets,
@ -1809,6 +1810,9 @@ impl Node for ShadowPassNode {
render_context: &mut RenderContext<'w>,
world: &'w World,
) -> Result<(), NodeRunError> {
let diagnostics = render_context.diagnostic_recorder();
let time_span = diagnostics.time_span(render_context.command_encoder(), "shadows");
let view_entity = graph.view_entity();
if let Ok(view_lights) = self.main_view_query.get_manual(world, view_entity) {
for view_light_entity in view_lights.lights.iter().copied() {
@ -1820,6 +1824,7 @@ impl Node for ShadowPassNode {
let depth_stencil_attachment =
Some(view_light.depth_attachment.get_attachment(StoreOp::Store));
let diagnostics = render_context.diagnostic_recorder();
render_context.add_command_buffer_generation_task(move |render_device| {
#[cfg(feature = "trace")]
let _shadow_pass_span = info_span!("shadow_pass").entered();
@ -1836,16 +1841,22 @@ impl Node for ShadowPassNode {
timestamp_writes: None,
occlusion_query_set: None,
});
let mut render_pass = TrackedRenderPass::new(&render_device, render_pass);
let pass_span =
diagnostics.pass_span(&mut render_pass, view_light.pass_name.clone());
shadow_phase.render(&mut render_pass, world, view_light_entity);
pass_span.end(&mut render_pass);
drop(render_pass);
command_encoder.finish()
});
}
}
time_span.end(render_context.command_encoder());
Ok(())
}
}

View file

@ -42,6 +42,7 @@ bevy_asset = { path = "../bevy_asset", version = "0.14.0-dev" }
bevy_color = { path = "../bevy_color", version = "0.14.0-dev" }
bevy_core = { path = "../bevy_core", version = "0.14.0-dev" }
bevy_derive = { path = "../bevy_derive", version = "0.14.0-dev" }
bevy_diagnostic = { path = "../bevy_diagnostic", version = "0.14.0-dev" }
bevy_ecs = { path = "../bevy_ecs", version = "0.14.0-dev" }
bevy_encase_derive = { path = "../bevy_encase_derive", version = "0.14.0-dev" }
bevy_hierarchy = { path = "../bevy_hierarchy", version = "0.14.0-dev" }

View file

@ -0,0 +1,650 @@
use std::{
borrow::Cow,
ops::{DerefMut, Range},
sync::{
atomic::{AtomicBool, Ordering},
Arc,
},
thread::{self, ThreadId},
};
use bevy_diagnostic::{Diagnostic, DiagnosticMeasurement, DiagnosticPath, DiagnosticsStore};
use bevy_ecs::system::{Res, ResMut, Resource};
use bevy_utils::{tracing, Instant};
use std::sync::Mutex;
use wgpu::{
Buffer, BufferDescriptor, BufferUsages, CommandEncoder, ComputePass, Features, MapMode,
PipelineStatisticsTypes, QuerySet, QuerySetDescriptor, QueryType, Queue, RenderPass,
};
use crate::renderer::RenderDevice;
use super::RecordDiagnostics;
// buffer offset must be divisible by 256, so this constant must be divisible by 32 (=256/8)
const MAX_TIMESTAMP_QUERIES: u32 = 256;
const MAX_PIPELINE_STATISTICS: u32 = 128;
const TIMESTAMP_SIZE: u64 = 8;
const PIPELINE_STATISTICS_SIZE: u64 = 40;
/// Records diagnostics into [`QuerySet`]'s keeping track of the mapping between
/// spans and indices to the corresponding entries in the [`QuerySet`].
#[derive(Resource)]
pub struct DiagnosticsRecorder {
timestamp_period_ns: f32,
features: Features,
current_frame: Mutex<FrameData>,
submitted_frames: Vec<FrameData>,
finished_frames: Vec<FrameData>,
}
impl DiagnosticsRecorder {
/// Creates the new `DiagnosticsRecorder`.
pub fn new(device: &RenderDevice, queue: &Queue) -> DiagnosticsRecorder {
let features = device.features();
let timestamp_period_ns = if features.contains(Features::TIMESTAMP_QUERY) {
queue.get_timestamp_period()
} else {
0.0
};
DiagnosticsRecorder {
timestamp_period_ns,
features,
current_frame: Mutex::new(FrameData::new(device, features)),
submitted_frames: Vec::new(),
finished_frames: Vec::new(),
}
}
fn current_frame_mut(&mut self) -> &mut FrameData {
self.current_frame.get_mut().expect("lock poisoned")
}
fn current_frame_lock(&self) -> impl DerefMut<Target = FrameData> + '_ {
self.current_frame.lock().expect("lock poisoned")
}
/// Begins recording diagnostics for a new frame.
pub fn begin_frame(&mut self) {
let mut idx = 0;
while idx < self.submitted_frames.len() {
if self.submitted_frames[idx].run_mapped_callback(self.timestamp_period_ns) {
self.finished_frames
.push(self.submitted_frames.swap_remove(idx));
} else {
idx += 1;
}
}
self.current_frame_mut().begin();
}
/// Copies data from [`QuerySet`]'s to a [`Buffer`], after which it can be downloaded to CPU.
///
/// Should be called before [`DiagnosticsRecorder::finish_frame`]
pub fn resolve(&mut self, encoder: &mut CommandEncoder) {
self.current_frame_mut().resolve(encoder);
}
/// Finishes recording diagnostics for the current frame.
///
/// The specified `callback` will be invoked when diagnostics become available.
///
/// Should be called after [`DiagnosticsRecorder::resolve`],
/// and **after** all commands buffers have been queued.
pub fn finish_frame(
&mut self,
device: &RenderDevice,
callback: impl FnOnce(RenderDiagnostics) + Send + Sync + 'static,
) {
self.current_frame_mut().finish(callback);
// reuse one of the finished frames, if we can
let new_frame = match self.finished_frames.pop() {
Some(frame) => frame,
None => FrameData::new(device, self.features),
};
let old_frame = std::mem::replace(self.current_frame_mut(), new_frame);
self.submitted_frames.push(old_frame);
}
}
impl RecordDiagnostics for DiagnosticsRecorder {
fn begin_time_span<E: WriteTimestamp>(&self, encoder: &mut E, span_name: Cow<'static, str>) {
self.current_frame_lock()
.begin_time_span(encoder, span_name);
}
fn end_time_span<E: WriteTimestamp>(&self, encoder: &mut E) {
self.current_frame_lock().end_time_span(encoder);
}
fn begin_pass_span<P: Pass>(&self, pass: &mut P, span_name: Cow<'static, str>) {
self.current_frame_lock().begin_pass(pass, span_name);
}
fn end_pass_span<P: Pass>(&self, pass: &mut P) {
self.current_frame_lock().end_pass(pass);
}
}
struct SpanRecord {
thread_id: ThreadId,
path_range: Range<usize>,
pass_kind: Option<PassKind>,
begin_timestamp_index: Option<u32>,
end_timestamp_index: Option<u32>,
begin_instant: Option<Instant>,
end_instant: Option<Instant>,
pipeline_statistics_index: Option<u32>,
}
struct FrameData {
timestamps_query_set: Option<QuerySet>,
num_timestamps: u32,
supports_timestamps_inside_passes: bool,
pipeline_statistics_query_set: Option<QuerySet>,
num_pipeline_statistics: u32,
buffer_size: u64,
pipeline_statistics_buffer_offset: u64,
resolve_buffer: Option<Buffer>,
read_buffer: Option<Buffer>,
path_components: Vec<Cow<'static, str>>,
open_spans: Vec<SpanRecord>,
closed_spans: Vec<SpanRecord>,
is_mapped: Arc<AtomicBool>,
callback: Option<Box<dyn FnOnce(RenderDiagnostics) + Send + Sync + 'static>>,
}
impl FrameData {
fn new(device: &RenderDevice, features: Features) -> FrameData {
let wgpu_device = device.wgpu_device();
let mut buffer_size = 0;
let timestamps_query_set = if features.contains(Features::TIMESTAMP_QUERY) {
buffer_size += u64::from(MAX_TIMESTAMP_QUERIES) * TIMESTAMP_SIZE;
Some(wgpu_device.create_query_set(&QuerySetDescriptor {
label: Some("timestamps_query_set"),
ty: QueryType::Timestamp,
count: MAX_TIMESTAMP_QUERIES,
}))
} else {
None
};
let pipeline_statistics_buffer_offset = buffer_size;
let pipeline_statistics_query_set =
if features.contains(Features::PIPELINE_STATISTICS_QUERY) {
buffer_size += u64::from(MAX_PIPELINE_STATISTICS) * PIPELINE_STATISTICS_SIZE;
Some(wgpu_device.create_query_set(&QuerySetDescriptor {
label: Some("pipeline_statistics_query_set"),
ty: QueryType::PipelineStatistics(PipelineStatisticsTypes::all()),
count: MAX_PIPELINE_STATISTICS,
}))
} else {
None
};
let (resolve_buffer, read_buffer) = if buffer_size > 0 {
let resolve_buffer = wgpu_device.create_buffer(&BufferDescriptor {
label: Some("render_statistics_resolve_buffer"),
size: buffer_size,
usage: BufferUsages::QUERY_RESOLVE | BufferUsages::COPY_SRC,
mapped_at_creation: false,
});
let read_buffer = wgpu_device.create_buffer(&BufferDescriptor {
label: Some("render_statistics_read_buffer"),
size: buffer_size,
usage: BufferUsages::COPY_DST | BufferUsages::MAP_READ,
mapped_at_creation: false,
});
(Some(resolve_buffer), Some(read_buffer))
} else {
(None, None)
};
FrameData {
timestamps_query_set,
num_timestamps: 0,
supports_timestamps_inside_passes: features
.contains(Features::TIMESTAMP_QUERY_INSIDE_PASSES),
pipeline_statistics_query_set,
num_pipeline_statistics: 0,
buffer_size,
pipeline_statistics_buffer_offset,
resolve_buffer,
read_buffer,
path_components: Vec::new(),
open_spans: Vec::new(),
closed_spans: Vec::new(),
is_mapped: Arc::new(AtomicBool::new(false)),
callback: None,
}
}
fn begin(&mut self) {
self.num_timestamps = 0;
self.num_pipeline_statistics = 0;
self.path_components.clear();
self.open_spans.clear();
self.closed_spans.clear();
}
fn write_timestamp(
&mut self,
encoder: &mut impl WriteTimestamp,
is_inside_pass: bool,
) -> Option<u32> {
if is_inside_pass && !self.supports_timestamps_inside_passes {
return None;
}
if self.num_timestamps >= MAX_TIMESTAMP_QUERIES {
return None;
}
let set = self.timestamps_query_set.as_ref()?;
let index = self.num_timestamps;
encoder.write_timestamp(set, index);
self.num_timestamps += 1;
Some(index)
}
fn write_pipeline_statistics(
&mut self,
encoder: &mut impl WritePipelineStatistics,
) -> Option<u32> {
if self.num_pipeline_statistics >= MAX_PIPELINE_STATISTICS {
return None;
}
let set = self.pipeline_statistics_query_set.as_ref()?;
let index = self.num_pipeline_statistics;
encoder.begin_pipeline_statistics_query(set, index);
self.num_pipeline_statistics += 1;
Some(index)
}
fn open_span(
&mut self,
pass_kind: Option<PassKind>,
name: Cow<'static, str>,
) -> &mut SpanRecord {
let thread_id = thread::current().id();
let parent = self
.open_spans
.iter()
.filter(|v| v.thread_id == thread_id)
.last();
let path_range = match &parent {
Some(parent) if parent.path_range.end == self.path_components.len() => {
parent.path_range.start..parent.path_range.end + 1
}
Some(parent) => {
self.path_components
.extend_from_within(parent.path_range.clone());
self.path_components.len() - parent.path_range.len()..self.path_components.len() + 1
}
None => self.path_components.len()..self.path_components.len() + 1,
};
self.path_components.push(name);
self.open_spans.push(SpanRecord {
thread_id,
path_range,
pass_kind,
begin_timestamp_index: None,
end_timestamp_index: None,
begin_instant: None,
end_instant: None,
pipeline_statistics_index: None,
});
self.open_spans.last_mut().unwrap()
}
fn close_span(&mut self) -> &mut SpanRecord {
let thread_id = thread::current().id();
let iter = self.open_spans.iter();
let (index, _) = iter
.enumerate()
.filter(|(_, v)| v.thread_id == thread_id)
.last()
.unwrap();
let span = self.open_spans.swap_remove(index);
self.closed_spans.push(span);
self.closed_spans.last_mut().unwrap()
}
fn begin_time_span(&mut self, encoder: &mut impl WriteTimestamp, name: Cow<'static, str>) {
let begin_instant = Instant::now();
let begin_timestamp_index = self.write_timestamp(encoder, false);
let span = self.open_span(None, name);
span.begin_instant = Some(begin_instant);
span.begin_timestamp_index = begin_timestamp_index;
}
fn end_time_span(&mut self, encoder: &mut impl WriteTimestamp) {
let end_timestamp_index = self.write_timestamp(encoder, false);
let span = self.close_span();
span.end_timestamp_index = end_timestamp_index;
span.end_instant = Some(Instant::now());
}
fn begin_pass<P: Pass>(&mut self, pass: &mut P, name: Cow<'static, str>) {
let begin_instant = Instant::now();
let begin_timestamp_index = self.write_timestamp(pass, true);
let pipeline_statistics_index = self.write_pipeline_statistics(pass);
let span = self.open_span(Some(P::KIND), name);
span.begin_instant = Some(begin_instant);
span.begin_timestamp_index = begin_timestamp_index;
span.pipeline_statistics_index = pipeline_statistics_index;
}
fn end_pass(&mut self, pass: &mut impl Pass) {
let end_timestamp_index = self.write_timestamp(pass, true);
let span = self.close_span();
span.end_timestamp_index = end_timestamp_index;
if span.pipeline_statistics_index.is_some() {
pass.end_pipeline_statistics_query();
}
span.end_instant = Some(Instant::now());
}
fn resolve(&mut self, encoder: &mut CommandEncoder) {
let Some(resolve_buffer) = &self.resolve_buffer else {
return;
};
match &self.timestamps_query_set {
Some(set) if self.num_timestamps > 0 => {
encoder.resolve_query_set(set, 0..self.num_timestamps, resolve_buffer, 0);
}
_ => {}
}
match &self.pipeline_statistics_query_set {
Some(set) if self.num_pipeline_statistics > 0 => {
encoder.resolve_query_set(
set,
0..self.num_pipeline_statistics,
resolve_buffer,
self.pipeline_statistics_buffer_offset,
);
}
_ => {}
}
let Some(read_buffer) = &self.read_buffer else {
return;
};
encoder.copy_buffer_to_buffer(resolve_buffer, 0, read_buffer, 0, self.buffer_size);
}
fn diagnostic_path(&self, range: &Range<usize>, field: &str) -> DiagnosticPath {
DiagnosticPath::from_components(
std::iter::once("render")
.chain(self.path_components[range.clone()].iter().map(|v| &**v))
.chain(std::iter::once(field)),
)
}
fn finish(&mut self, callback: impl FnOnce(RenderDiagnostics) + Send + Sync + 'static) {
let Some(read_buffer) = &self.read_buffer else {
// we still have cpu timings, so let's use them
let mut diagnostics = Vec::new();
for span in &self.closed_spans {
if let (Some(begin), Some(end)) = (span.begin_instant, span.end_instant) {
diagnostics.push(RenderDiagnostic {
path: self.diagnostic_path(&span.path_range, "elapsed_cpu"),
suffix: "ms",
value: (end - begin).as_secs_f64() * 1000.0,
});
}
}
callback(RenderDiagnostics(diagnostics));
return;
};
self.callback = Some(Box::new(callback));
let is_mapped = self.is_mapped.clone();
read_buffer.slice(..).map_async(MapMode::Read, move |res| {
if let Err(e) = res {
tracing::warn!("Failed to download render statistics buffer: {e}");
return;
}
is_mapped.store(true, Ordering::Release);
});
}
// returns true if the frame is considered finished, false otherwise
fn run_mapped_callback(&mut self, timestamp_period_ns: f32) -> bool {
let Some(read_buffer) = &self.read_buffer else {
return true;
};
if !self.is_mapped.load(Ordering::Acquire) {
// need to wait more
return false;
}
let Some(callback) = self.callback.take() else {
return true;
};
let data = read_buffer.slice(..).get_mapped_range();
let timestamps = data[..(self.num_timestamps * 8) as usize]
.chunks(8)
.map(|v| u64::from_ne_bytes(v.try_into().unwrap()))
.collect::<Vec<u64>>();
let start = self.pipeline_statistics_buffer_offset as usize;
let len = (self.num_pipeline_statistics as usize) * 40;
let pipeline_statistics = data[start..start + len]
.chunks(8)
.map(|v| u64::from_ne_bytes(v.try_into().unwrap()))
.collect::<Vec<u64>>();
let mut diagnostics = Vec::new();
for span in &self.closed_spans {
if let (Some(begin), Some(end)) = (span.begin_instant, span.end_instant) {
diagnostics.push(RenderDiagnostic {
path: self.diagnostic_path(&span.path_range, "elapsed_cpu"),
suffix: "ms",
value: (end - begin).as_secs_f64() * 1000.0,
});
}
if let (Some(begin), Some(end)) = (span.begin_timestamp_index, span.end_timestamp_index)
{
let begin = timestamps[begin as usize] as f64;
let end = timestamps[end as usize] as f64;
let value = (end - begin) * (timestamp_period_ns as f64) / 1e6;
diagnostics.push(RenderDiagnostic {
path: self.diagnostic_path(&span.path_range, "elapsed_gpu"),
suffix: "ms",
value,
});
}
if let Some(index) = span.pipeline_statistics_index {
let index = (index as usize) * 5;
if span.pass_kind == Some(PassKind::Render) {
diagnostics.push(RenderDiagnostic {
path: self.diagnostic_path(&span.path_range, "vertex_shader_invocations"),
suffix: "",
value: pipeline_statistics[index] as f64,
});
diagnostics.push(RenderDiagnostic {
path: self.diagnostic_path(&span.path_range, "clipper_invocations"),
suffix: "",
value: pipeline_statistics[index + 1] as f64,
});
diagnostics.push(RenderDiagnostic {
path: self.diagnostic_path(&span.path_range, "clipper_primitives_out"),
suffix: "",
value: pipeline_statistics[index + 2] as f64,
});
diagnostics.push(RenderDiagnostic {
path: self.diagnostic_path(&span.path_range, "fragment_shader_invocations"),
suffix: "",
value: pipeline_statistics[index + 3] as f64,
});
}
if span.pass_kind == Some(PassKind::Compute) {
diagnostics.push(RenderDiagnostic {
path: self.diagnostic_path(&span.path_range, "compute_shader_invocations"),
suffix: "",
value: pipeline_statistics[index + 4] as f64,
});
}
}
}
callback(RenderDiagnostics(diagnostics));
drop(data);
read_buffer.unmap();
self.is_mapped.store(false, Ordering::Release);
true
}
}
/// Resource which stores render diagnostics of the most recent frame.
#[derive(Debug, Default, Clone, Resource)]
pub struct RenderDiagnostics(Vec<RenderDiagnostic>);
/// A render diagnostic which has been recorded, but not yet stored in [`DiagnosticsStore`].
#[derive(Debug, Clone, Resource)]
pub struct RenderDiagnostic {
pub path: DiagnosticPath,
pub suffix: &'static str,
pub value: f64,
}
/// Stores render diagnostics before they can be synced with the main app.
///
/// This mutex is locked twice per frame:
/// 1. in `PreUpdate`, during [`sync_diagnostics`],
/// 2. after rendering has finished and statistics have been downloaded from GPU.
#[derive(Debug, Default, Clone, Resource)]
pub struct RenderDiagnosticsMutex(pub(crate) Arc<Mutex<Option<RenderDiagnostics>>>);
/// Updates render diagnostics measurements.
pub fn sync_diagnostics(mutex: Res<RenderDiagnosticsMutex>, mut store: ResMut<DiagnosticsStore>) {
let Some(diagnostics) = mutex.0.lock().ok().and_then(|mut v| v.take()) else {
return;
};
let time = Instant::now();
for diagnostic in &diagnostics.0 {
if store.get(&diagnostic.path).is_none() {
store.add(Diagnostic::new(diagnostic.path.clone()).with_suffix(diagnostic.suffix));
}
store
.get_mut(&diagnostic.path)
.unwrap()
.add_measurement(DiagnosticMeasurement {
time,
value: diagnostic.value,
});
}
}
pub trait WriteTimestamp {
fn write_timestamp(&mut self, query_set: &QuerySet, index: u32);
}
impl WriteTimestamp for CommandEncoder {
fn write_timestamp(&mut self, query_set: &QuerySet, index: u32) {
CommandEncoder::write_timestamp(self, query_set, index);
}
}
impl WriteTimestamp for RenderPass<'_> {
fn write_timestamp(&mut self, query_set: &QuerySet, index: u32) {
RenderPass::write_timestamp(self, query_set, index);
}
}
impl WriteTimestamp for ComputePass<'_> {
fn write_timestamp(&mut self, query_set: &QuerySet, index: u32) {
ComputePass::write_timestamp(self, query_set, index);
}
}
pub trait WritePipelineStatistics {
fn begin_pipeline_statistics_query(&mut self, query_set: &QuerySet, index: u32);
fn end_pipeline_statistics_query(&mut self);
}
impl WritePipelineStatistics for RenderPass<'_> {
fn begin_pipeline_statistics_query(&mut self, query_set: &QuerySet, index: u32) {
RenderPass::begin_pipeline_statistics_query(self, query_set, index);
}
fn end_pipeline_statistics_query(&mut self) {
RenderPass::end_pipeline_statistics_query(self);
}
}
impl WritePipelineStatistics for ComputePass<'_> {
fn begin_pipeline_statistics_query(&mut self, query_set: &QuerySet, index: u32) {
ComputePass::begin_pipeline_statistics_query(self, query_set, index);
}
fn end_pipeline_statistics_query(&mut self) {
ComputePass::end_pipeline_statistics_query(self);
}
}
pub trait Pass: WritePipelineStatistics + WriteTimestamp {
const KIND: PassKind;
}
impl Pass for RenderPass<'_> {
const KIND: PassKind = PassKind::Render;
}
impl Pass for ComputePass<'_> {
const KIND: PassKind = PassKind::Compute;
}
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
pub enum PassKind {
Render,
Compute,
}

View file

@ -0,0 +1,185 @@
//! Infrastructure for recording render diagnostics.
//!
//! For more info, see [`RenderDiagnosticsPlugin`].
pub(crate) mod internal;
use std::{borrow::Cow, marker::PhantomData, sync::Arc};
use bevy_app::{App, Plugin, PreUpdate};
use crate::RenderApp;
use self::internal::{
sync_diagnostics, DiagnosticsRecorder, Pass, RenderDiagnosticsMutex, WriteTimestamp,
};
use super::{RenderDevice, RenderQueue};
/// Enables collecting render diagnostics, such as CPU/GPU elapsed time per render pass,
/// as well as pipeline statistics (number of primitives, number of shader invocations, etc).
///
/// To access the diagnostics, you can use [`DiagnosticsStore`](bevy_diagnostic::DiagnosticsStore) resource,
/// or add [`LogDiagnosticsPlugin`](bevy_diagnostic::LogDiagnosticsPlugin).
///
/// To record diagnostics in your own passes:
/// 1. First, obtain the diagnostic recorder using [`RenderContext::diagnostic_recorder`](crate::renderer::RenderContext::diagnostic_recorder).
///
/// It won't do anything unless [`RenderDiagnosticsPlugin`] is present,
/// so you're free to omit `#[cfg]` clauses.
/// ```ignore
/// let diagnostics = render_context.diagnostic_recorder();
/// ```
/// 2. Begin the span inside a command encoder, or a render/compute pass encoder.
/// ```ignore
/// let time_span = diagnostics.time_span(render_context.command_encoder(), "shadows");
/// ```
/// 3. End the span, providing the same encoder.
/// ```ignore
/// time_span.end(render_context.command_encoder());
/// ```
///
/// # Supported platforms
/// Timestamp queries and pipeline statistics are currently supported only on Vulkan and DX12.
/// On other platforms (Metal, WebGPU, WebGL2) only CPU time will be recorded.
#[allow(clippy::doc_markdown)]
#[derive(Default)]
pub struct RenderDiagnosticsPlugin;
impl Plugin for RenderDiagnosticsPlugin {
fn build(&self, app: &mut App) {
let render_diagnostics_mutex = RenderDiagnosticsMutex::default();
app.insert_resource(render_diagnostics_mutex.clone())
.add_systems(PreUpdate, sync_diagnostics);
if let Ok(render_app) = app.get_sub_app_mut(RenderApp) {
render_app.insert_resource(render_diagnostics_mutex);
}
}
fn finish(&self, app: &mut App) {
let Ok(render_app) = app.get_sub_app_mut(RenderApp) else {
return;
};
let device = render_app.world.resource::<RenderDevice>();
let queue = render_app.world.resource::<RenderQueue>();
render_app.insert_resource(DiagnosticsRecorder::new(device, queue));
}
}
/// Allows recording diagnostic spans.
pub trait RecordDiagnostics: Send + Sync {
/// Begin a time span, which will record elapsed CPU and GPU time.
///
/// Returns a guard, which will panic on drop unless you end the span.
fn time_span<E, N>(&self, encoder: &mut E, name: N) -> TimeSpanGuard<'_, Self, E>
where
E: WriteTimestamp,
N: Into<Cow<'static, str>>,
{
self.begin_time_span(encoder, name.into());
TimeSpanGuard {
recorder: self,
marker: PhantomData,
}
}
/// Begin a pass span, which will record elapsed CPU and GPU time,
/// as well as pipeline statistics on supported platforms.
///
/// Returns a guard, which will panic on drop unless you end the span.
fn pass_span<P, N>(&self, pass: &mut P, name: N) -> PassSpanGuard<'_, Self, P>
where
P: Pass,
N: Into<Cow<'static, str>>,
{
self.begin_pass_span(pass, name.into());
PassSpanGuard {
recorder: self,
marker: PhantomData,
}
}
#[doc(hidden)]
fn begin_time_span<E: WriteTimestamp>(&self, encoder: &mut E, name: Cow<'static, str>);
#[doc(hidden)]
fn end_time_span<E: WriteTimestamp>(&self, encoder: &mut E);
#[doc(hidden)]
fn begin_pass_span<P: Pass>(&self, pass: &mut P, name: Cow<'static, str>);
#[doc(hidden)]
fn end_pass_span<P: Pass>(&self, pass: &mut P);
}
/// Guard returned by [`RecordDiagnostics::time_span`].
///
/// Will panic on drop unless [`TimeSpanGuard::end`] is called.
pub struct TimeSpanGuard<'a, R: ?Sized, E> {
recorder: &'a R,
marker: PhantomData<E>,
}
impl<R: RecordDiagnostics + ?Sized, E: WriteTimestamp> TimeSpanGuard<'_, R, E> {
/// End the span. You have to provide the same encoder which was used to begin the span.
pub fn end(self, encoder: &mut E) {
self.recorder.end_time_span(encoder);
std::mem::forget(self);
}
}
impl<R: ?Sized, E> Drop for TimeSpanGuard<'_, R, E> {
fn drop(&mut self) {
panic!("TimeSpanScope::end was never called")
}
}
/// Guard returned by [`RecordDiagnostics::pass_span`].
///
/// Will panic on drop unless [`PassSpanGuard::end`] is called.
pub struct PassSpanGuard<'a, R: ?Sized, P> {
recorder: &'a R,
marker: PhantomData<P>,
}
impl<R: RecordDiagnostics + ?Sized, P: Pass> PassSpanGuard<'_, R, P> {
/// End the span. You have to provide the same encoder which was used to begin the span.
pub fn end(self, pass: &mut P) {
self.recorder.end_pass_span(pass);
std::mem::forget(self);
}
}
impl<R: ?Sized, P> Drop for PassSpanGuard<'_, R, P> {
fn drop(&mut self) {
panic!("PassSpanScope::end was never called")
}
}
impl<T: RecordDiagnostics> RecordDiagnostics for Option<Arc<T>> {
fn begin_time_span<E: WriteTimestamp>(&self, encoder: &mut E, name: Cow<'static, str>) {
if let Some(recorder) = &self {
recorder.begin_time_span(encoder, name);
}
}
fn end_time_span<E: WriteTimestamp>(&self, encoder: &mut E) {
if let Some(recorder) = &self {
recorder.end_time_span(encoder);
}
}
fn begin_pass_span<P: Pass>(&self, pass: &mut P, name: Cow<'static, str>) {
if let Some(recorder) = &self {
recorder.begin_pass_span(pass, name);
}
}
fn end_pass_span<P: Pass>(&self, pass: &mut P) {
if let Some(recorder) = &self {
recorder.end_pass_span(pass);
}
}
}

View file

@ -11,6 +11,7 @@ pub mod alpha;
pub mod batching;
pub mod camera;
pub mod deterministic;
pub mod diagnostic;
pub mod extract_component;
pub mod extract_instances;
mod extract_param;

View file

@ -1,5 +1,6 @@
use crate::{
camera::Viewport,
diagnostic::internal::{Pass, PassKind, WritePipelineStatistics, WriteTimestamp},
render_resource::{
BindGroup, BindGroupId, Buffer, BufferId, BufferSlice, RenderPipeline, RenderPipelineId,
ShaderStages,
@ -9,7 +10,7 @@ use crate::{
use bevy_color::LinearRgba;
use bevy_utils::{default, detailed_trace};
use std::ops::Range;
use wgpu::{IndexFormat, RenderPass};
use wgpu::{IndexFormat, QuerySet, RenderPass};
/// Tracks the state of a [`TrackedRenderPass`].
///
@ -179,7 +180,7 @@ impl<'a> TrackedRenderPass<'a> {
/// Assign a vertex buffer to a slot.
///
/// Subsequent calls to [`draw`] and [`draw_indexed`] on this
/// [`RenderPass`] will use `buffer` as one of the source vertex buffers.
/// [`TrackedRenderPass`] will use `buffer` as one of the source vertex buffers.
///
/// The `slot_index` refers to the index of the matching descriptor in
/// [`VertexState::buffers`](crate::render_resource::VertexState::buffers).
@ -603,3 +604,23 @@ impl<'a> TrackedRenderPass<'a> {
self.pass.set_blend_constant(wgpu::Color::from(color));
}
}
impl WriteTimestamp for TrackedRenderPass<'_> {
fn write_timestamp(&mut self, query_set: &wgpu::QuerySet, index: u32) {
self.pass.write_timestamp(query_set, index);
}
}
impl WritePipelineStatistics for TrackedRenderPass<'_> {
fn begin_pipeline_statistics_query(&mut self, query_set: &QuerySet, index: u32) {
self.pass.begin_pipeline_statistics_query(query_set, index);
}
fn end_pipeline_statistics_query(&mut self) {
self.pass.end_pipeline_statistics_query();
}
}
impl Pass for TrackedRenderPass<'_> {
const KIND: PassKind = PassKind::Render;
}

View file

@ -8,6 +8,7 @@ use std::{borrow::Cow, collections::VecDeque};
use thiserror::Error;
use crate::{
diagnostic::internal::{DiagnosticsRecorder, RenderDiagnosticsMutex},
render_graph::{
Edge, InternedRenderLabel, InternedRenderSubGraph, NodeRunError, NodeState, RenderGraph,
RenderGraphContext, SlotLabel, SlotType, SlotValue,
@ -54,21 +55,39 @@ impl RenderGraphRunner {
pub fn run(
graph: &RenderGraph,
render_device: RenderDevice,
mut diagnostics_recorder: Option<DiagnosticsRecorder>,
queue: &wgpu::Queue,
adapter: &wgpu::Adapter,
world: &World,
finalizer: impl FnOnce(&mut wgpu::CommandEncoder),
) -> Result<(), RenderGraphRunnerError> {
let mut render_context = RenderContext::new(render_device, adapter.get_info());
) -> Result<Option<DiagnosticsRecorder>, RenderGraphRunnerError> {
if let Some(recorder) = &mut diagnostics_recorder {
recorder.begin_frame();
}
let mut render_context =
RenderContext::new(render_device, adapter.get_info(), diagnostics_recorder);
Self::run_graph(graph, None, &mut render_context, world, &[], None)?;
finalizer(render_context.command_encoder());
{
let (render_device, mut diagnostics_recorder) = {
#[cfg(feature = "trace")]
let _span = info_span!("submit_graph_commands").entered();
queue.submit(render_context.finish());
let (commands, render_device, diagnostics_recorder) = render_context.finish();
queue.submit(commands);
(render_device, diagnostics_recorder)
};
if let Some(recorder) = &mut diagnostics_recorder {
let render_diagnostics_mutex = world.resource::<RenderDiagnosticsMutex>().0.clone();
recorder.finish_frame(&render_device, move |diagnostics| {
*render_diagnostics_mutex.lock().expect("lock poisoned") = Some(diagnostics);
});
}
Ok(())
Ok(diagnostics_recorder)
}
fn run_graph<'w>(

View file

@ -8,6 +8,7 @@ pub use graph_runner::*;
pub use render_device::*;
use crate::{
diagnostic::{internal::DiagnosticsRecorder, RecordDiagnostics},
render_graph::RenderGraph,
render_phase::TrackedRenderPass,
render_resource::RenderPassDescriptor,
@ -27,34 +28,46 @@ pub fn render_system(world: &mut World, state: &mut SystemState<Query<Entity, Wi
world.resource_scope(|world, mut graph: Mut<RenderGraph>| {
graph.update(world);
});
let diagnostics_recorder = world.remove_resource::<DiagnosticsRecorder>();
let graph = world.resource::<RenderGraph>();
let render_device = world.resource::<RenderDevice>();
let render_queue = world.resource::<RenderQueue>();
let render_adapter = world.resource::<RenderAdapter>();
if let Err(e) = RenderGraphRunner::run(
let res = RenderGraphRunner::run(
graph,
render_device.clone(), // TODO: is this clone really necessary?
diagnostics_recorder,
&render_queue.0,
&render_adapter.0,
world,
|encoder| {
crate::view::screenshot::submit_screenshot_commands(world, encoder);
},
) {
error!("Error running render graph:");
{
let mut src: &dyn std::error::Error = &e;
loop {
error!("> {}", src);
match src.source() {
Some(s) => src = s,
None => break,
);
match res {
Ok(Some(diagnostics_recorder)) => {
world.insert_resource(diagnostics_recorder);
}
Ok(None) => {}
Err(e) => {
error!("Error running render graph:");
{
let mut src: &dyn std::error::Error = &e;
loop {
error!("> {}", src);
match src.source() {
Some(s) => src = s,
None => break,
}
}
}
}
panic!("Error running render graph: {e}");
panic!("Error running render graph: {e}");
}
}
{
@ -306,11 +319,16 @@ pub struct RenderContext<'w> {
command_encoder: Option<CommandEncoder>,
command_buffer_queue: Vec<QueuedCommandBuffer<'w>>,
force_serial: bool,
diagnostics_recorder: Option<Arc<DiagnosticsRecorder>>,
}
impl<'w> RenderContext<'w> {
/// Creates a new [`RenderContext`] from a [`RenderDevice`].
pub fn new(render_device: RenderDevice, adapter_info: AdapterInfo) -> Self {
pub fn new(
render_device: RenderDevice,
adapter_info: AdapterInfo,
diagnostics_recorder: Option<DiagnosticsRecorder>,
) -> Self {
// HACK: Parallel command encoding is currently bugged on AMD + Windows + Vulkan with wgpu 0.19.1
#[cfg(target_os = "windows")]
let force_serial =
@ -326,6 +344,7 @@ impl<'w> RenderContext<'w> {
command_encoder: None,
command_buffer_queue: Vec::new(),
force_serial,
diagnostics_recorder: diagnostics_recorder.map(Arc::new),
}
}
@ -334,6 +353,12 @@ impl<'w> RenderContext<'w> {
&self.render_device
}
/// Gets the diagnostics recorder, used to track elapsed time and pipeline statistics
/// of various render and compute passes.
pub fn diagnostic_recorder(&self) -> impl RecordDiagnostics {
self.diagnostics_recorder.clone()
}
/// Gets the current [`CommandEncoder`].
pub fn command_encoder(&mut self) -> &mut CommandEncoder {
self.command_encoder.get_or_insert_with(|| {
@ -353,6 +378,7 @@ impl<'w> RenderContext<'w> {
self.render_device
.create_command_encoder(&wgpu::CommandEncoderDescriptor::default())
});
let render_pass = command_encoder.begin_render_pass(&descriptor);
TrackedRenderPass::new(&self.render_device, render_pass)
}
@ -389,7 +415,13 @@ impl<'w> RenderContext<'w> {
///
/// This function will wait until all command buffer generation tasks are complete
/// by running them in parallel (where supported).
pub fn finish(mut self) -> Vec<CommandBuffer> {
pub fn finish(
mut self,
) -> (
Vec<CommandBuffer>,
RenderDevice,
Option<DiagnosticsRecorder>,
) {
self.flush_encoder();
let mut command_buffers = Vec::with_capacity(self.command_buffer_queue.len());
@ -413,9 +445,30 @@ impl<'w> RenderContext<'w> {
}
}
});
command_buffers.append(&mut task_based_command_buffers);
command_buffers.sort_unstable_by_key(|(i, _)| *i);
command_buffers.into_iter().map(|(_, cb)| cb).collect()
let mut command_buffers = command_buffers
.into_iter()
.map(|(_, cb)| cb)
.collect::<Vec<CommandBuffer>>();
let mut diagnostics_recorder = self.diagnostics_recorder.take().map(|v| {
Arc::try_unwrap(v)
.ok()
.expect("diagnostic recorder shouldn't be held longer than necessary")
});
if let Some(recorder) = &mut diagnostics_recorder {
let mut command_encoder = self
.render_device
.create_command_encoder(&wgpu::CommandEncoderDescriptor::default());
recorder.resolve(&mut command_encoder);
command_buffers.push(command_encoder.finish());
}
(command_buffers, self.render_device, diagnostics_recorder)
}
fn flush_encoder(&mut self) {