diff --git a/Cargo.toml b/Cargo.toml
index a2915c54f6..72305990c3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2125,6 +2125,17 @@ description = "A compute shader that simulates Conway's Game of Life"
 category = "Shaders"
 wasm = false
 
+[[example]]
+name = "gpu_readback"
+path = "examples/shader/gpu_readback.rs"
+doc-scrape-examples = true
+
+[package.metadata.example.gpu_readback]
+name = "GPU readback"
+description = "A very simple compute shader that writes to a buffer that is read by the cpu"
+category = "Shaders"
+wasm = false
+
 [[example]]
 name = "array_texture"
 path = "examples/shader/array_texture.rs"
diff --git a/assets/shaders/gpu_readback.wgsl b/assets/shaders/gpu_readback.wgsl
new file mode 100644
index 0000000000..4b91c9bcd7
--- /dev/null
+++ b/assets/shaders/gpu_readback.wgsl
@@ -0,0 +1,12 @@
+// This shader is used for the gpu_readback example
+// The actual work it does is not important for the example
+
+// This is the data that lives in the gpu only buffer
+@group(0) @binding(0) var<storage, read_write> data: array<u32>;
+
+@compute @workgroup_size(1)
+fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
+    // We use the global_id to index the array to make sure we don't
+    // access data used in another workgroup
+    data[global_id.x] += 1u;
+}
diff --git a/examples/README.md b/examples/README.md
index 09f79d6524..aef42f6987 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -337,6 +337,7 @@ Example | Description
 [Compute - Game of Life](../examples/shader/compute_shader_game_of_life.rs) | A compute shader that simulates Conway's Game of Life
 [Custom Vertex Attribute](../examples/shader/custom_vertex_attribute.rs) | A shader that reads a mesh's custom vertex attribute
 [Extended Material](../examples/shader/extended_material.rs) | A custom shader that builds on the standard material
+[GPU readback](../examples/shader/gpu_readback.rs) | A very simple compute shader that writes to a buffer that is read by the cpu
 [Instancing](../examples/shader/shader_instancing.rs) | A shader that renders a mesh multiple times in one draw call
 [Material](../examples/shader/shader_material.rs) | A shader and a material that uses it
 [Material](../examples/shader/shader_material_2d.rs) | A shader and a material that uses it on a 2d mesh
diff --git a/examples/shader/gpu_readback.rs b/examples/shader/gpu_readback.rs
new file mode 100644
index 0000000000..8cdfb5e2da
--- /dev/null
+++ b/examples/shader/gpu_readback.rs
@@ -0,0 +1,297 @@
+//! A very simple compute shader that updates a gpu buffer.
+//! That buffer is then copied to the cpu and sent to the main world.
+//!
+//! This example is not meant to teach compute shaders.
+//! It is only meant to explain how to read a gpu buffer on the cpu and then use it in the main world.
+//!
+//! The code is based on this wgpu example:
+//!
+
+use bevy::{
+    prelude::*,
+    render::{
+        render_graph::{self, RenderGraph, RenderLabel},
+        render_resource::{binding_types::storage_buffer, *},
+        renderer::{RenderContext, RenderDevice},
+        Render, RenderApp, RenderSet,
+    },
+};
+use crossbeam_channel::{Receiver, Sender};
+
+// The length of the buffer sent to the gpu
+const BUFFER_LEN: usize = 16;
+
+// To communicate between the main world and the render world we need a channel.
+// Since the main world and render world run in parallel, there will always be a frame of latency
+// between the data sent from the render world and the data received in the main world
+//
+// frame n => render world sends data through the channel at the end of the frame
+// frame n + 1 => main world receives the data
+
+/// This will receive asynchronously any data sent from the render world
+#[derive(Resource, Deref)]
+struct MainWorldReceiver(Receiver<Vec<u32>>);
+
+/// This will send asynchronously any data to the main world
+#[derive(Resource, Deref)]
+struct RenderWorldSender(Sender<Vec<u32>>);
+
+fn main() {
+    App::new()
+        .insert_resource(ClearColor(Color::BLACK))
+        .add_plugins((DefaultPlugins, GpuReadbackPlugin))
+        .add_systems(Update, receive)
+        .run();
+}
+
+/// This system will poll the channel and try to get the data sent from the render world
+fn receive(receiver: Res<MainWorldReceiver>) {
+    // We don't want to block the main world on this,
+    // so we use try_recv which attempts to receive without blocking
+    if let Ok(data) = receiver.try_recv() {
+        println!("Received data from render world: {data:?}");
+    }
+}
+
+// We need a plugin to organize all the systems and render node required for this example
+struct GpuReadbackPlugin;
+impl Plugin for GpuReadbackPlugin {
+    fn build(&self, _app: &mut App) {}
+
+    // The render device is only accessible inside finish().
+    // So we need to initialize render resources here.
+    fn finish(&self, app: &mut App) {
+        let (s, r) = crossbeam_channel::unbounded();
+        app.insert_resource(MainWorldReceiver(r));
+
+        let render_app = app.sub_app_mut(RenderApp);
+        render_app
+            .insert_resource(RenderWorldSender(s))
+            .init_resource::<ComputePipeline>()
+            .init_resource::<Buffers>()
+            .add_systems(
+                Render,
+                (
+                    prepare_bind_group
+                        .in_set(RenderSet::PrepareBindGroups)
+                        // We don't need to recreate the bind group every frame
+                        .run_if(not(resource_exists::<GpuBufferBindGroup>)),
+                    // We need to run it after the render graph is done
+                    // because this needs to happen after submit()
+                    map_and_read_buffer.after(RenderSet::Render),
+                ),
+            );
+
+        // Add the compute node as a top level node to the render graph
+        // This means it will only execute once per frame
+        render_app
+            .world_mut()
+            .resource_mut::<RenderGraph>()
+            .add_node(ComputeNodeLabel, ComputeNode::default());
+    }
+}
+
+#[derive(Resource)]
+struct Buffers {
+    // The buffer that will be used by the compute shader
+    gpu_buffer: Buffer,
+    // The buffer that will be read on the cpu.
+    // The `gpu_buffer` will be copied to this buffer every frame
+    cpu_buffer: Buffer,
+}
+
+impl FromWorld for Buffers {
+    fn from_world(world: &mut World) -> Self {
+        let render_device = world.resource::<RenderDevice>();
+        let mut init_data = encase::StorageBuffer::new(Vec::new());
+        // Init the buffer with 0
+        let data = vec![0; BUFFER_LEN];
+        init_data.write(&data).expect("Failed to write buffer");
+        // The buffer that will be accessed by the gpu
+        let gpu_buffer = render_device.create_buffer_with_data(&BufferInitDescriptor {
+            label: Some("gpu_buffer"),
+            contents: init_data.as_ref(),
+            usage: BufferUsages::STORAGE | BufferUsages::COPY_SRC,
+        });
+        // For portability reasons, WebGPU draws a distinction between memory that is
+        // accessible by the CPU and memory that is accessible by the GPU. Only
+        // buffers accessible by the CPU can be mapped and accessed by the CPU and
+        // only buffers visible to the GPU can be used in shaders. In order to get
+        // data from the GPU, we need to use `CommandEncoder::copy_buffer_to_buffer` to
+        // copy the buffer modified by the GPU into a mappable, CPU-accessible buffer
+        let cpu_buffer = render_device.create_buffer(&BufferDescriptor {
+            label: Some("readback_buffer"),
+            size: (BUFFER_LEN * std::mem::size_of::<u32>()) as u64,
+            usage: BufferUsages::MAP_READ | BufferUsages::COPY_DST,
+            mapped_at_creation: false,
+        });
+
+        Self {
+            gpu_buffer,
+            cpu_buffer,
+        }
+    }
+}
+
+#[derive(Resource)]
+struct GpuBufferBindGroup(BindGroup);
+
+fn prepare_bind_group(
+    mut commands: Commands,
+    pipeline: Res<ComputePipeline>,
+    render_device: Res<RenderDevice>,
+    buffers: Res<Buffers>,
+) {
+    let bind_group = render_device.create_bind_group(
+        None,
+        &pipeline.layout,
+        &BindGroupEntries::single(buffers.gpu_buffer.as_entire_binding()),
+    );
+    commands.insert_resource(GpuBufferBindGroup(bind_group));
+}
+
+#[derive(Resource)]
+struct ComputePipeline {
+    layout: BindGroupLayout,
+    pipeline: CachedComputePipelineId,
+}
+
+impl FromWorld for ComputePipeline {
+    fn from_world(world: &mut World) -> Self {
+        let render_device = world.resource::<RenderDevice>();
+        let layout = render_device.create_bind_group_layout(
+            None,
+            &BindGroupLayoutEntries::single(
+                ShaderStages::COMPUTE,
+                storage_buffer::<Vec<u32>>(false),
+            ),
+        );
+        let shader = world.load_asset("shaders/gpu_readback.wgsl");
+        let pipeline_cache = world.resource::<PipelineCache>();
+        let pipeline = pipeline_cache.queue_compute_pipeline(ComputePipelineDescriptor {
+            label: Some("GPU readback compute shader".into()),
+            layout: vec![layout.clone()],
+            push_constant_ranges: Vec::new(),
+            shader: shader.clone(),
+            shader_defs: Vec::new(),
+            entry_point: "main".into(),
+        });
+        ComputePipeline { layout, pipeline }
+    }
+}
+
+fn map_and_read_buffer(
+    render_device: Res<RenderDevice>,
+    buffers: Res<Buffers>,
+    sender: Res<RenderWorldSender>,
+) {
+    // Finally time to get our data back from the gpu.
+    // First we get a buffer slice which represents a chunk of the buffer (which we
+    // can't access yet).
+    // We want the whole thing so use unbounded range.
+    let buffer_slice = buffers.cpu_buffer.slice(..);
+
+    // Now things get complicated. WebGPU, for safety reasons, only allows either the GPU
+    // or CPU to access a buffer's contents at a time. We need to "map" the buffer which means
+    // flipping ownership of the buffer over to the CPU and making access legal. We do this
+    // with `BufferSlice::map_async`.
+    //
+    // The problem is that map_async is not an async function so we can't await it. What
+    // we need to do instead is pass in a closure that will be executed when the slice is
+    // either mapped or the mapping has failed.
+    //
+    // The problem with this is that we don't have a reliable way to wait in the main
+    // code for the buffer to be mapped and even worse, calling get_mapped_range or
+    // get_mapped_range_mut prematurely will cause a panic, not return an error.
+    //
+    // Using channels solves this as awaiting the receiving of a message from
+    // the passed closure will force the outside code to wait. It also doesn't hurt
+    // if the closure finishes before the outside code catches up as the message is
+    // buffered and receiving will just pick that up.
+    //
+    // It may also be worth noting that although on native, the usage of asynchronous
+    // channels is wholly unnecessary, for the sake of portability to WASM
+    // we'll use async channels that work on both native and WASM.
+
+    let (s, r) = crossbeam_channel::unbounded::<()>();
+
+    // Maps the buffer so it can be read on the cpu
+    buffer_slice.map_async(MapMode::Read, move |r| match r {
+        // This will execute once the gpu is ready, so after the call to poll()
+        Ok(_) => s.send(()).expect("Failed to send map update"),
+        Err(err) => panic!("Failed to map buffer {err}"),
+    });
+
+    // In order for the mapping to be completed, one of three things must happen.
+    // One of those can be calling `Device::poll`. This isn't necessary on the web as devices
+    // are polled automatically but natively, we need to make sure this happens manually.
+    // `Maintain::Wait` will cause the thread to wait on native but not on WebGpu.
+
+    // This blocks until the gpu is done executing everything
+    render_device.poll(Maintain::wait()).panic_on_timeout();
+
+    // This blocks until the buffer is mapped
+    r.recv().expect("Failed to receive the map_async message");
+
+    {
+        let buffer_view = buffer_slice.get_mapped_range();
+        let data = buffer_view
+            .chunks(std::mem::size_of::<u32>())
+            .map(|chunk| u32::from_ne_bytes(chunk.try_into().expect("should be a u32")))
+            .collect::<Vec<u32>>();
+        sender
+            .send(data)
+            .expect("Failed to send data to main world");
+    }
+
+    // We need to make sure all `BufferView`s are dropped before we do what we're about
+    // to do.
+    // Unmap so that we can copy to the staging buffer in the next iteration.
+    buffers.cpu_buffer.unmap();
+}
+
+/// Label to identify the node in the render graph
+#[derive(Debug, Hash, PartialEq, Eq, Clone, RenderLabel)]
+struct ComputeNodeLabel;
+
+/// The node that will execute the compute shader
+#[derive(Default)]
+struct ComputeNode {}
+impl render_graph::Node for ComputeNode {
+    fn run(
+        &self,
+        _graph: &mut render_graph::RenderGraphContext,
+        render_context: &mut RenderContext,
+        world: &World,
+    ) -> Result<(), render_graph::NodeRunError> {
+        let pipeline_cache = world.resource::<PipelineCache>();
+        let pipeline = world.resource::<ComputePipeline>();
+        let bind_group = world.resource::<GpuBufferBindGroup>();
+
+        if let Some(init_pipeline) = pipeline_cache.get_compute_pipeline(pipeline.pipeline) {
+            let mut pass =
+                render_context
+                    .command_encoder()
+                    .begin_compute_pass(&ComputePassDescriptor {
+                        label: Some("GPU readback compute pass"),
+                        ..default()
+                    });
+
+            pass.set_bind_group(0, &bind_group.0, &[]);
+            pass.set_pipeline(init_pipeline);
+            pass.dispatch_workgroups(BUFFER_LEN as u32, 1, 1);
+        }
+
+        // Copy the gpu accessible buffer to the cpu accessible buffer
+        let buffers = world.resource::<Buffers>();
+        render_context.command_encoder().copy_buffer_to_buffer(
+            &buffers.gpu_buffer,
+            0,
+            &buffers.cpu_buffer,
+            0,
+            (BUFFER_LEN * std::mem::size_of::<u32>()) as u64,
+        );
+
+        Ok(())
+    }
+}
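
Note (not part of the patch above): a minimal sketch of how the data arriving over the channel could be consumed in the main world instead of only being printed. `ReadbackResult` and `store_readback` are hypothetical names introduced here for illustration; only `MainWorldReceiver` comes from the example itself.

use bevy::prelude::*;

/// Hypothetical resource holding the most recent readback from the render world.
#[derive(Resource, Default, Deref, DerefMut)]
struct ReadbackResult(Vec<u32>);

/// Hypothetical replacement for the example's `receive` system: drain everything
/// currently buffered in the channel and keep only the newest frame's data.
fn store_readback(receiver: Res<MainWorldReceiver>, mut result: ResMut<ReadbackResult>) {
    // `try_iter` yields all buffered messages without blocking the main world.
    if let Some(data) = receiver.try_iter().last() {
        result.0 = data;
    }
}

Wiring this in would amount to swapping `.add_systems(Update, receive)` for `.init_resource::<ReadbackResult>()` and `.add_systems(Update, store_readback)` in `main()`. Running the example as submitted (`cargo run --example gpu_readback`) should print a `Received data from render world: [...]` line roughly once per frame, with each of the 16 values one higher than in the previous frame, since the shader increments every slot once per dispatch.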