Add gpu readback example (#12877)

# Objective - It's pretty common for users to want to read data back from the gpu and into the main world ## Solution - Add a simple example that shows how to read data back from the gpu and send it to the main world using a channel. - The example is largely based on this wgpu example but adapted to bevy - fb305b85f6/examples/src/repeated_compute/mod.rs --------- Co-authored-by: stormy <120167078+stowmyy@users.noreply.github.com> Co-authored-by: Torstein Grindvik <52322338+torsteingrindvik@users.noreply.github.com>
2024-11-21 20:23:28 +00:00 · 2024-04-08 13:08:20 -04:00 · 2024-04-08 13:08:20 -04:00 · 08b41878d7
commit 08b41878d7
parent 2ee69807b1
4 changed files with 321 additions and 0 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -2125,6 +2125,17 @@ description = "A compute shader that simulates Conway's Game of Life"
 category = "Shaders"
 wasm = false

+[[example]]
+name = "gpu_readback"
+path = "examples/shader/gpu_readback.rs"
+doc-scrape-examples = true
+
+[package.metadata.example.gpu_readback]
+name = "GPU readback"
+description = "A very simple compute shader that writes to a buffer that is read by the cpu"
+category = "Shaders"
+wasm = false
+
 [[example]]
 name = "array_texture"
 path = "examples/shader/array_texture.rs"
--- a/assets/shaders/gpu_readback.wgsl
+++ b/assets/shaders/gpu_readback.wgsl
@ -0,0 +1,12 @@
+// This shader is used for the gpu_readback example
+// The actual work it does is not important for the example
+
+// This is the data that lives in the gpu only buffer
+@group(0) @binding(0) var<storage, read_write> data: array<u32>;
+
+@compute @workgroup_size(1)
+fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
+    // We use the global_id to index the array to make sure we don't
+    // access data used in another workgroup
+    data[global_id.x] += 1u;
+}
--- a/examples/README.md
+++ b/examples/README.md
@ -337,6 +337,7 @@ Example | Description
 [Compute - Game of Life](../examples/shader/compute_shader_game_of_life.rs) | A compute shader that simulates Conway's Game of Life
 [Custom Vertex Attribute](../examples/shader/custom_vertex_attribute.rs) | A shader that reads a mesh's custom vertex attribute
 [Extended Material](../examples/shader/extended_material.rs) | A custom shader that builds on the standard material
+[GPU readback](../examples/shader/gpu_readback.rs) | A very simple compute shader that writes to a buffer that is read by the cpu
 [Instancing](../examples/shader/shader_instancing.rs) | A shader that renders a mesh multiple times in one draw call
 [Material](../examples/shader/shader_material.rs) | A shader and a material that uses it
 [Material](../examples/shader/shader_material_2d.rs) | A shader and a material that uses it on a 2d mesh
--- a/examples/shader/gpu_readback.rs
+++ b/examples/shader/gpu_readback.rs
@ -0,0 +1,297 @@
+//! A very simple compute shader that updates a gpu buffer.
+//! That buffer is then copied to the cpu and sent to the main world.
+//!
+//! This example is not meant to teach compute shaders.
+//! It is only meant to explain how to read a gpu buffer on the cpu and then use it in the main world.
+//!
+//! The code is based on this wgpu example:
+//! <https://github.com/gfx-rs/wgpu/blob/fb305b85f692f3fbbd9509b648dfbc97072f7465/examples/src/repeated_compute/mod.rs>
+
+use bevy::{
+    prelude::*,
+    render::{
+        render_graph::{self, RenderGraph, RenderLabel},
+        render_resource::{binding_types::storage_buffer, *},
+        renderer::{RenderContext, RenderDevice},
+        Render, RenderApp, RenderSet,
+    },
+};
+use crossbeam_channel::{Receiver, Sender};
+
+// The length of the buffer sent to the gpu
+const BUFFER_LEN: usize = 16;
+
+// To communicate between the main world and the render world we need a channel.
+// Since the main world and render world run in parallel, there will always be a frame of latency
+// between the data sent from the render world and the data received in the main world
+//
+// frame n => render world sends data through the channel at the end of the frame
+// frame n + 1 => main world receives the data
+
+/// This will receive asynchronously any data sent from the render world
+#[derive(Resource, Deref)]
+struct MainWorldReceiver(Receiver<Vec<u32>>);
+
+/// This will send asynchronously any data to the main world
+#[derive(Resource, Deref)]
+struct RenderWorldSender(Sender<Vec<u32>>);
+
+fn main() {
+    App::new()
+        .insert_resource(ClearColor(Color::BLACK))
+        .add_plugins((DefaultPlugins, GpuReadbackPlugin))
+        .add_systems(Update, receive)
+        .run();
+}
+
+/// This system will poll the channel and try to get the data sent from the render world
+fn receive(receiver: Res<MainWorldReceiver>) {
+    // We don't want to block the main world on this,
+    // so we use try_recv which attempts to receive without blocking
+    if let Ok(data) = receiver.try_recv() {
+        println!("Received data from render world: {data:?}");
+    }
+}
+
+// We need a plugin to organize all the systems and render node required for this example
+struct GpuReadbackPlugin;
+impl Plugin for GpuReadbackPlugin {
+    fn build(&self, _app: &mut App) {}
+
+    // The render device is only accessible inside finish().
+    // So we need to initialize render resources here.
+    fn finish(&self, app: &mut App) {
+        let (s, r) = crossbeam_channel::unbounded();
+        app.insert_resource(MainWorldReceiver(r));
+
+        let render_app = app.sub_app_mut(RenderApp);
+        render_app
+            .insert_resource(RenderWorldSender(s))
+            .init_resource::<ComputePipeline>()
+            .init_resource::<Buffers>()
+            .add_systems(
+                Render,
+                (
+                    prepare_bind_group
+                        .in_set(RenderSet::PrepareBindGroups)
+                        // We don't need to recreate the bind group every frame
+                        .run_if(not(resource_exists::<GpuBufferBindGroup>)),
+                    // We need to run it after the render graph is done
+                    // because this needs to happen after submit()
+                    map_and_read_buffer.after(RenderSet::Render),
+                ),
+            );
+
+        // Add the compute node as a top level node to the render graph
+        // This means it will only execute once per frame
+        render_app
+            .world_mut()
+            .resource_mut::<RenderGraph>()
+            .add_node(ComputeNodeLabel, ComputeNode::default());
+    }
+}
+
+#[derive(Resource)]
+struct Buffers {
+    // The buffer that will be used by the compute shader
+    gpu_buffer: Buffer,
+    // The buffer that will be read on the cpu.
+    // The `gpu_buffer` will be copied to this buffer every frame
+    cpu_buffer: Buffer,
+}
+
+impl FromWorld for Buffers {
+    fn from_world(world: &mut World) -> Self {
+        let render_device = world.resource::<RenderDevice>();
+        let mut init_data = encase::StorageBuffer::new(Vec::new());
+        // Init the buffer with 0
+        let data = vec![0; BUFFER_LEN];
+        init_data.write(&data).expect("Failed to write buffer");
+        // The buffer that will be accessed by the gpu
+        let gpu_buffer = render_device.create_buffer_with_data(&BufferInitDescriptor {
+            label: Some("gpu_buffer"),
+            contents: init_data.as_ref(),
+            usage: BufferUsages::STORAGE | BufferUsages::COPY_SRC,
+        });
+        // For portability reasons, WebGPU draws a distinction between memory that is
+        // accessible by the CPU and memory that is accessible by the GPU. Only
+        // buffers accessible by the CPU can be mapped and accessed by the CPU and
+        // only buffers visible to the GPU can be used in shaders. In order to get
+        // data from the GPU, we need to use `CommandEncoder::copy_buffer_to_buffer` to
+        // copy the buffer modified by the GPU into a mappable, CPU-accessible buffer
+        let cpu_buffer = render_device.create_buffer(&BufferDescriptor {
+            label: Some("readback_buffer"),
+            size: (BUFFER_LEN * std::mem::size_of::<u32>()) as u64,
+            usage: BufferUsages::MAP_READ | BufferUsages::COPY_DST,
+            mapped_at_creation: false,
+        });
+
+        Self {
+            gpu_buffer,
+            cpu_buffer,
+        }
+    }
+}
+
+#[derive(Resource)]
+struct GpuBufferBindGroup(BindGroup);
+
+fn prepare_bind_group(
+    mut commands: Commands,
+    pipeline: Res<ComputePipeline>,
+    render_device: Res<RenderDevice>,
+    buffers: Res<Buffers>,
+) {
+    let bind_group = render_device.create_bind_group(
+        None,
+        &pipeline.layout,
+        &BindGroupEntries::single(buffers.gpu_buffer.as_entire_binding()),
+    );
+    commands.insert_resource(GpuBufferBindGroup(bind_group));
+}
+
+#[derive(Resource)]
+struct ComputePipeline {
+    layout: BindGroupLayout,
+    pipeline: CachedComputePipelineId,
+}
+
+impl FromWorld for ComputePipeline {
+    fn from_world(world: &mut World) -> Self {
+        let render_device = world.resource::<RenderDevice>();
+        let layout = render_device.create_bind_group_layout(
+            None,
+            &BindGroupLayoutEntries::single(
+                ShaderStages::COMPUTE,
+                storage_buffer::<Vec<u32>>(false),
+            ),
+        );
+        let shader = world.load_asset("shaders/gpu_readback.wgsl");
+        let pipeline_cache = world.resource::<PipelineCache>();
+        let pipeline = pipeline_cache.queue_compute_pipeline(ComputePipelineDescriptor {
+            label: Some("GPU readback compute shader".into()),
+            layout: vec![layout.clone()],
+            push_constant_ranges: Vec::new(),
+            shader: shader.clone(),
+            shader_defs: Vec::new(),
+            entry_point: "main".into(),
+        });
+        ComputePipeline { layout, pipeline }
+    }
+}
+
+fn map_and_read_buffer(
+    render_device: Res<RenderDevice>,
+    buffers: Res<Buffers>,
+    sender: Res<RenderWorldSender>,
+) {
+    // Finally time to get our data back from the gpu.
+    // First we get a buffer slice which represents a chunk of the buffer (which we
+    // can't access yet).
+    // We want the whole thing so use unbounded range.
+    let buffer_slice = buffers.cpu_buffer.slice(..);
+
+    // Now things get complicated. WebGPU, for safety reasons, only allows either the GPU
+    // or CPU to access a buffer's contents at a time. We need to "map" the buffer which means
+    // flipping ownership of the buffer over to the CPU and making access legal. We do this
+    // with `BufferSlice::map_async`.
+    //
+    // The problem is that map_async is not an async function so we can't await it. What
+    // we need to do instead is pass in a closure that will be executed when the slice is
+    // either mapped or the mapping has failed.
+    //
+    // The problem with this is that we don't have a reliable way to wait in the main
+    // code for the buffer to be mapped and even worse, calling get_mapped_range or
+    // get_mapped_range_mut prematurely will cause a panic, not return an error.
+    //
+    // Using channels solves this as awaiting the receiving of a message from
+    // the passed closure will force the outside code to wait. It also doesn't hurt
+    // if the closure finishes before the outside code catches up as the message is
+    // buffered and receiving will just pick that up.
+    //
+    // It may also be worth noting that although on native, the usage of asynchronous
+    // channels is wholly unnecessary, for the sake of portability to WASM
+    // we'll use async channels that work on both native and WASM.
+
+    let (s, r) = crossbeam_channel::unbounded::<()>();
+
+    // Maps the buffer so it can be read on the cpu
+    buffer_slice.map_async(MapMode::Read, move |r| match r {
+        // This will execute once the gpu is ready, so after the call to poll()
+        Ok(_) => s.send(()).expect("Failed to send map update"),
+        Err(err) => panic!("Failed to map buffer {err}"),
+    });
+
+    // In order for the mapping to be completed, one of three things must happen.
+    // One of those can be calling `Device::poll`. This isn't necessary on the web as devices
+    // are polled automatically but natively, we need to make sure this happens manually.
+    // `Maintain::Wait` will cause the thread to wait on native but not on WebGpu.
+
+    // This blocks until the gpu is done executing everything
+    render_device.poll(Maintain::wait()).panic_on_timeout();
+
+    // This blocks until the buffer is mapped
+    r.recv().expect("Failed to receive the map_async message");
+
+    {
+        let buffer_view = buffer_slice.get_mapped_range();
+        let data = buffer_view
+            .chunks(std::mem::size_of::<u32>())
+            .map(|chunk| u32::from_ne_bytes(chunk.try_into().expect("should be a u32")))
+            .collect::<Vec<u32>>();
+        sender
+            .send(data)
+            .expect("Failed to send data to main world");
+    }
+
+    // We need to make sure all `BufferView`'s are dropped before we do what we're about
+    // to do.
+    // Unmap so that we can copy to the staging buffer in the next iteration.
+    buffers.cpu_buffer.unmap();
+}
+
+/// Label to identify the node in the render graph
+#[derive(Debug, Hash, PartialEq, Eq, Clone, RenderLabel)]
+struct ComputeNodeLabel;
+
+/// The node that will execute the compute shader
+#[derive(Default)]
+struct ComputeNode {}
+impl render_graph::Node for ComputeNode {
+    fn run(
+        &self,
+        _graph: &mut render_graph::RenderGraphContext,
+        render_context: &mut RenderContext,
+        world: &World,
+    ) -> Result<(), render_graph::NodeRunError> {
+        let pipeline_cache = world.resource::<PipelineCache>();
+        let pipeline = world.resource::<ComputePipeline>();
+        let bind_group = world.resource::<GpuBufferBindGroup>();
+
+        if let Some(init_pipeline) = pipeline_cache.get_compute_pipeline(pipeline.pipeline) {
+            let mut pass =
+                render_context
+                    .command_encoder()
+                    .begin_compute_pass(&ComputePassDescriptor {
+                        label: Some("GPU readback compute pass"),
+                        ..default()
+                    });
+
+            pass.set_bind_group(0, &bind_group.0, &[]);
+            pass.set_pipeline(init_pipeline);
+            pass.dispatch_workgroups(BUFFER_LEN as u32, 1, 1);
+        }
+
+        // Copy the gpu accessible buffer to the cpu accessible buffer
+        let buffers = world.resource::<Buffers>();
+        render_context.command_encoder().copy_buffer_to_buffer(
+            &buffers.gpu_buffer,
+            0,
+            &buffers.cpu_buffer,
+            0,
+            (BUFFER_LEN * std::mem::size_of::<u32>()) as u64,
+        );
+
+        Ok(())
+    }
+}