Gpu readback (#15419)

# Objective Adds a new `Readback` component to request for readback of a `Handle<Image>` or `Handle<ShaderStorageBuffer>` to the CPU in a future frame. ## Solution We track the `Readback` component and allocate a target buffer to write the gpu resource into and map it back asynchronously, which then fires a trigger on the entity in the main world. This proccess is asynchronous, and generally takes a few frames. ## Showcase ```rust let mut buffer = ShaderStorageBuffer::from(vec![0u32; 16]); buffer.buffer_description.usage |= BufferUsages::COPY_SRC; let buffer = buffers.add(buffer); commands .spawn(Readback::buffer(buffer.clone())) .observe(|trigger: Trigger<ReadbackComplete>| { info!("Buffer data from previous frame {:?}", trigger.event()); }); ``` --------- Co-authored-by: Kristoffer Søholm <k.soeholm@gmail.com> Co-authored-by: IceSentry <IceSentry@users.noreply.github.com>
2024-11-21 20:23:28 +00:00 · 2024-09-30 10:28:55 -07:00 · 2024-09-30 10:28:55 -07:00 · 40c26f80aa
commit 40c26f80aa
parent dd92a7705d
6 changed files with 485 additions and 221 deletions
--- a/assets/shaders/gpu_readback.wgsl
+++ b/assets/shaders/gpu_readback.wgsl
@ -3,10 +3,13 @@

 // This is the data that lives in the gpu only buffer
@group(0) @binding(0) var<storage, read_write> data: array<u32>;
+@group(0) @binding(1) var texture: texture_storage_2d<r32uint, write>;

@compute @workgroup_size(1)
 fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
    // We use the global_id to index the array to make sure we don't
    // access data used in another workgroup
    data[global_id.x] += 1u;
+    // Write the same data to the texture
+    textureStore(texture, vec2<i32>(i32(global_id.x), 0), vec4<u32>(data[global_id.x], 0, 0, 0));
 }
--- a/crates/bevy_render/src/gpu_readback.rs
+++ b/crates/bevy_render/src/gpu_readback.rs
@ -0,0 +1,370 @@
+use crate::{
+    extract_component::ExtractComponentPlugin,
+    prelude::Image,
+    render_asset::RenderAssets,
+    render_resource::{Buffer, BufferUsages, Extent3d, ImageDataLayout, Texture, TextureFormat},
+    renderer::{render_system, RenderDevice},
+    storage::{GpuShaderStorageBuffer, ShaderStorageBuffer},
+    texture::{GpuImage, TextureFormatPixelInfo},
+    ExtractSchedule, MainWorld, Render, RenderApp, RenderSet,
+};
+use async_channel::{Receiver, Sender};
+use bevy_app::{App, Plugin};
+use bevy_asset::Handle;
+use bevy_derive::{Deref, DerefMut};
+use bevy_ecs::schedule::IntoSystemConfigs;
+use bevy_ecs::{
+    change_detection::ResMut,
+    entity::Entity,
+    event::Event,
+    prelude::{Component, Resource, World},
+    system::{Query, Res},
+};
+use bevy_reflect::Reflect;
+use bevy_render_macros::ExtractComponent;
+use bevy_utils::{default, tracing::warn, HashMap};
+use encase::internal::ReadFrom;
+use encase::private::Reader;
+use encase::ShaderType;
+use wgpu::{CommandEncoder, COPY_BYTES_PER_ROW_ALIGNMENT};
+
+/// A plugin that enables reading back gpu buffers and textures to the cpu.
+pub struct GpuReadbackPlugin {
+    /// Describes the number of frames a buffer can be unused before it is removed from the pool in
+    /// order to avoid unnecessary reallocations.
+    max_unused_frames: usize,
+}
+
+impl Default for GpuReadbackPlugin {
+    fn default() -> Self {
+        Self {
+            max_unused_frames: 10,
+        }
+    }
+}
+
+impl Plugin for GpuReadbackPlugin {
+    fn build(&self, app: &mut App) {
+        app.add_plugins(ExtractComponentPlugin::<Readback>::default());
+
+        if let Some(render_app) = app.get_sub_app_mut(RenderApp) {
+            render_app
+                .init_resource::<GpuReadbackBufferPool>()
+                .init_resource::<GpuReadbacks>()
+                .insert_resource(GpuReadbackMaxUnusedFrames(self.max_unused_frames))
+                .add_systems(ExtractSchedule, sync_readbacks.ambiguous_with_all())
+                .add_systems(
+                    Render,
+                    (
+                        prepare_buffers.in_set(RenderSet::PrepareResources),
+                        map_buffers.after(render_system).in_set(RenderSet::Render),
+                    ),
+                );
+        }
+    }
+}
+
+/// A component that registers the wrapped handle for gpu readback, either a texture or a buffer.
+///
+/// Data is read asynchronously and will be triggered on the entity via the [`ReadbackComplete`] event
+/// when complete. If this component is not removed, the readback will be attempted every frame
+#[derive(Component, ExtractComponent, Clone, Debug)]
+pub enum Readback {
+    Texture(Handle<Image>),
+    Buffer(Handle<ShaderStorageBuffer>),
+}
+
+impl Readback {
+    /// Create a readback component for a texture using the given handle.
+    pub fn texture(image: Handle<Image>) -> Self {
+        Self::Texture(image)
+    }
+
+    /// Create a readback component for a buffer using the given handle.
+    pub fn buffer(buffer: Handle<ShaderStorageBuffer>) -> Self {
+        Self::Buffer(buffer)
+    }
+}
+
+/// An event that is triggered when a gpu readback is complete.
+///
+/// The event contains the data as a `Vec<u8>`, which can be interpreted as the raw bytes of the
+/// requested buffer or texture.
+#[derive(Event, Deref, DerefMut, Reflect, Debug)]
+#[reflect(Debug)]
+pub struct ReadbackComplete(pub Vec<u8>);
+
+impl ReadbackComplete {
+    /// Convert the raw bytes of the event to a shader type.
+    pub fn to_shader_type<T: ShaderType + ReadFrom + Default>(&self) -> T {
+        let mut val = T::default();
+        let mut reader = Reader::new::<T>(&self.0, 0).expect("Failed to create Reader");
+        T::read_from(&mut val, &mut reader);
+        val
+    }
+}
+
+#[derive(Resource)]
+struct GpuReadbackMaxUnusedFrames(usize);
+
+struct GpuReadbackBuffer {
+    buffer: Buffer,
+    taken: bool,
+    frames_unused: usize,
+}
+
+#[derive(Resource, Default)]
+struct GpuReadbackBufferPool {
+    // Map of buffer size to list of buffers, with a flag for whether the buffer is taken and how
+    // many frames it has been unused for.
+    // TODO: We could ideally write all readback data to one big buffer per frame, the assumption
+    // here is that very few entities well actually be read back at once, and their size is
+    // unlikely to change.
+    buffers: HashMap<u64, Vec<GpuReadbackBuffer>>,
+}
+
+impl GpuReadbackBufferPool {
+    fn get(&mut self, render_device: &RenderDevice, size: u64) -> Buffer {
+        let buffers = self.buffers.entry(size).or_default();
+
+        // find an untaken buffer for this size
+        if let Some(buf) = buffers.iter_mut().find(|x| !x.taken) {
+            buf.taken = true;
+            buf.frames_unused = 0;
+            return buf.buffer.clone();
+        }
+
+        let buffer = render_device.create_buffer(&wgpu::BufferDescriptor {
+            label: Some("Readback Buffer"),
+            size,
+            usage: BufferUsages::COPY_DST | BufferUsages::MAP_READ,
+            mapped_at_creation: false,
+        });
+        buffers.push(GpuReadbackBuffer {
+            buffer: buffer.clone(),
+            taken: true,
+            frames_unused: 0,
+        });
+        buffer
+    }
+
+    // Returns the buffer to the pool so it can be used in a future frame
+    fn return_buffer(&mut self, buffer: &Buffer) {
+        let size = buffer.size();
+        let buffers = self
+            .buffers
+            .get_mut(&size)
+            .expect("Returned buffer of untracked size");
+        if let Some(buf) = buffers.iter_mut().find(|x| x.buffer.id() == buffer.id()) {
+            buf.taken = false;
+        } else {
+            warn!("Returned buffer that was not allocated");
+        }
+    }
+
+    fn update(&mut self, max_unused_frames: usize) {
+        for (_, buffers) in &mut self.buffers {
+            // Tick all the buffers
+            for buf in &mut *buffers {
+                if !buf.taken {
+                    buf.frames_unused += 1;
+                }
+            }
+
+            // Remove buffers that haven't been used for MAX_UNUSED_FRAMES
+            buffers.retain(|x| x.frames_unused < max_unused_frames);
+        }
+
+        // Remove empty buffer sizes
+        self.buffers.retain(|_, buffers| !buffers.is_empty());
+    }
+}
+
+enum ReadbackSource {
+    Texture {
+        texture: Texture,
+        layout: ImageDataLayout,
+        size: Extent3d,
+    },
+    Buffer {
+        src_start: u64,
+        dst_start: u64,
+        buffer: Buffer,
+    },
+}
+
+#[derive(Resource, Default)]
+struct GpuReadbacks {
+    requested: Vec<GpuReadback>,
+    mapped: Vec<GpuReadback>,
+}
+
+struct GpuReadback {
+    pub entity: Entity,
+    pub src: ReadbackSource,
+    pub buffer: Buffer,
+    pub rx: Receiver<(Entity, Buffer, Vec<u8>)>,
+    pub tx: Sender<(Entity, Buffer, Vec<u8>)>,
+}
+
+fn sync_readbacks(
+    mut main_world: ResMut<MainWorld>,
+    mut buffer_pool: ResMut<GpuReadbackBufferPool>,
+    mut readbacks: ResMut<GpuReadbacks>,
+    max_unused_frames: Res<GpuReadbackMaxUnusedFrames>,
+) {
+    readbacks.mapped.retain(|readback| {
+        if let Ok((entity, buffer, result)) = readback.rx.try_recv() {
+            main_world.trigger_targets(ReadbackComplete(result), entity);
+            buffer_pool.return_buffer(&buffer);
+            false
+        } else {
+            true
+        }
+    });
+
+    buffer_pool.update(max_unused_frames.0);
+}
+
+fn prepare_buffers(
+    render_device: Res<RenderDevice>,
+    mut readbacks: ResMut<GpuReadbacks>,
+    mut buffer_pool: ResMut<GpuReadbackBufferPool>,
+    gpu_images: Res<RenderAssets<GpuImage>>,
+    ssbos: Res<RenderAssets<GpuShaderStorageBuffer>>,
+    handles: Query<(Entity, &Readback)>,
+) {
+    for (entity, readback) in handles.iter() {
+        match readback {
+            Readback::Texture(image) => {
+                if let Some(gpu_image) = gpu_images.get(image) {
+                    let size = Extent3d {
+                        width: gpu_image.size.x,
+                        height: gpu_image.size.y,
+                        ..default()
+                    };
+                    let layout = layout_data(size.width, size.height, gpu_image.texture_format);
+                    let buffer = buffer_pool.get(
+                        &render_device,
+                        get_aligned_size(
+                            size.width,
+                            size.height,
+                            gpu_image.texture_format.pixel_size() as u32,
+                        ) as u64,
+                    );
+                    let (tx, rx) = async_channel::bounded(1);
+                    readbacks.requested.push(GpuReadback {
+                        entity,
+                        src: ReadbackSource::Texture {
+                            texture: gpu_image.texture.clone(),
+                            layout,
+                            size,
+                        },
+                        buffer,
+                        rx,
+                        tx,
+                    });
+                }
+            }
+            Readback::Buffer(buffer) => {
+                if let Some(ssbo) = ssbos.get(buffer) {
+                    let size = ssbo.buffer.size();
+                    let buffer = buffer_pool.get(&render_device, size);
+                    let (tx, rx) = async_channel::bounded(1);
+                    readbacks.requested.push(GpuReadback {
+                        entity,
+                        src: ReadbackSource::Buffer {
+                            src_start: 0,
+                            dst_start: 0,
+                            buffer: ssbo.buffer.clone(),
+                        },
+                        buffer,
+                        rx,
+                        tx,
+                    });
+                }
+            }
+        }
+    }
+}
+
+pub(crate) fn submit_readback_commands(world: &World, command_encoder: &mut CommandEncoder) {
+    let readbacks = world.resource::<GpuReadbacks>();
+    for readback in &readbacks.requested {
+        match &readback.src {
+            ReadbackSource::Texture {
+                texture,
+                layout,
+                size,
+            } => {
+                command_encoder.copy_texture_to_buffer(
+                    texture.as_image_copy(),
+                    wgpu::ImageCopyBuffer {
+                        buffer: &readback.buffer,
+                        layout: *layout,
+                    },
+                    *size,
+                );
+            }
+            ReadbackSource::Buffer {
+                src_start,
+                dst_start,
+                buffer,
+            } => {
+                command_encoder.copy_buffer_to_buffer(
+                    buffer,
+                    *src_start,
+                    &readback.buffer,
+                    *dst_start,
+                    buffer.size(),
+                );
+            }
+        }
+    }
+}
+
+/// Move requested readbacks to mapped readbacks after commands have been submitted in render system
+fn map_buffers(mut readbacks: ResMut<GpuReadbacks>) {
+    let requested = readbacks.requested.drain(..).collect::<Vec<GpuReadback>>();
+    for readback in requested {
+        let slice = readback.buffer.slice(..);
+        let entity = readback.entity;
+        let buffer = readback.buffer.clone();
+        let tx = readback.tx.clone();
+        slice.map_async(wgpu::MapMode::Read, move |res| {
+            res.expect("Failed to map buffer");
+            let buffer_slice = buffer.slice(..);
+            let data = buffer_slice.get_mapped_range();
+            let result = Vec::from(&*data);
+            drop(data);
+            buffer.unmap();
+            if let Err(e) = tx.try_send((entity, buffer, result)) {
+                warn!("Failed to send readback result: {:?}", e);
+            }
+        });
+        readbacks.mapped.push(readback);
+    }
+}
+
+// Utils
+
+pub(crate) fn align_byte_size(value: u32) -> u32 {
+    value + (COPY_BYTES_PER_ROW_ALIGNMENT - (value % COPY_BYTES_PER_ROW_ALIGNMENT))
+}
+
+pub(crate) fn get_aligned_size(width: u32, height: u32, pixel_size: u32) -> u32 {
+    height * align_byte_size(width * pixel_size)
+}
+
+pub(crate) fn layout_data(width: u32, height: u32, format: TextureFormat) -> ImageDataLayout {
+    ImageDataLayout {
+        bytes_per_row: if height > 1 {
+            // 1 = 1 row
+            Some(get_aligned_size(width, 1, format.pixel_size() as u32))
+        } else {
+            None
+        },
+        rows_per_image: None,
+        ..Default::default()
+    }
+}
--- a/crates/bevy_render/src/lib.rs
+++ b/crates/bevy_render/src/lib.rs
@ -25,6 +25,7 @@ mod extract_param;
 pub mod extract_resource;
 pub mod globals;
 pub mod gpu_component_array_buffer;
+pub mod gpu_readback;
 pub mod mesh;
 #[cfg(not(target_arch = "wasm32"))]
 pub mod pipelined_rendering;
@ -73,6 +74,7 @@ use globals::GlobalsPlugin;
 use render_asset::RenderAssetBytesPerFrame;
 use renderer::{RenderAdapter, RenderAdapterInfo, RenderDevice, RenderQueue};

+use crate::gpu_readback::GpuReadbackPlugin;
 use crate::{
    camera::CameraPlugin,
    mesh::{morph::MorphPlugin, MeshPlugin, RenderMesh},
@ -363,6 +365,7 @@ impl Plugin for RenderPlugin {
            MorphPlugin,
            BatchingPlugin,
            StoragePlugin,
+            GpuReadbackPlugin::default(),
        ));

        app.init_resource::<RenderAssetBytesPerFrame>()
--- a/crates/bevy_render/src/renderer/mod.rs
+++ b/crates/bevy_render/src/renderer/mod.rs
@ -46,6 +46,7 @@ pub fn render_system(world: &mut World, state: &mut SystemState<Query<Entity, Wi
        world,
        |encoder| {
            crate::view::screenshot::submit_screenshot_commands(world, encoder);
+            crate::gpu_readback::submit_readback_commands(world, encoder);
        },
    );

--- a/crates/bevy_render/src/view/window/screenshot.rs
+++ b/crates/bevy_render/src/view/window/screenshot.rs
@ -1,6 +1,7 @@
 use super::ExtractedWindows;
 use crate::{
    camera::{ManualTextureViewHandle, ManualTextureViews, NormalizedRenderTarget, RenderTarget},
+    gpu_readback,
    prelude::{Image, Shader},
    render_asset::{RenderAssetUsages, RenderAssets},
    render_resource::{
@ -38,9 +39,7 @@ use std::{
        Mutex,
    },
 };
-use wgpu::{
-    CommandEncoder, Extent3d, ImageDataLayout, TextureFormat, COPY_BYTES_PER_ROW_ALIGNMENT,
-};
+use wgpu::{CommandEncoder, Extent3d, TextureFormat};

 #[derive(Event, Deref, DerefMut, Reflect, Debug)]
 #[reflect(Debug)]
@ -376,7 +375,8 @@ fn prepare_screenshot_state(
    let texture_view = texture.create_view(&Default::default());
    let buffer = render_device.create_buffer(&wgpu::BufferDescriptor {
        label: Some("screenshot-transfer-buffer"),
-        size: get_aligned_size(size.width, size.height, format.pixel_size() as u32) as u64,
+        size: gpu_readback::get_aligned_size(size.width, size.height, format.pixel_size() as u32)
+            as u64,
        usage: BufferUsages::MAP_READ | BufferUsages::COPY_DST,
        mapped_at_creation: false,
    });
@ -445,27 +445,6 @@ impl Plugin for ScreenshotPlugin {
    }
 }

-pub(crate) fn align_byte_size(value: u32) -> u32 {
-    value + (COPY_BYTES_PER_ROW_ALIGNMENT - (value % COPY_BYTES_PER_ROW_ALIGNMENT))
-}
-
-pub(crate) fn get_aligned_size(width: u32, height: u32, pixel_size: u32) -> u32 {
-    height * align_byte_size(width * pixel_size)
-}
-
-pub(crate) fn layout_data(width: u32, height: u32, format: TextureFormat) -> ImageDataLayout {
-    ImageDataLayout {
-        bytes_per_row: if height > 1 {
-            // 1 = 1 row
-            Some(get_aligned_size(width, 1, format.pixel_size() as u32))
-        } else {
-            None
-        },
-        rows_per_image: None,
-        ..Default::default()
-    }
-}
-
 #[derive(Resource)]
 pub struct ScreenshotToScreenPipeline {
    pub bind_group_layout: BindGroupLayout,
@ -619,7 +598,7 @@ fn render_screenshot(
            prepared_state.texture.as_image_copy(),
            wgpu::ImageCopyBuffer {
                buffer: &prepared_state.buffer,
-                layout: layout_data(width, height, texture_format),
+                layout: gpu_readback::layout_data(width, height, texture_format),
            },
            Extent3d {
                width,
@ -687,7 +666,8 @@ pub(crate) fn collect_screenshots(world: &mut World) {
                // Our buffer has been padded because we needed to align to a multiple of 256.
                // We remove this padding here
                let initial_row_bytes = width as usize * pixel_size;
-                let buffered_row_bytes = align_byte_size(width * pixel_size as u32) as usize;
+                let buffered_row_bytes =
+                    gpu_readback::align_byte_size(width * pixel_size as u32) as usize;

                let mut take_offset = buffered_row_bytes;
                let mut place_offset = initial_row_bytes;
--- a/examples/shader/gpu_readback.rs
+++ b/examples/shader/gpu_readback.rs
@ -1,22 +1,24 @@
-//! A very simple compute shader that updates a gpu buffer.
-//! That buffer is then copied to the cpu and sent to the main world.
-//!
-//! This example is not meant to teach compute shaders.
-//! It is only meant to explain how to read a gpu buffer on the cpu and then use it in the main world.
-//!
-//! The code is based on this wgpu example:
-//! <https://github.com/gfx-rs/wgpu/blob/fb305b85f692f3fbbd9509b648dfbc97072f7465/examples/src/repeated_compute/mod.rs>
+//! Simple example demonstrating the use of the [`Readback`] component to read back data from the GPU
+//! using both a storage buffer and texture.

 use bevy::{
    prelude::*,
    render::{
-        render_graph::{self, RenderGraph, RenderLabel},
-        render_resource::{binding_types::storage_buffer, *},
-        renderer::{RenderContext, RenderDevice, RenderQueue},
+        extract_resource::{ExtractResource, ExtractResourcePlugin},
+        gpu_readback::{Readback, ReadbackComplete},
+        render_asset::{RenderAssetUsages, RenderAssets},
+        render_graph,
+        render_graph::{RenderGraph, RenderLabel},
+        render_resource::{
+            binding_types::{storage_buffer, texture_storage_2d},
+            *,
+        },
+        renderer::{RenderContext, RenderDevice},
+        storage::{GpuShaderStorageBuffer, ShaderStorageBuffer},
+        texture::GpuImage,
        Render, RenderApp, RenderSet,
    },
 };
-use crossbeam_channel::{Receiver, Sender};

 /// This example uses a shader source file from the assets subdirectory
 const SHADER_ASSET_PATH: &str = "shaders/gpu_readback.wgsl";
@ -24,66 +26,33 @@ const SHADER_ASSET_PATH: &str = "shaders/gpu_readback.wgsl";
 // The length of the buffer sent to the gpu
 const BUFFER_LEN: usize = 16;

-// To communicate between the main world and the render world we need a channel.
-// Since the main world and render world run in parallel, there will always be a frame of latency
-// between the data sent from the render world and the data received in the main world
-//
-// frame n => render world sends data through the channel at the end of the frame
-// frame n + 1 => main world receives the data
-
-/// This will receive asynchronously any data sent from the render world
-#[derive(Resource, Deref)]
-struct MainWorldReceiver(Receiver<Vec<u32>>);
-
-/// This will send asynchronously any data to the main world
-#[derive(Resource, Deref)]
-struct RenderWorldSender(Sender<Vec<u32>>);
-
 fn main() {
    App::new()
+        .add_plugins((
+            DefaultPlugins,
+            GpuReadbackPlugin,
+            ExtractResourcePlugin::<ReadbackBuffer>::default(),
+            ExtractResourcePlugin::<ReadbackImage>::default(),
+        ))
        .insert_resource(ClearColor(Color::BLACK))
-        .add_plugins((DefaultPlugins, GpuReadbackPlugin))
-        .add_systems(Update, receive)
+        .add_systems(Startup, setup)
        .run();
 }

-/// This system will poll the channel and try to get the data sent from the render world
-fn receive(receiver: Res<MainWorldReceiver>) {
-    // We don't want to block the main world on this,
-    // so we use try_recv which attempts to receive without blocking
-    if let Ok(data) = receiver.try_recv() {
-        println!("Received data from render world: {data:?}");
-    }
-}
-
 // We need a plugin to organize all the systems and render node required for this example
 struct GpuReadbackPlugin;
 impl Plugin for GpuReadbackPlugin {
    fn build(&self, _app: &mut App) {}

-    // The render device is only accessible inside finish().
-    // So we need to initialize render resources here.
    fn finish(&self, app: &mut App) {
-        let (s, r) = crossbeam_channel::unbounded();
-        app.insert_resource(MainWorldReceiver(r));
-
        let render_app = app.sub_app_mut(RenderApp);
-        render_app
-            .insert_resource(RenderWorldSender(s))
-            .init_resource::<ComputePipeline>()
-            .init_resource::<Buffers>()
-            .add_systems(
-                Render,
-                (
-                    prepare_bind_group
-                        .in_set(RenderSet::PrepareBindGroups)
-                        // We don't need to recreate the bind group every frame
-                        .run_if(not(resource_exists::<GpuBufferBindGroup>)),
-                    // We need to run it after the render graph is done
-                    // because this needs to happen after submit()
-                    map_and_read_buffer.after(RenderSet::Render),
-                ),
-            );
+        render_app.init_resource::<ComputePipeline>().add_systems(
+            Render,
+            prepare_bind_group
+                .in_set(RenderSet::PrepareBindGroups)
+                // We don't need to recreate the bind group every frame
+                .run_if(not(resource_exists::<GpuBufferBindGroup>)),
+        );

        // Add the compute node as a top level node to the render graph
        // This means it will only execute once per frame
@ -94,51 +63,68 @@ impl Plugin for GpuReadbackPlugin {
    }
 }

-/// Holds the buffers that will be used to communicate between the cpu and gpu
-#[derive(Resource)]
-struct Buffers {
-    /// The buffer that will be used by the compute shader
-    ///
-    /// In this example, we want to write a `Vec<u32>` to a `Buffer`. `BufferVec` is a wrapper around a `Buffer`
-    /// that will make sure the data is correctly aligned for the gpu and will simplify uploading the data to the gpu.
-    gpu_buffer: BufferVec<u32>,
-    /// The buffer that will be read on the cpu.
-    /// The `gpu_buffer` will be copied to this buffer every frame
-    cpu_buffer: Buffer,
-}
+#[derive(Resource, ExtractResource, Clone)]
+struct ReadbackBuffer(Handle<ShaderStorageBuffer>);

-impl FromWorld for Buffers {
-    fn from_world(world: &mut World) -> Self {
-        let render_device = world.resource::<RenderDevice>();
-        let render_queue = world.resource::<RenderQueue>();
+#[derive(Resource, ExtractResource, Clone)]
+struct ReadbackImage(Handle<Image>);

-        // Create the buffer that will be accessed by the gpu
-        let mut gpu_buffer = BufferVec::new(BufferUsages::STORAGE | BufferUsages::COPY_SRC);
-        for _ in 0..BUFFER_LEN {
-            // Init the buffer with zeroes
-            gpu_buffer.push(0);
-        }
-        // Write the buffer so the data is accessible on the gpu
-        gpu_buffer.write_buffer(render_device, render_queue);
+fn setup(
+    mut commands: Commands,
+    mut images: ResMut<Assets<Image>>,
+    mut buffers: ResMut<Assets<ShaderStorageBuffer>>,
+) {
+    // Create a storage buffer with some data
+    let buffer = vec![0u32; BUFFER_LEN];
+    let mut buffer = ShaderStorageBuffer::from(buffer);
+    // We need to enable the COPY_SRC usage so we can copy the buffer to the cpu
+    buffer.buffer_description.usage |= BufferUsages::COPY_SRC;
+    let buffer = buffers.add(buffer);

-        // For portability reasons, WebGPU draws a distinction between memory that is
-        // accessible by the CPU and memory that is accessible by the GPU. Only
-        // buffers accessible by the CPU can be mapped and accessed by the CPU and
-        // only buffers visible to the GPU can be used in shaders. In order to get
-        // data from the GPU, we need to use `CommandEncoder::copy_buffer_to_buffer` to
-        // copy the buffer modified by the GPU into a mappable, CPU-accessible buffer
-        let cpu_buffer = render_device.create_buffer(&BufferDescriptor {
-            label: Some("readback_buffer"),
-            size: (BUFFER_LEN * size_of::<u32>()) as u64,
-            usage: BufferUsages::MAP_READ | BufferUsages::COPY_DST,
-            mapped_at_creation: false,
-        });
+    // Create a storage texture with some data
+    let size = Extent3d {
+        width: BUFFER_LEN as u32,
+        height: 1,
+        ..default()
+    };
+    let mut image = Image::new_fill(
+        size,
+        TextureDimension::D2,
+        &[0, 0, 0, 0],
+        TextureFormat::R32Uint,
+        RenderAssetUsages::RENDER_WORLD,
+    );
+    // We also need to enable the COPY_SRC, as well as STORAGE_BINDING so we can use it in the
+    // compute shader
+    image.texture_descriptor.usage |= TextureUsages::COPY_SRC | TextureUsages::STORAGE_BINDING;
+    let image = images.add(image);

-        Self {
-            gpu_buffer,
-            cpu_buffer,
-        }
-    }
+    // Spawn the readback components. For each frame, the data will be read back from the GPU
+    // asynchronously and trigger the `ReadbackComplete` event on this entity. Despawn the entity
+    // to stop reading back the data.
+    commands.spawn(Readback::buffer(buffer.clone())).observe(
+        |trigger: Trigger<ReadbackComplete>| {
+            // This matches the type which was used to create the `ShaderStorageBuffer` above,
+            // and is a convenient way to interpret the data.
+            let data: Vec<u32> = trigger.event().to_shader_type();
+            info!("Buffer {:?}", data);
+        },
+    );
+    // This is just a simple way to pass the buffer handle to the render app for our compute node
+    commands.insert_resource(ReadbackBuffer(buffer));
+
+    // Textures can also be read back from the GPU. Pay careful attention to the format of the
+    // texture, as it will affect how the data is interpreted.
+    commands.spawn(Readback::texture(image.clone())).observe(
+        |trigger: Trigger<ReadbackComplete>| {
+            // You probably want to interpret the data as a color rather than a `ShaderType`,
+            // but in this case we know the data is a single channel storage texture, so we can
+            // interpret it as a `Vec<u32>`
+            let data: Vec<u32> = trigger.event().to_shader_type();
+            info!("Image {:?}", data);
+        },
+    );
+    commands.insert_resource(ReadbackImage(image));
 }

 #[derive(Resource)]
@ -148,18 +134,20 @@ fn prepare_bind_group(
    mut commands: Commands,
    pipeline: Res<ComputePipeline>,
    render_device: Res<RenderDevice>,
-    buffers: Res<Buffers>,
+    buffer: Res<ReadbackBuffer>,
+    image: Res<ReadbackImage>,
+    buffers: Res<RenderAssets<GpuShaderStorageBuffer>>,
+    images: Res<RenderAssets<GpuImage>>,
 ) {
+    let buffer = buffers.get(&buffer.0).unwrap();
+    let image = images.get(&image.0).unwrap();
    let bind_group = render_device.create_bind_group(
        None,
        &pipeline.layout,
-        &BindGroupEntries::single(
-            buffers
-                .gpu_buffer
-                .binding()
-                // We already did it when creating the buffer so this should never happen
-                .expect("Buffer should have already been uploaded to the gpu"),
-        ),
+        &BindGroupEntries::sequential((
+            buffer.buffer.as_entire_buffer_binding(),
+            image.texture_view.into_binding(),
+        )),
    );
    commands.insert_resource(GpuBufferBindGroup(bind_group));
 }
@ -175,9 +163,12 @@ impl FromWorld for ComputePipeline {
        let render_device = world.resource::<RenderDevice>();
        let layout = render_device.create_bind_group_layout(
            None,
-            &BindGroupLayoutEntries::single(
+            &BindGroupLayoutEntries::sequential(
                ShaderStages::COMPUTE,
-                storage_buffer::<Vec<u32>>(false),
+                (
+                    storage_buffer::<Vec<u32>>(false),
+                    texture_storage_2d(TextureFormat::R32Uint, StorageTextureAccess::WriteOnly),
+                ),
            ),
        );
        let shader = world.load_asset(SHADER_ASSET_PATH);
@ -194,76 +185,6 @@ impl FromWorld for ComputePipeline {
    }
 }

-fn map_and_read_buffer(
-    render_device: Res<RenderDevice>,
-    buffers: Res<Buffers>,
-    sender: Res<RenderWorldSender>,
-) {
-    // Finally time to get our data back from the gpu.
-    // First we get a buffer slice which represents a chunk of the buffer (which we
-    // can't access yet).
-    // We want the whole thing so use unbounded range.
-    let buffer_slice = buffers.cpu_buffer.slice(..);
-
-    // Now things get complicated. WebGPU, for safety reasons, only allows either the GPU
-    // or CPU to access a buffer's contents at a time. We need to "map" the buffer which means
-    // flipping ownership of the buffer over to the CPU and making access legal. We do this
-    // with `BufferSlice::map_async`.
-    //
-    // The problem is that map_async is not an async function so we can't await it. What
-    // we need to do instead is pass in a closure that will be executed when the slice is
-    // either mapped or the mapping has failed.
-    //
-    // The problem with this is that we don't have a reliable way to wait in the main
-    // code for the buffer to be mapped and even worse, calling get_mapped_range or
-    // get_mapped_range_mut prematurely will cause a panic, not return an error.
-    //
-    // Using channels solves this as awaiting the receiving of a message from
-    // the passed closure will force the outside code to wait. It also doesn't hurt
-    // if the closure finishes before the outside code catches up as the message is
-    // buffered and receiving will just pick that up.
-    //
-    // It may also be worth noting that although on native, the usage of asynchronous
-    // channels is wholly unnecessary, for the sake of portability to Wasm
-    // we'll use async channels that work on both native and Wasm.
-
-    let (s, r) = crossbeam_channel::unbounded::<()>();
-
-    // Maps the buffer so it can be read on the cpu
-    buffer_slice.map_async(MapMode::Read, move |r| match r {
-        // This will execute once the gpu is ready, so after the call to poll()
-        Ok(_) => s.send(()).expect("Failed to send map update"),
-        Err(err) => panic!("Failed to map buffer {err}"),
-    });
-
-    // In order for the mapping to be completed, one of three things must happen.
-    // One of those can be calling `Device::poll`. This isn't necessary on the web as devices
-    // are polled automatically but natively, we need to make sure this happens manually.
-    // `Maintain::Wait` will cause the thread to wait on native but not on WebGpu.
-
-    // This blocks until the gpu is done executing everything
-    render_device.poll(Maintain::wait()).panic_on_timeout();
-
-    // This blocks until the buffer is mapped
-    r.recv().expect("Failed to receive the map_async message");
-
-    {
-        let buffer_view = buffer_slice.get_mapped_range();
-        let data = buffer_view
-            .chunks(size_of::<u32>())
-            .map(|chunk| u32::from_ne_bytes(chunk.try_into().expect("should be a u32")))
-            .collect::<Vec<u32>>();
-        sender
-            .send(data)
-            .expect("Failed to send data to main world");
-    }
-
-    // We need to make sure all `BufferView`'s are dropped before we do what we're about
-    // to do.
-    // Unmap so that we can copy to the staging buffer in the next iteration.
-    buffers.cpu_buffer.unmap();
-}
-
 /// Label to identify the node in the render graph
 #[derive(Debug, Hash, PartialEq, Eq, Clone, RenderLabel)]
 struct ComputeNodeLabel;
@ -295,20 +216,6 @@ impl render_graph::Node for ComputeNode {
            pass.set_pipeline(init_pipeline);
            pass.dispatch_workgroups(BUFFER_LEN as u32, 1, 1);
        }
-
-        // Copy the gpu accessible buffer to the cpu accessible buffer
-        let buffers = world.resource::<Buffers>();
-        render_context.command_encoder().copy_buffer_to_buffer(
-            buffers
-                .gpu_buffer
-                .buffer()
-                .expect("Buffer should have already been uploaded to the gpu"),
-            0,
-            &buffers.cpu_buffer,
-            0,
-            (BUFFER_LEN * size_of::<u32>()) as u64,
-        );
-
        Ok(())
    }
 }