Optimize CUDA GL interop and provide fallback

2024-12-15 13:52:28 +00:00 · 2021-12-13 21:41:00 -06:00 · 2021-12-13 21:41:00 -06:00 · 36e0468a99
commit 36e0468a99
parent c4d85cf928
4 changed files with 168 additions and 62 deletions
--- a/app/streaming/video/ffmpeg-renderers/cuda.cpp
+++ b/app/streaming/video/ffmpeg-renderers/cuda.cpp
@ -1,13 +1,7 @@
 #include "cuda.h"
 #include <ffnvcodec/dynlink_loader.h>
 #include <SDL_opengl.h>
 extern "C" {
    #include <libavutil/hwcontext_cuda.h>
 }
 CUDARenderer::CUDARenderer()
    : m_HwContext(nullptr)
 {
@ -63,35 +57,47 @@ bool CUDARenderer::isDirectRenderingSupported()
    return false;
 }
-bool CUDARenderer::copyCudaFrameToBoundTexture(AVFrame* frame)
+CUDAGLInteropHelper::CUDAGLInteropHelper(AVHWDeviceContext* context)
    : m_Funcs(nullptr),
      m_Context((AVCUDADeviceContext*)context->hwctx)
 {
-    static CudaFunctions* funcs;
+    memset(m_Resources, 0, sizeof(m_Resources));
    CUresult err;
    AVCUDADeviceContext* devCtx = (AVCUDADeviceContext*)(((AVHWFramesContext*)frame->hw_frames_ctx->data)->device_ctx->hwctx);
    bool ret = false;
-    if (!funcs) {
+    // One-time init of CUDA library
-        // One-time init of CUDA library
+    cuda_load_functions(&m_Funcs, nullptr);
-        cuda_load_functions(&funcs, nullptr);
+    if (m_Funcs == nullptr) {
-        if (!funcs) {
+        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Failed to initialize CUDA library");
-            SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Failed to initialize CUDA library");
+        return;
-            return false;
+    }
-        }
+}
 // Tears down the helper: unregisters any textures still registered with CUDA,
 // then releases the CUDA function table if it was loaded.
 CUDAGLInteropHelper::~CUDAGLInteropHelper()
 {
    // Must run before the function table is freed, because unregisterTextures()
    // calls through m_Funcs. It returns early on its own when m_Funcs is null.
    unregisterTextures();
    // m_Funcs is null when cuda_load_functions() failed in the constructor
    if (m_Funcs != nullptr) {
        cuda_free_functions(&m_Funcs);
    }
 }
 bool CUDAGLInteropHelper::registerBoundTextures()
 {
    int err;
    if (m_Funcs == nullptr) {
        // Already logged in constructor
        return false;
    }
    SDL_assert(frame->format == AV_PIX_FMT_CUDA);
    // Push FFmpeg's CUDA context to use for our CUDA operations
-    err = funcs->cuCtxPushCurrent(devCtx->cuda_ctx);
+    err = m_Funcs->cuCtxPushCurrent(m_Context->cuda_ctx);
    if (err != CUDA_SUCCESS) {
        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuCtxPushCurrent() failed: %d", err);
        return false;
    }
-    // NV12 has 2 planes
+    // Register each plane as a separate resource
-    for (int i = 0; i < 2; i++) {
+    for (int i = 0; i < NV12_PLANES; i++) {
        CUgraphicsResource cudaResource;
        CUarray cudaArray;
        GLint tex;
        // Get the ID of this plane's texture
@ -99,29 +105,86 @@ bool CUDARenderer::copyCudaFrameToBoundTexture(AVFrame* frame)
        glGetIntegerv(GL_TEXTURE_BINDING_2D, &tex);
        // Register it with CUDA
-        err = funcs->cuGraphicsGLRegisterImage(&cudaResource, tex, GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD);
+        err = m_Funcs->cuGraphicsGLRegisterImage(&m_Resources[i], tex, GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD);
        if (err != CUDA_SUCCESS) {
            SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuGraphicsGLRegisterImage() failed: %d", err);
            m_Resources[i] = 0;
            unregisterTextures();
            goto Exit;
        }
    }
-        // Map it to allow us to use it as a copy destination
+Exit:
-        err = funcs->cuGraphicsMapResources(1, &cudaResource, devCtx->stream);
+    {
-        if (err != CUDA_SUCCESS) {
+        CUcontext dummy;
-            SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuGraphicsMapResources() failed: %d", err);
+        m_Funcs->cuCtxPopCurrent(&dummy);
-            funcs->cuGraphicsUnregisterResource(cudaResource);
+    }
-            goto Exit;
+    return err == CUDA_SUCCESS;
 }
 // Unregisters every plane resource currently held in m_Resources and resets
 // the corresponding slots to 0. Returns without doing anything when the CUDA
 // function table is unavailable or FFmpeg's CUDA context cannot be pushed.
 void CUDAGLInteropHelper::unregisterTextures()
 {
    int err;
    if (m_Funcs == nullptr) {
        // Already logged in constructor
        return;
    }
    // Push FFmpeg's CUDA context to use for our CUDA operations
    err = m_Funcs->cuCtxPushCurrent(m_Context->cuda_ctx);
    if (err != CUDA_SUCCESS) {
        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuCtxPushCurrent() failed: %d", err);
        return;
    }
    // A slot value of 0 marks a plane with no registered resource, so it is skipped
    for (int i = 0; i < NV12_PLANES; i++) {
        if (m_Resources[i] != 0) {
            // The result of cuGraphicsUnregisterResource() is not checked here
            m_Funcs->cuGraphicsUnregisterResource(m_Resources[i]);
            m_Resources[i] = 0;
        }
    }
-        // Get a pointer to the mapped array
+    {
-        err = funcs->cuGraphicsSubResourceGetMappedArray(&cudaArray, cudaResource, 0, 0);
+        CUcontext dummy;
        // Balances the cuCtxPushCurrent() above; the popped handle is discarded
        m_Funcs->cuCtxPopCurrent(&dummy);
    }
 }
 bool CUDAGLInteropHelper::copyCudaFrameToTextures(AVFrame* frame)
 {
    int err;
    if (m_Funcs == nullptr) {
        // Already logged in constructor
        return false;
    }
    // Push FFmpeg's CUDA context to use for our CUDA operations
    err = m_Funcs->cuCtxPushCurrent(m_Context->cuda_ctx);
    if (err != CUDA_SUCCESS) {
        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuCtxPushCurrent() failed: %d", err);
        return false;
    }
    // Map our resources
    err = m_Funcs->cuGraphicsMapResources(NV12_PLANES, m_Resources, m_Context->stream);
    if (err != CUDA_SUCCESS) {
        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuGraphicsMapResources() failed: %d", err);
        goto PopCtxExit;
    }
    for (int i = 0; i < NV12_PLANES; i++) {
        CUarray cudaArray;
        // Get a pointer to the mapped array for this plane
        err = m_Funcs->cuGraphicsSubResourceGetMappedArray(&cudaArray, m_Resources[i], 0, 0);
        if (err != CUDA_SUCCESS) {
            SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuGraphicsSubResourceGetMappedArray() failed: %d", err);
-            funcs->cuGraphicsUnmapResources(1, &cudaResource, devCtx->stream);
+            goto UnmapExit;
            funcs->cuGraphicsUnregisterResource(cudaResource);
            goto Exit;
        }
        // Do the copy
        CUDA_MEMCPY2D cu2d = {
            .srcMemoryType = CU_MEMORYTYPE_DEVICE,
            .srcDevice = (CUdeviceptr)frame->data[i],
@ -132,27 +195,19 @@ bool CUDARenderer::copyCudaFrameToBoundTexture(AVFrame* frame)
            .WidthInBytes = (size_t)frame->width,
            .Height = (size_t)frame->height >> i
        };
-
+        err = m_Funcs->cuMemcpy2D(&cu2d);
        // Do the copy
        err = funcs->cuMemcpy2D(&cu2d);
        if (err != CUDA_SUCCESS) {
            SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuMemcpy2D() failed: %d", err);
-            funcs->cuGraphicsUnmapResources(1, &cudaResource, devCtx->stream);
+            goto UnmapExit;
            funcs->cuGraphicsUnregisterResource(cudaResource);
            goto Exit;
        }
        funcs->cuGraphicsUnmapResources(1, &cudaResource, devCtx->stream);
        funcs->cuGraphicsUnregisterResource(cudaResource);
    }
-    ret = true;
+UnmapExit:
-
+    m_Funcs->cuGraphicsUnmapResources(NV12_PLANES, m_Resources, m_Context->stream);
-Exit:
+PopCtxExit:
    {
        CUcontext dummy;
-        funcs->cuCtxPopCurrent(&dummy);
+        m_Funcs->cuCtxPopCurrent(&dummy);
    }
-    return ret;
+    return err == CUDA_SUCCESS;
 }
--- a/app/streaming/video/ffmpeg-renderers/cuda.h
+++ b/app/streaming/video/ffmpeg-renderers/cuda.h
@ -2,6 +2,12 @@
 #include "renderer.h"
 #include <ffnvcodec/dynlink_loader.h>
 extern "C" {
    #include <libavutil/hwcontext_cuda.h>
 }
 class CUDARenderer : public IFFmpegRenderer {
 public:
    CUDARenderer();
@ -12,10 +18,25 @@ public:
    virtual bool needsTestFrame() override;
    virtual bool isDirectRenderingSupported() override;
    // Helper function used by SDLRenderer to read our CUDA frame
    static bool copyCudaFrameToBoundTexture(AVFrame* frame);
 private:
    AVBufferRef* m_HwContext;
 };
 #define NV12_PLANES 2
 // Helper class used by SDLRenderer to read our CUDA frame.
 // It keeps one CUDA graphics resource per NV12 plane so that textures are
 // registered once and then reused for each frame copy.
 class CUDAGLInteropHelper {
 public:
    // Stores context->hwctx as the AVCUDADeviceContext and loads the CUDA
    // function table; a load failure is logged and leaves m_Funcs null.
    CUDAGLInteropHelper(AVHWDeviceContext* context);
    // Unregisters any registered textures and frees the CUDA function table.
    ~CUDAGLInteropHelper();
    // Registers the currently bound GL texture of each plane with CUDA.
    // Returns true only if the last CUDA call reported CUDA_SUCCESS.
    bool registerBoundTextures();
    // Unregisters all plane resources and zeroes their m_Resources slots.
    void unregisterTextures();
    // Maps the registered resources, copies each plane of the frame into them
    // with cuMemcpy2D, and unmaps them. Returns false on any CUDA failure.
    bool copyCudaFrameToTextures(AVFrame* frame);
 private:
    // CUDA function table from cuda_load_functions(); null if loading failed
    CudaFunctions* m_Funcs;
    // FFmpeg's CUDA device context; provides cuda_ctx and stream for our calls
    AVCUDADeviceContext* m_Context;
    // One registered graphics resource per plane; 0 means not registered
    CUgraphicsResource m_Resources[NV12_PLANES];
 };
--- a/app/streaming/video/ffmpeg-renderers/sdlvid.cpp
+++ b/app/streaming/video/ffmpeg-renderers/sdlvid.cpp
@ -5,20 +5,26 @@
 #include <Limelight.h>
 #ifdef HAVE_CUDA
 #include "cuda.h"
 #endif
 // Initializes the renderer with no SDL renderer or texture, no software pixel
 // format selected, and all overlay texture slots zeroed.
 SdlRenderer::SdlRenderer()
    : m_Renderer(nullptr),
      m_Texture(nullptr),
      m_SwPixelFormat(AV_PIX_FMT_NONE)
 {
    // Zeroed slots let the destructor's nullptr checks skip unused overlays
    SDL_zero(m_OverlayTextures);
 #ifdef HAVE_CUDA
    // The helper is allocated later in renderFrame() when a CUDA frame is seen
    m_CudaGLHelper = nullptr;
 #endif
 }
 SdlRenderer::~SdlRenderer()
 {
 #ifdef HAVE_CUDA
    if (m_CudaGLHelper != nullptr) {
        delete m_CudaGLHelper;
    }
 #endif
    for (int i = 0; i < Overlay::OverlayMax; i++) {
        if (m_OverlayTextures[i] != nullptr) {
            SDL_DestroyTexture(m_OverlayTextures[i]);
@ -208,6 +214,7 @@ void SdlRenderer::renderFrame(AVFrame* frame)
    }
    if (frame->hw_frames_ctx != nullptr && frame->format != AV_PIX_FMT_CUDA) {
 ReadbackRetry:
        // If we are acting as the frontend for a hardware
        // accelerated decoder, we'll need to read the frame
        // back to render it.
@ -293,13 +300,28 @@ void SdlRenderer::renderFrame(AVFrame* frame)
                         SDL_GetError());
            goto Exit;
        }
 #ifdef HAVE_CUDA
        if (frame->format == AV_PIX_FMT_CUDA) {
            SDL_assert(m_CudaGLHelper == nullptr);
            m_CudaGLHelper = new CUDAGLInteropHelper(((AVHWFramesContext*)frame->hw_frames_ctx->data)->device_ctx);
            SDL_GL_BindTexture(m_Texture, nullptr, nullptr);
            if (!m_CudaGLHelper->registerBoundTextures()) {
                // If we can't register textures, fall back to normal read-back rendering
                delete m_CudaGLHelper;
                m_CudaGLHelper = nullptr;
            }
            SDL_GL_UnbindTexture(m_Texture);
        }
 #endif
    }
    if (frame->format == AV_PIX_FMT_CUDA) {
 #ifdef HAVE_CUDA
-        SDL_GL_BindTexture(m_Texture, nullptr, nullptr);
+        if (m_CudaGLHelper == nullptr || !m_CudaGLHelper->copyCudaFrameToTextures(frame)) {
-        CUDARenderer::copyCudaFrameToBoundTexture(frame);
+            goto ReadbackRetry;
-        SDL_GL_UnbindTexture(m_Texture);
+        }
 #else
        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION,
                     "Got CUDA frame, but not built with CUDA support!");
--- a/app/streaming/video/ffmpeg-renderers/sdlvid.h
+++ b/app/streaming/video/ffmpeg-renderers/sdlvid.h
@ -2,6 +2,10 @@
 #include "renderer.h"
 #ifdef HAVE_CUDA
 #include "cuda.h"
 #endif
 class SdlRenderer : public IFFmpegRenderer {
 public:
    SdlRenderer();
@ -21,5 +25,9 @@ private:
    int m_SwPixelFormat;
    SDL_Texture* m_OverlayTextures[Overlay::OverlayMax];
    SDL_Rect m_OverlayRects[Overlay::OverlayMax];
 #ifdef HAVE_CUDA
    CUDAGLInteropHelper* m_CudaGLHelper;
 #endif
 };