From 36e0468a993778b534076c2e44c31739894eafb3 Mon Sep 17 00:00:00 2001
From: Cameron Gutman <aicommander@gmail.com>
Date: Mon, 13 Dec 2021 21:41:00 -0600
Subject: [PATCH] Optimize CUDA GL interop and provide fallback

---
 app/streaming/video/ffmpeg-renderers/cuda.cpp | 159 ++++++++++++------
 app/streaming/video/ffmpeg-renderers/cuda.h   |  27 ++-
 .../video/ffmpeg-renderers/sdlvid.cpp         |  36 +++-
 app/streaming/video/ffmpeg-renderers/sdlvid.h |   8 +
 4 files changed, 168 insertions(+), 62 deletions(-)

diff --git a/app/streaming/video/ffmpeg-renderers/cuda.cpp b/app/streaming/video/ffmpeg-renderers/cuda.cpp
index c1679301..035a51b9 100644
--- a/app/streaming/video/ffmpeg-renderers/cuda.cpp
+++ b/app/streaming/video/ffmpeg-renderers/cuda.cpp
@@ -1,13 +1,7 @@
 #include "cuda.h"
 
-#include <ffnvcodec/dynlink_loader.h>
-
 #include <SDL_opengl.h>
 
-extern "C" {
-    #include <libavutil/hwcontext_cuda.h>
-}
-
 CUDARenderer::CUDARenderer()
     : m_HwContext(nullptr)
 {
@@ -63,35 +57,47 @@ bool CUDARenderer::isDirectRenderingSupported()
     return false;
 }
 
-bool CUDARenderer::copyCudaFrameToBoundTexture(AVFrame* frame)
+CUDAGLInteropHelper::CUDAGLInteropHelper(AVHWDeviceContext* context)
+    : m_Funcs(nullptr),
+      m_Context((AVCUDADeviceContext*)context->hwctx) // pointer is stored as-is; no reference is taken here
 {
-    static CudaFunctions* funcs;
-    CUresult err;
-    AVCUDADeviceContext* devCtx = (AVCUDADeviceContext*)(((AVHWFramesContext*)frame->hw_frames_ctx->data)->device_ctx->hwctx);
-    bool ret = false;
+    memset(m_Resources, 0, sizeof(m_Resources)); // 0 marks a plane slot as not registered
 
-    if (!funcs) {
-        // One-time init of CUDA library
-        cuda_load_functions(&funcs, nullptr);
-        if (!funcs) {
-            SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Failed to initialize CUDA library");
-            return false;
-        }
+    // Load the CUDA function table for this helper; a null m_Funcs afterwards means failure
+    cuda_load_functions(&m_Funcs, nullptr);
+    if (m_Funcs == nullptr) {
+        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Failed to initialize CUDA library");
+        return; // every other method checks m_Funcs and bails out early
+    }
+}
+
+CUDAGLInteropHelper::~CUDAGLInteropHelper()
+{
+    unregisterTextures(); // must run first: it still needs m_Funcs to call into CUDA
+
+    if (m_Funcs != nullptr) {
+        cuda_free_functions(&m_Funcs); // counterpart of cuda_load_functions() in the constructor
+    }
+}
+
+bool CUDAGLInteropHelper::registerBoundTextures()
+{
+    int err;
+
+    if (m_Funcs == nullptr) {
+        // Already logged in constructor
+        return false;
     }
 
-    SDL_assert(frame->format == AV_PIX_FMT_CUDA);
-
     // Push FFmpeg's CUDA context to use for our CUDA operations
-    err = funcs->cuCtxPushCurrent(devCtx->cuda_ctx);
+    err = m_Funcs->cuCtxPushCurrent(m_Context->cuda_ctx);
     if (err != CUDA_SUCCESS) {
         SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuCtxPushCurrent() failed: %d", err);
         return false;
     }
 
-    // NV12 has 2 planes
-    for (int i = 0; i < 2; i++) {
-        CUgraphicsResource cudaResource;
-        CUarray cudaArray;
+    // Register each plane as a separate resource
+    for (int i = 0; i < NV12_PLANES; i++) {
         GLint tex;
 
         // Get the ID of this plane's texture
@@ -99,29 +105,86 @@ bool CUDARenderer::copyCudaFrameToBoundTexture(AVFrame* frame)
         glGetIntegerv(GL_TEXTURE_BINDING_2D, &tex);
 
         // Register it with CUDA
-        err = funcs->cuGraphicsGLRegisterImage(&cudaResource, tex, GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD);
+        err = m_Funcs->cuGraphicsGLRegisterImage(&m_Resources[i], tex, GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD);
         if (err != CUDA_SUCCESS) {
             SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuGraphicsGLRegisterImage() failed: %d", err);
+            m_Resources[i] = 0;
+            unregisterTextures();
             goto Exit;
         }
+    }
 
-        // Map it to allow us to use it as a copy destination
-        err = funcs->cuGraphicsMapResources(1, &cudaResource, devCtx->stream);
-        if (err != CUDA_SUCCESS) {
-            SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuGraphicsMapResources() failed: %d", err);
-            funcs->cuGraphicsUnregisterResource(cudaResource);
-            goto Exit;
+Exit:
+    {
+        CUcontext dummy;
+        m_Funcs->cuCtxPopCurrent(&dummy);
+    }
+    return err == CUDA_SUCCESS;
+}
+
+void CUDAGLInteropHelper::unregisterTextures()
+{
+    int err;
+
+    if (m_Funcs == nullptr) {
+        // CUDA was never loaded (already logged in constructor), so nothing is registered
+        return;
+    }
+
+    // Push FFmpeg's CUDA context to use for our CUDA operations
+    err = m_Funcs->cuCtxPushCurrent(m_Context->cuda_ctx);
+    if (err != CUDA_SUCCESS) {
+        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuCtxPushCurrent() failed: %d", err);
+        return; // NOTE: any registered resources are left registered on this path
+    }
+
+    for (int i = 0; i < NV12_PLANES; i++) {
+        if (m_Resources[i] != 0) { // skip slots that are not currently registered
+            m_Funcs->cuGraphicsUnregisterResource(m_Resources[i]); // best effort: result is not checked
+            m_Resources[i] = 0; // cleared so a later call skips this slot
         }
+    }
 
+    {
+        CUcontext dummy; // receives the popped context; not used afterwards
+        m_Funcs->cuCtxPopCurrent(&dummy); // undo the cuCtxPushCurrent() above
+    }
+}
+
+bool CUDAGLInteropHelper::copyCudaFrameToTextures(AVFrame* frame)
+{
+    int err;
+
+    if (m_Funcs == nullptr) {
+        // Already logged in constructor
+        return false;
+    }
+
+    // Push FFmpeg's CUDA context to use for our CUDA operations
+    err = m_Funcs->cuCtxPushCurrent(m_Context->cuda_ctx);
+    if (err != CUDA_SUCCESS) {
+        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuCtxPushCurrent() failed: %d", err);
+        return false;
+    }
+
+    // Map our resources
+    err = m_Funcs->cuGraphicsMapResources(NV12_PLANES, m_Resources, m_Context->stream);
+    if (err != CUDA_SUCCESS) {
+        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuGraphicsMapResources() failed: %d", err);
+        goto PopCtxExit;
+    }
+
+    for (int i = 0; i < NV12_PLANES; i++) {
+        CUarray cudaArray;
+
+        // Get a pointer to the mapped array for this plane
+        err = m_Funcs->cuGraphicsSubResourceGetMappedArray(&cudaArray, m_Resources[i], 0, 0);
         if (err != CUDA_SUCCESS) {
             SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuGraphicsSubResourceGetMappedArray() failed: %d", err);
-            funcs->cuGraphicsUnmapResources(1, &cudaResource, devCtx->stream);
-            funcs->cuGraphicsUnregisterResource(cudaResource);
-            goto Exit;
+            goto UnmapExit;
         }
 
+        // Do the copy
         CUDA_MEMCPY2D cu2d = {
             .srcMemoryType = CU_MEMORYTYPE_DEVICE,
             .srcDevice = (CUdeviceptr)frame->data[i],
@@ -132,27 +195,19 @@ bool CUDARenderer::copyCudaFrameToBoundTexture(AVFrame* frame)
             .WidthInBytes = (size_t)frame->width,
             .Height = (size_t)frame->height >> i
         };
-
-        // Do the copy
-        err = funcs->cuMemcpy2D(&cu2d);
+        err = m_Funcs->cuMemcpy2D(&cu2d);
         if (err != CUDA_SUCCESS) {
             SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "cuMemcpy2D() failed: %d", err);
-            funcs->cuGraphicsUnmapResources(1, &cudaResource, devCtx->stream);
-            funcs->cuGraphicsUnregisterResource(cudaResource);
-            goto Exit;
+            goto UnmapExit;
         }
-
-        funcs->cuGraphicsUnmapResources(1, &cudaResource, devCtx->stream);
-        funcs->cuGraphicsUnregisterResource(cudaResource);
     }
 
-    ret = true;
-
-Exit:
+UnmapExit:
+    m_Funcs->cuGraphicsUnmapResources(NV12_PLANES, m_Resources, m_Context->stream);
+PopCtxExit:
     {
         CUcontext dummy;
-        funcs->cuCtxPopCurrent(&dummy);
+        m_Funcs->cuCtxPopCurrent(&dummy);
     }
-    return ret;
+    return err == CUDA_SUCCESS;
 }
-
diff --git a/app/streaming/video/ffmpeg-renderers/cuda.h b/app/streaming/video/ffmpeg-renderers/cuda.h
index 7a0f5af7..086abcbf 100644
--- a/app/streaming/video/ffmpeg-renderers/cuda.h
+++ b/app/streaming/video/ffmpeg-renderers/cuda.h
@@ -2,6 +2,12 @@
 
 #include "renderer.h"
 
+#include <ffnvcodec/dynlink_loader.h>
+
+extern "C" {
+    #include <libavutil/hwcontext_cuda.h>
+}
+
 class CUDARenderer : public IFFmpegRenderer {
 public:
     CUDARenderer();
@@ -12,10 +18,25 @@ public:
     virtual bool needsTestFrame() override;
     virtual bool isDirectRenderingSupported() override;
 
-    // Helper function used by SDLRenderer to read our CUDA frame
-    static bool copyCudaFrameToBoundTexture(AVFrame* frame);
-
 private:
     AVBufferRef* m_HwContext;
 };
 
+#define NV12_PLANES 2 // NV12 has 2 planes
+
+// Helper class used by SdlRenderer to copy CUDA frames into GL textures registered with CUDA
+class CUDAGLInteropHelper {
+public:
+    CUDAGLInteropHelper(AVHWDeviceContext* context); // stores context->hwctx and loads the CUDA function table
+    ~CUDAGLInteropHelper(); // unregisters textures, then frees the CUDA function table
+
+    bool registerBoundTextures(); // returns false on failure, leaving no textures registered
+    void unregisterTextures(); // does nothing for slots that are not registered
+
+    bool copyCudaFrameToTextures(AVFrame* frame); // returns false if CUDA is unavailable or a CUDA call fails
+
+private:
+    CudaFunctions* m_Funcs; // null if the CUDA library could not be loaded
+    AVCUDADeviceContext* m_Context; // taken from the AVHWDeviceContext passed to the constructor
+    CUgraphicsResource m_Resources[NV12_PLANES]; // one per plane; 0 means not registered
+};
diff --git a/app/streaming/video/ffmpeg-renderers/sdlvid.cpp b/app/streaming/video/ffmpeg-renderers/sdlvid.cpp
index 3048d8d2..27f4f0bb 100644
--- a/app/streaming/video/ffmpeg-renderers/sdlvid.cpp
+++ b/app/streaming/video/ffmpeg-renderers/sdlvid.cpp
@@ -5,20 +5,26 @@
 
 #include <Limelight.h>
 
-#ifdef HAVE_CUDA
-#include "cuda.h"
-#endif
-
 SdlRenderer::SdlRenderer()
     : m_Renderer(nullptr),
       m_Texture(nullptr),
       m_SwPixelFormat(AV_PIX_FMT_NONE)
 {
     SDL_zero(m_OverlayTextures);
+
+#ifdef HAVE_CUDA
+    m_CudaGLHelper = nullptr; // allocated later by renderFrame() for CUDA frames
+#endif
 }
 
 SdlRenderer::~SdlRenderer()
 {
+#ifdef HAVE_CUDA
+    if (m_CudaGLHelper != nullptr) {
+        delete m_CudaGLHelper;
+    }
+#endif
+
     for (int i = 0; i < Overlay::OverlayMax; i++) {
         if (m_OverlayTextures[i] != nullptr) {
             SDL_DestroyTexture(m_OverlayTextures[i]);
@@ -208,6 +214,7 @@ void SdlRenderer::renderFrame(AVFrame* frame)
     }
 
     if (frame->hw_frames_ctx != nullptr && frame->format != AV_PIX_FMT_CUDA) {
+ReadbackRetry:
         // If we are acting as the frontend for a hardware
         // accelerated decoder, we'll need to read the frame
         // back to render it.
@@ -293,13 +300,28 @@ void SdlRenderer::renderFrame(AVFrame* frame)
                          SDL_GetError());
             goto Exit;
         }
+
+#ifdef HAVE_CUDA
+        if (frame->format == AV_PIX_FMT_CUDA) {
+            SDL_assert(m_CudaGLHelper == nullptr);
+            m_CudaGLHelper = new CUDAGLInteropHelper(((AVHWFramesContext*)frame->hw_frames_ctx->data)->device_ctx);
+
+            SDL_GL_BindTexture(m_Texture, nullptr, nullptr);
+            if (!m_CudaGLHelper->registerBoundTextures()) {
+                // If we can't register textures, fall back to normal read-back rendering
+                delete m_CudaGLHelper;
+                m_CudaGLHelper = nullptr;
+            }
+            SDL_GL_UnbindTexture(m_Texture);
+        }
+#endif
     }
 
     if (frame->format == AV_PIX_FMT_CUDA) {
 #ifdef HAVE_CUDA
-        SDL_GL_BindTexture(m_Texture, nullptr, nullptr);
-        CUDARenderer::copyCudaFrameToBoundTexture(frame);
-        SDL_GL_UnbindTexture(m_Texture);
+        if (m_CudaGLHelper == nullptr || !m_CudaGLHelper->copyCudaFrameToTextures(frame)) {
+            goto ReadbackRetry;
+        }
 #else
         SDL_LogError(SDL_LOG_CATEGORY_APPLICATION,
                      "Got CUDA frame, but not built with CUDA support!");
diff --git a/app/streaming/video/ffmpeg-renderers/sdlvid.h b/app/streaming/video/ffmpeg-renderers/sdlvid.h
index 819435c3..9c827ce6 100644
--- a/app/streaming/video/ffmpeg-renderers/sdlvid.h
+++ b/app/streaming/video/ffmpeg-renderers/sdlvid.h
@@ -2,6 +2,10 @@
 
 #include "renderer.h"
 
+#ifdef HAVE_CUDA
+#include "cuda.h"
+#endif
+
 class SdlRenderer : public IFFmpegRenderer {
 public:
     SdlRenderer();
@@ -21,5 +25,9 @@ private:
     int m_SwPixelFormat;
     SDL_Texture* m_OverlayTextures[Overlay::OverlayMax];
     SDL_Rect m_OverlayRects[Overlay::OverlayMax];
+
+#ifdef HAVE_CUDA
+    CUDAGLInteropHelper* m_CudaGLHelper;
+#endif
 };