From 8ba814efb295f0b8494b3679c484c7ceab31c392 Mon Sep 17 00:00:00 2001
From: ameerj <52414509+ameerj@users.noreply.github.com>
Date: Fri, 28 May 2021 21:24:52 -0400
Subject: [PATCH] glsl: Better Storage access and wip warps

---
 .../backend/glsl/emit_context.cpp             |  4 ++
 .../backend/glsl/emit_glsl.cpp                |  5 +-
 .../glsl/emit_glsl_context_get_set.cpp        | 16 +++--
 .../backend/glsl/emit_glsl_instructions.h     | 26 +++++---
 .../backend/glsl/emit_glsl_integer.cpp        |  4 +-
 .../backend/glsl/emit_glsl_memory.cpp         | 60 +++++++++++++------
 .../glsl/emit_glsl_not_implemented.cpp        | 27 ---------
 .../backend/glsl/emit_glsl_warp.cpp           | 53 ++++++++++++++++
 8 files changed, 133 insertions(+), 62 deletions(-)

diff --git a/src/shader_recompiler/backend/glsl/emit_context.cpp b/src/shader_recompiler/backend/glsl/emit_context.cpp
index 3530e89e5..db62ba73b 100644
--- a/src/shader_recompiler/backend/glsl/emit_context.cpp
+++ b/src/shader_recompiler/backend/glsl/emit_context.cpp
@@ -122,6 +122,10 @@ void EmitContext::SetupExtensions(std::string&) {
             header += "#extension GL_AMD_gpu_shader_half_float : enable\n";
         }
     }
+    if (info.uses_subgroup_invocation_id || info.uses_subgroup_mask || info.uses_subgroup_vote ||
+        info.uses_subgroup_shuffles || info.uses_fswzadd) {
+        header += "#extension GL_ARB_shader_ballot : enable\n";
+    }
 }
 
 void EmitContext::DefineConstantBuffers(Bindings& bindings) {
diff --git a/src/shader_recompiler/backend/glsl/emit_glsl.cpp b/src/shader_recompiler/backend/glsl/emit_glsl.cpp
index 992e4b82e..800de58b7 100644
--- a/src/shader_recompiler/backend/glsl/emit_glsl.cpp
+++ b/src/shader_recompiler/backend/glsl/emit_glsl.cpp
@@ -183,8 +183,11 @@ std::string EmitGLSL(const Profile& profile, const RuntimeInfo& runtime_info, IR
     for (size_t index = 0; index < ctx.reg_alloc.num_used_registers; ++index) {
         ctx.header += fmt::format("{} R{};", ctx.reg_alloc.reg_types[index], index);
     }
-    // TODO: track CC usage
+    // TODO: track carry usage so the variable is only declared when needed
     ctx.header += "uint carry;";
+    if (program.info.uses_subgroup_shuffles) {
+        ctx.header += "bool shfl_in_bounds;\n";
+    }
     ctx.code.insert(0, ctx.header);
     ctx.code += "}";
     fmt::print("\n{}\n", ctx.code);
diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp b/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp
index 67d308c49..2286177a7 100644
--- a/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp
+++ b/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp
@@ -20,22 +20,26 @@ char OffsetSwizzle(u32 offset) {
 }
 } // namespace
 
-void EmitGetCbufU8([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] const IR::Value& binding,
+void EmitGetCbufU8([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] IR::Inst& inst,
+                   [[maybe_unused]] const IR::Value& binding, // stub: body only throws
                    [[maybe_unused]] const IR::Value& offset) {
     throw NotImplementedException("GLSL");
 }
 
-void EmitGetCbufS8([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] const IR::Value& binding,
+void EmitGetCbufS8([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] IR::Inst& inst,
+                   [[maybe_unused]] const IR::Value& binding, // stub: body only throws
                    [[maybe_unused]] const IR::Value& offset) {
     throw NotImplementedException("GLSL");
 }
 
-void EmitGetCbufU16([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] const IR::Value& binding,
+void EmitGetCbufU16([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] IR::Inst& inst,
+                    [[maybe_unused]] const IR::Value& binding, // stub: body only throws
                     [[maybe_unused]] const IR::Value& offset) {
     throw NotImplementedException("GLSL");
 }
 
-void EmitGetCbufS16([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] const IR::Value& binding,
+void EmitGetCbufS16([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] IR::Inst& inst,
+                    [[maybe_unused]] const IR::Value& binding, // stub: body only throws
                     [[maybe_unused]] const IR::Value& offset) {
     throw NotImplementedException("GLSL");
 }
@@ -151,4 +155,8 @@ void EmitSetFragColor(EmitContext& ctx, u32 index, u32 component, std::string_vi
     ctx.Add("frag_color{}.{}={};", index, swizzle, value);
 }
 
+void EmitLocalInvocationId(EmitContext& ctx, IR::Inst& inst) {
+    ctx.AddU32x3("{}=gl_LocalInvocationID;", inst); // emits: <inst> = the uvec3 GLSL built-in
+}
+
 } // namespace Shader::Backend::GLSL
diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_instructions.h b/src/shader_recompiler/backend/glsl/emit_glsl_instructions.h
index b54fe684e..07408d9e9 100644
--- a/src/shader_recompiler/backend/glsl/emit_glsl_instructions.h
+++ b/src/shader_recompiler/backend/glsl/emit_glsl_instructions.h
@@ -52,10 +52,14 @@ void EmitSetGotoVariable(EmitContext& ctx);
 void EmitGetGotoVariable(EmitContext& ctx);
 void EmitSetIndirectBranchVariable(EmitContext& ctx);
 void EmitGetIndirectBranchVariable(EmitContext& ctx);
-void EmitGetCbufU8(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset);
-void EmitGetCbufS8(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset);
-void EmitGetCbufU16(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset);
-void EmitGetCbufS16(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset);
+void EmitGetCbufU8(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
+                   const IR::Value& offset);
+void EmitGetCbufS8(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
+                   const IR::Value& offset);
+void EmitGetCbufU16(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
+                    const IR::Value& offset);
+void EmitGetCbufS16(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
+                    const IR::Value& offset);
 void EmitGetCbufU32(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
                     const IR::Value& offset);
 void EmitGetCbufF32(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
@@ -83,7 +87,7 @@ void EmitSetSFlag(EmitContext& ctx);
 void EmitSetCFlag(EmitContext& ctx);
 void EmitSetOFlag(EmitContext& ctx);
 void EmitWorkgroupId(EmitContext& ctx);
-void EmitLocalInvocationId(EmitContext& ctx);
+void EmitLocalInvocationId(EmitContext& ctx, IR::Inst& inst);
 void EmitInvocationId(EmitContext& ctx);
 void EmitSampleId(EmitContext& ctx);
 void EmitIsHelperInvocation(EmitContext& ctx);
@@ -109,10 +113,14 @@ void EmitWriteGlobalS16(EmitContext& ctx);
 void EmitWriteGlobal32(EmitContext& ctx, std::string_view address, std::string_view value);
 void EmitWriteGlobal64(EmitContext& ctx, std::string_view address, std::string_view value);
 void EmitWriteGlobal128(EmitContext& ctx, std::string_view address, std::string_view value);
-void EmitLoadStorageU8(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset);
-void EmitLoadStorageS8(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset);
-void EmitLoadStorageU16(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset);
-void EmitLoadStorageS16(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset);
+void EmitLoadStorageU8(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
+                       const IR::Value& offset);
+void EmitLoadStorageS8(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
+                       const IR::Value& offset);
+void EmitLoadStorageU16(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
+                        const IR::Value& offset);
+void EmitLoadStorageS16(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
+                        const IR::Value& offset);
 void EmitLoadStorage32(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
                        const IR::Value& offset);
 void EmitLoadStorage64(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_integer.cpp b/src/shader_recompiler/backend/glsl/emit_glsl_integer.cpp
index 34f880f1b..07e1a4b51 100644
--- a/src/shader_recompiler/backend/glsl/emit_glsl_integer.cpp
+++ b/src/shader_recompiler/backend/glsl/emit_glsl_integer.cpp
@@ -156,12 +156,12 @@ void EmitBitwiseNot32(EmitContext& ctx, IR::Inst& inst, std::string_view value)
 
 void EmitFindSMsb32([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] IR::Inst& inst,
                     [[maybe_unused]] std::string_view value) {
-    throw NotImplementedException("GLSL Instruction");
+    ctx.AddU32("{}=findMSB(int({}));", inst, value); // int cast selects the signed findMSB overload
 }
 
 void EmitFindUMsb32([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] IR::Inst& inst,
                     [[maybe_unused]] std::string_view value) {
-    throw NotImplementedException("GLSL Instruction");
+    ctx.AddU32("{}=findMSB(uint({}));", inst, value); // uint cast selects the unsigned overload
 }
 
 void EmitSMin32(EmitContext& ctx, IR::Inst& inst, std::string_view a, std::string_view b) {
diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_memory.cpp b/src/shader_recompiler/backend/glsl/emit_glsl_memory.cpp
index 708c9685b..09ad35e44 100644
--- a/src/shader_recompiler/backend/glsl/emit_glsl_memory.cpp
+++ b/src/shader_recompiler/backend/glsl/emit_glsl_memory.cpp
@@ -8,45 +8,55 @@
 #include "shader_recompiler/frontend/ir/value.h"
 
 namespace Shader::Backend::GLSL {
-void EmitLoadStorageU8([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] const IR::Value& binding,
+void EmitLoadStorageU8([[maybe_unused]] EmitContext& ctx, IR::Inst& inst,
+                       [[maybe_unused]] const IR::Value& binding,
                        [[maybe_unused]] const IR::Value& offset) {
-    throw NotImplementedException("GLSL Instrucion");
+    const auto offset_var{ctx.reg_alloc.Consume(offset)}; // byte offset: /4 = element, %4 = byte
+    ctx.AddU32("{}=bitfieldExtract(ssbo{}[{}/4],int({}%4)*8,8);", inst, binding.U32(), offset_var,
+               offset_var);
 }
 
-void EmitLoadStorageS8([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] const IR::Value& binding,
+void EmitLoadStorageS8([[maybe_unused]] EmitContext& ctx, IR::Inst& inst,
+                       [[maybe_unused]] const IR::Value& binding,
                        [[maybe_unused]] const IR::Value& offset) {
-    throw NotImplementedException("GLSL Instrucion");
+    const auto offset_var{ctx.reg_alloc.Consume(offset)};
+    ctx.AddS32("{}=bitfieldExtract(int(ssbo{}[{}/4]),int({}%4)*8,8);", inst, binding.U32(),
+               offset_var, offset_var); // signed operand, so the extracted byte is sign-extended
 }
 
-void EmitLoadStorageU16([[maybe_unused]] EmitContext& ctx,
+void EmitLoadStorageU16([[maybe_unused]] EmitContext& ctx, IR::Inst& inst,
                         [[maybe_unused]] const IR::Value& binding,
                         [[maybe_unused]] const IR::Value& offset) {
-    throw NotImplementedException("GLSL Instrucion");
+    const auto offset_var{ctx.reg_alloc.Consume(offset)}; // byte offset: /4 = element, /2%2 = half
+    ctx.AddU32("{}=bitfieldExtract(ssbo{}[{}/4],int(({}/2)%2)*16,16);", inst, binding.U32(),
+               offset_var, offset_var);
 }
 
-void EmitLoadStorageS16([[maybe_unused]] EmitContext& ctx,
+void EmitLoadStorageS16([[maybe_unused]] EmitContext& ctx, IR::Inst& inst,
                         [[maybe_unused]] const IR::Value& binding,
                         [[maybe_unused]] const IR::Value& offset) {
-    throw NotImplementedException("GLSL Instrucion");
+    const auto offset_var{ctx.reg_alloc.Consume(offset)};
+    ctx.AddS32("{}=bitfieldExtract(int(ssbo{}[{}/4]),int(({}/2)%2)*16,16);", inst, binding.U32(),
+               offset_var, offset_var); // signed operand, so the extracted half is sign-extended
 }
 
 void EmitLoadStorage32(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
                        const IR::Value& offset) {
     const auto offset_var{ctx.reg_alloc.Consume(offset)};
-    ctx.AddU32("{}=ssbo{}[{}];", inst, binding.U32(), offset_var);
+    ctx.AddU32("{}=ssbo{}[{}/4];", inst, binding.U32(), offset_var); // byte offset -> element
 }
 
 void EmitLoadStorage64(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
                        const IR::Value& offset) {
     const auto offset_var{ctx.reg_alloc.Consume(offset)};
-    ctx.AddU32x2("{}=uvec2(ssbo{}[{}],ssbo{}[{}+1]);", inst, binding.U32(), offset_var,
+    ctx.AddU32x2("{}=uvec2(ssbo{}[{}/4],ssbo{}[{}/4+1]);", inst, binding.U32(), offset_var, // bytes
                  binding.U32(), offset_var);
 }
 
 void EmitLoadStorage128(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
                         const IR::Value& offset) {
     const auto offset_var{ctx.reg_alloc.Consume(offset)};
-    ctx.AddU32x4("{}=uvec4(ssbo{}[{}],ssbo{}[{}+1],ssbo{}[{}+2],ssbo{}[{}+3]);", inst,
+    ctx.AddU32x4("{}=uvec4(ssbo{}[{}/4],ssbo{}[{}/4+1],ssbo{}[{}/4+2],ssbo{}[{}/4+3]);", inst, // offset in bytes
                  binding.U32(), offset_var, binding.U32(), offset_var, binding.U32(), offset_var,
                  binding.U32(), offset_var);
 }
@@ -55,47 +65,59 @@ void EmitWriteStorageU8([[maybe_unused]] EmitContext& ctx,
                         [[maybe_unused]] const IR::Value& binding,
                         [[maybe_unused]] const IR::Value& offset,
                         [[maybe_unused]] std::string_view value) {
-    throw NotImplementedException("GLSL Instrucion");
+    const auto offset_var{ctx.reg_alloc.Consume(offset)};
+    ctx.Add("ssbo{}[{}/4]=bitfieldInsert(ssbo{}[{}/4],{},int({}%4)*8,8);", binding.U32(),
+            offset_var, binding.U32(), offset_var, value, offset_var);
 }
 
 void EmitWriteStorageS8([[maybe_unused]] EmitContext& ctx,
                         [[maybe_unused]] const IR::Value& binding,
                         [[maybe_unused]] const IR::Value& offset,
                         [[maybe_unused]] std::string_view value) {
-    throw NotImplementedException("GLSL Instrucion");
+    const auto offset_var{ctx.reg_alloc.Consume(offset)}; // NOTE(review): RMW below is not atomic
+    ctx.Add("ssbo{}[{}/4]=bitfieldInsert(ssbo{}[{}/4],{},int({}%4)*8,8);", binding.U32(),
+            offset_var, binding.U32(), offset_var, value, offset_var);
 }
 
 void EmitWriteStorageU16([[maybe_unused]] EmitContext& ctx,
                          [[maybe_unused]] const IR::Value& binding,
                          [[maybe_unused]] const IR::Value& offset,
                          [[maybe_unused]] std::string_view value) {
-    throw NotImplementedException("GLSL Instrucion");
+    const auto offset_var{ctx.reg_alloc.Consume(offset)}; // NOTE(review): RMW below is not atomic
+    ctx.Add("ssbo{}[{}/4]=bitfieldInsert(ssbo{}[{}/4],{},int(({}/2)%2)*16,16);", binding.U32(),
+            offset_var, binding.U32(), offset_var, value, offset_var);
 }
 
 void EmitWriteStorageS16([[maybe_unused]] EmitContext& ctx,
                          [[maybe_unused]] const IR::Value& binding,
                          [[maybe_unused]] const IR::Value& offset,
                          [[maybe_unused]] std::string_view value) {
-    throw NotImplementedException("GLSL Instrucion");
+    const auto offset_var{ctx.reg_alloc.Consume(offset)}; // NOTE(review): RMW below is not atomic
+    ctx.Add("ssbo{}[{}/4]=bitfieldInsert(ssbo{}[{}/4],{},int(({}/2)%2)*16,16);", binding.U32(),
+            offset_var, binding.U32(), offset_var, value, offset_var);
 }
 
 void EmitWriteStorage32(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset,
                         std::string_view value) {
     const auto offset_var{ctx.reg_alloc.Consume(offset)};
-    ctx.Add("ssbo{}[{}]={};", binding.U32(), offset_var, value);
+    ctx.Add("ssbo{}[{}/4]={};", binding.U32(), offset_var, value); // byte offset -> element
 }
 
 void EmitWriteStorage64(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset,
                         std::string_view value) {
     const auto offset_var{ctx.reg_alloc.Consume(offset)};
-    ctx.Add("ssbo{}[{}]={}.x;", binding.U32(), offset_var, value);
-    ctx.Add("ssbo{}[{}+1]={}.y;", binding.U32(), offset_var, value);
+    ctx.Add("ssbo{}[{}/4]={}.x;", binding.U32(), offset_var, value); // .x at off/4, .y right after
+    ctx.Add("ssbo{}[({}/4)+1]={}.y;", binding.U32(), offset_var, value);
 }
 
 void EmitWriteStorage128([[maybe_unused]] EmitContext& ctx,
                          [[maybe_unused]] const IR::Value& binding,
                          [[maybe_unused]] const IR::Value& offset,
                          [[maybe_unused]] std::string_view value) {
-    throw NotImplementedException("GLSL Instrucion");
+    const auto offset_var{ctx.reg_alloc.Consume(offset)};
+    ctx.Add("ssbo{}[{}/4]={}.x;", binding.U32(), offset_var, value); // 4 consecutive elements
+    ctx.Add("ssbo{}[({}/4)+1]={}.y;", binding.U32(), offset_var, value);
+    ctx.Add("ssbo{}[({}/4)+2]={}.z;", binding.U32(), offset_var, value);
+    ctx.Add("ssbo{}[({}/4)+3]={}.w;", binding.U32(), offset_var, value);
 }
 } // namespace Shader::Backend::GLSL
diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_not_implemented.cpp b/src/shader_recompiler/backend/glsl/emit_glsl_not_implemented.cpp
index 3bac8899b..ec80f3cef 100644
--- a/src/shader_recompiler/backend/glsl/emit_glsl_not_implemented.cpp
+++ b/src/shader_recompiler/backend/glsl/emit_glsl_not_implemented.cpp
@@ -206,10 +206,6 @@ void EmitWorkgroupId(EmitContext& ctx) {
     NotImplemented();
 }
 
-void EmitLocalInvocationId(EmitContext& ctx) {
-    NotImplemented();
-}
-
 void EmitInvocationId(EmitContext& ctx) {
     NotImplemented();
 }
@@ -626,27 +622,4 @@ void EmitSubgroupGeMask(EmitContext& ctx) {
     NotImplemented();
 }
 
-void EmitShuffleIndex(EmitContext& ctx, IR::Inst& inst, std::string_view value,
-                      std::string_view index, std::string_view clamp,
-                      std::string_view segmentation_mask) {
-    NotImplemented();
-}
-
-void EmitShuffleUp(EmitContext& ctx, IR::Inst& inst, std::string_view value, std::string_view index,
-                   std::string_view clamp, std::string_view segmentation_mask) {
-    NotImplemented();
-}
-
-void EmitShuffleDown(EmitContext& ctx, IR::Inst& inst, std::string_view value,
-                     std::string_view index, std::string_view clamp,
-                     std::string_view segmentation_mask) {
-    NotImplemented();
-}
-
-void EmitShuffleButterfly(EmitContext& ctx, IR::Inst& inst, std::string_view value,
-                          std::string_view index, std::string_view clamp,
-                          std::string_view segmentation_mask) {
-    NotImplemented();
-}
-
 } // namespace Shader::Backend::GLSL
diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_warp.cpp b/src/shader_recompiler/backend/glsl/emit_glsl_warp.cpp
index aebdf8a3a..0a488188b 100644
--- a/src/shader_recompiler/backend/glsl/emit_glsl_warp.cpp
+++ b/src/shader_recompiler/backend/glsl/emit_glsl_warp.cpp
@@ -8,6 +8,80 @@
 #include "shader_recompiler/frontend/ir/value.h"
 
 namespace Shader::Backend::GLSL {
+namespace {
+// Copies the GLSL shfl_in_bounds flag into the GetInBoundsFromOp pseudo-operation attached to
+// inst, when there is one, and then invalidates that pseudo-operation. Call it only after the
+// statement assigning shfl_in_bounds for the current shuffle has been emitted.
+void SetInBoundsFlag(EmitContext& ctx, IR::Inst& inst) {
+    IR::Inst* const in_bounds{inst.GetAssociatedPseudoOperation(IR::Opcode::GetInBoundsFromOp)};
+    if (!in_bounds) {
+        return;
+    }
+
+    ctx.AddU1("{}=shfl_in_bounds;", *in_bounds);
+    in_bounds->Invalidate();
+}
+} // namespace
+
+// Warp shuffles built on GL_ARB_shader_ballot. With tid = gl_SubGroupInvocationARB:
+//   min_tid = tid & segmentation_mask
+//   max_tid = min_tid | (clamp & ~segmentation_mask)
+// Each emitter computes a source lane, stores in shfl_in_bounds whether that lane is in range
+// (signed comparison against max_tid) and reads value from the source lane when it is; an
+// out-of-range invocation keeps its own value.
+
+// Index mode: source lane = (index & ~segmentation_mask) | min_tid, in range when <= max_tid.
+void EmitShuffleIndex(EmitContext& ctx, IR::Inst& inst, std::string_view value,
+                      std::string_view index, std::string_view clamp,
+                      std::string_view segmentation_mask) {
+    ctx.Add("shfl_in_bounds=int(({}&~{})|(gl_SubGroupInvocationARB&{}))<=int(("
+            "gl_SubGroupInvocationARB&{})|({}&~{}));",
+            index, segmentation_mask, segmentation_mask, segmentation_mask, clamp,
+            segmentation_mask);
+    SetInBoundsFlag(ctx, inst);
+    ctx.AddU32("{}=shfl_in_bounds?readInvocationARB({},({}&~{})|(gl_SubGroupInvocationARB&{})):"
+               "{};",
+               inst, value, index, segmentation_mask, segmentation_mask, value);
+}
+
+// Up mode: source lane = tid - index, in range when >= max_tid.
+void EmitShuffleUp(EmitContext& ctx, IR::Inst& inst, std::string_view value, std::string_view index,
+                   std::string_view clamp, std::string_view segmentation_mask) {
+    ctx.Add("shfl_in_bounds=int(gl_SubGroupInvocationARB-{})>=int((gl_SubGroupInvocationARB&{})|({}"
+            "&~{}));",
+            index, segmentation_mask, clamp, segmentation_mask);
+    SetInBoundsFlag(ctx, inst);
+    ctx.AddU32("{}=shfl_in_bounds?readInvocationARB({},gl_SubGroupInvocationARB-{}):"
+               "{};",
+               inst, value, index, value);
+}
+
+// Down mode: source lane = tid + index, in range when <= max_tid.
+void EmitShuffleDown(EmitContext& ctx, IR::Inst& inst, std::string_view value,
+                     std::string_view index, std::string_view clamp,
+                     std::string_view segmentation_mask) {
+    ctx.Add("shfl_in_bounds=int(gl_SubGroupInvocationARB+{})<=int((gl_SubGroupInvocationARB&{})|({}"
+            "&~{}));",
+            index, segmentation_mask, clamp, segmentation_mask);
+    SetInBoundsFlag(ctx, inst);
+    ctx.AddU32("{}=shfl_in_bounds?readInvocationARB({},gl_SubGroupInvocationARB+{}):"
+               "{};",
+               inst, value, index, value);
+}
+
+// Butterfly mode: source lane = tid ^ index, in range when <= max_tid.
+void EmitShuffleButterfly(EmitContext& ctx, IR::Inst& inst, std::string_view value,
+                          std::string_view index, std::string_view clamp,
+                          std::string_view segmentation_mask) {
+    ctx.Add("shfl_in_bounds=int(gl_SubGroupInvocationARB^{})<=int((gl_SubGroupInvocationARB&{})|({}"
+            "&~{}));",
+            index, segmentation_mask, clamp, segmentation_mask);
+    SetInBoundsFlag(ctx, inst);
+    ctx.AddU32("{}=shfl_in_bounds?readInvocationARB({},gl_SubGroupInvocationARB^{}):"
+               "{};",
+               inst, value, index, value);
+}
+
 void EmitFSwizzleAdd([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] IR::Inst& inst,
                      [[maybe_unused]] std::string_view op_a, [[maybe_unused]] std::string_view op_b,
                      [[maybe_unused]] std::string_view swizzle) {