如何从 C# 执行 HLSL？答案

【问题标题】：How to execute HLSL from C#?如何从 C# 执行 HLSL？
【发布时间】：2020-11-26 14:32:34
【问题描述】：

所以我在 Visual Studio 2019 社区中编写了一个 C# 程序，但是对于某些操作，我希望它在 GPU 而不是 CPU 上运行。
我对 HLSL 有一点经验，例如为一些 Unity 项目编写过计算着色器，但是我（通过 Google）实在找不到任何在不使用 Unity 的 C# 程序中执行 HLSL 代码的方法。

假设我对“内核”（kernel）一词的理解是正确的（即它在 Unity 中用于运行计算着色器的方式），具体来说，我希望能够：
1：从CPU向着色器内核的缓冲区写入一些数据，
2：运行内核一定次数，
3：让CPU从内核读取一些缓冲区。

举个我想要的例子，下面是我如何获得 C# 代码以使用 UnityEngine 运行 HLSL 内核：
（例如：在 C# 中它会生成一些从 -1 到 1 的随机数，然后在着色器中将每个条目乘以 4）
C#

using UnityEngine;

public class Test : MonoBehaviour
{
    ComputeBuffer buffer;
    public ComputeShader shader; //Has been set to reference the shader in Unity

    /// <summary>
    /// Fills a 4x4 grid with random integers, lets kernel 0 of <see cref="shader"/>
    /// process it on the GPU, reads the result back and prints it one row per line.
    /// </summary>
    void Start ()
    {
        const int side = 4;       // The grid is side x side elements
        const int groupSide = 2;  // Thread-group edge length; the kernel declares numthreads(2,2,1)

        // Random.Range's int overload excludes the upper bound, so this yields -1, 0 or 1
        int[] values = new int[side * side];
        int next = 0;
        while (next < values.Length)
        {
            values[next] = Random.Range(-1, 2);
            next++;
        }

        // Allocate the GPU buffer and expose it to kernel 0 under the name "Result"
        buffer = new ComputeBuffer(values.Length, sizeof(int));
        shader.SetBuffer(0, "Result", buffer);

        // Upload, run (side / groupSide) groups along x and y, then download into the same array
        buffer.SetData(values);
        shader.Dispatch(0, side / groupSide, side / groupSide, 1);
        buffer.GetData(values);

        // The GPU buffer is no longer needed once the data is back on the CPU
        buffer.Dispose();

        // Print four comma-separated values per line
        for (int rowStart = 0; rowStart < values.Length; rowStart += 4)
        {
            print(string.Join(",", values[rowStart], values[rowStart + 1], values[rowStart + 2], values[rowStart + 3]));
        }
    }
}

HLSL

#pragma kernel CSMain

// Read/write buffer that is scaled in place.
RWStructuredBuffer<int> Result;

// Each thread multiplies exactly one element by 4.
// The 2D dispatch thread id is flattened using a fixed row width of 4 elements.
[numthreads(2,2,1)]
void CSMain (uint3 id : SV_DispatchThreadID)
{
    const uint rowWidth = 4;
    uint index = id.y * rowWidth + id.x;
    Result[index] *= 4;
}

输出：

-4, 4, -4, 0
0, 4, 0, 0
-4, -4, -4, 4
4, 0, 4, 4

编辑：我尝试过的事情： 2 周后，虽然没有人回答这个问题，但我一直在寻找解决方案。仍在搜索，但想用我目前看到的一些方法来更新它：
（请注意，例如，我没有仔细研究其中的一些，因为它们似乎不是我想要的）

Microsoft 似乎在编译 Compute Shaders 上有一个 official how-to，但它适用于 C++。如果我在这方面是正确的，那么可能会使用它来创建一个 DLL 以在 C# 中使用。不过，我对这种方法的主要问题是它需要我没有的 C++ 知识。不过我确实打算去看看。
Compute Sharp 是一个 NuGet 包，声称能够获取 C# 代码，将其转换为 HLSL 代码，并在 GPU 上运行。然而，这似乎不是我想要的，因为它接受的不是 HLSL 代码，而是 C# 代码。不过这只是个小缺点，所以我还是决定试一试；但是尝试在 NuGet 上安装最新版本（1.3.1，同时也是最新的稳定版本）时，出现了错误“无法安装包 [...]。您正在尝试将此包安装到以 [.NetFramework 版本 4.7.2] 为目标的项目中，但该包不包含任何与该框架兼容的程序集引用或内容文件。[...]”。因此我决定先检查其他方法。

SharpDX 似乎是一个 NuGet 包，可让您在 C# .NET 项目中使用 HLSL 代码。使用 SharpDX.Direct3D11，您甚至可以创建计算着色器。它看起来正是我想要的，这也是我对它的关注多于其他方法的原因。不过我遇到了一个大问题：SharpDX 似乎没有任何形式的使用教程或说明。我以一个（据谷歌翻译所说）使用 SharpDX 的计算着色器的日文示例（Japanese example of a Compute Shader using SharpDX）作为蓝本，总算做出了一个能运行的东西。但我做出来的东西有两个问题：1) 有些部分我根本不明白它们在做什么，只知道没有它们就无法工作；2) 一些看起来完全合理的改动（比如增加第二个缓冲区）会以奇怪的方式把它弄坏。再加上结构化缓冲区似乎必须通过它们在已编译着色器中的位置（而不是它们编译前的位置）来引用，这看起来很糟糕，而且以我有限的经验来看，用起来也不友好。如果有人知道如何正确使用该软件包，我想我遇到的大多数问题都很容易解决，但这又回到了原来的问题：我实际上找不到多少关于如何使用该软件包的教程、说明或示例。如果有人好奇，下面是我用 SharpDX 写的代码，它生成 4 个数字 0、1、2、3，并在着色器中将它们乘以 4：

using System;
using SharpDX;
using SharpDX.Direct3D;
using SharpDX.Direct3D11;
using SharpDX.D3DCompiler;
//Based on https://gist.github.com/oguna/624969e732a868ec17f05694012c1b63

namespace C_Sharp_Shader_test
{
    /// <summary>
    /// Minimal SharpDX / Direct3D 11 compute example: uploads the integers
    /// 0..totalSize-1 into a structured buffer, runs the CSMain kernel from
    /// Shader.hlsl over it, and prints the values before and after.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Program entry point. Command-line arguments are not used.
        /// </summary>
        static void Main(string[] args)
        {
            const int groupSize = 2; //Needs to match [numthreads] in the shader
            const int totalSize = 4; //Number of elements; the dispatch below rounds up, so it need not be a multiple of groupSize
            const int elementByteSize = sizeof(int); //The size of a single element of the input-data in bytes
            const int bufferByteSize = elementByteSize * totalSize;

            int[] outputData;

            //Create device and compute shader; 'using' releases them even if a later step throws
            using (Device device = new Device(DriverType.Hardware, DeviceCreationFlags.SingleThreaded))
            using (ComputeShader cs = CreateComputeShader(device))
            {
                //Create input data (0,1,2,3) and echo it
                int[] inputData = new int[totalSize];
                for (int i = 0; i < inputData.Length; i++)
                {
                    inputData[i] = i;
                }
                for (int i = 0; i < inputData.Length; i++)
                {
                    Console.WriteLine(inputData[i]);
                }
                Console.WriteLine("");

                //GPU-side buffer: only the GPU reads/writes it, through an unordered access view
                BufferDescription gpuDesc = new BufferDescription()
                {
                    SizeInBytes = bufferByteSize,
                    Usage = ResourceUsage.Default,
                    BindFlags = BindFlags.UnorderedAccess,
                    OptionFlags = ResourceOptionFlags.BufferStructured,
                    StructureByteStride = elementByteSize,
                    CpuAccessFlags = CpuAccessFlags.None
                };
                //Staging buffer: the portable way to get results back to the CPU.
                //Mapping a Default-usage buffer directly only works on hardware that supports it.
                BufferDescription stagingDesc = new BufferDescription()
                {
                    SizeInBytes = bufferByteSize,
                    Usage = ResourceUsage.Staging,
                    BindFlags = BindFlags.None,
                    OptionFlags = ResourceOptionFlags.None,
                    StructureByteStride = elementByteSize,
                    CpuAccessFlags = CpuAccessFlags.Read
                };
                //View that exposes the whole GPU buffer to the shader's RWStructuredBuffer
                UnorderedAccessViewDescription uavDesc = new UnorderedAccessViewDescription()
                {
                    Format = SharpDX.DXGI.Format.Unknown, //Structured buffers use an unknown format
                    Dimension = UnorderedAccessViewDimension.Buffer,
                    Buffer = new UnorderedAccessViewDescription.BufferResource()
                    {
                        FirstElement = 0,
                        ElementCount = totalSize
                    }
                };

                using (SharpDX.Direct3D11.Buffer buffer = SharpDX.Direct3D11.Buffer.Create(device, inputData, gpuDesc))
                using (SharpDX.Direct3D11.Buffer staging = new SharpDX.Direct3D11.Buffer(device, stagingDesc))
                using (UnorderedAccessView uav = new UnorderedAccessView(device, buffer, uavDesc))
                {
                    //Set up shader and bind its only resource (the RWStructuredBuffer) to UAV slot 0.
                    //No constant buffer or shader resource view is bound: the shader declares neither.
                    DeviceContext context = device.ImmediateContext;
                    context.ComputeShader.Set(cs);
                    context.ComputeShader.SetUnorderedAccessView(0, uav);

                    //Execute shader
                    int threadGroupCount = (totalSize + groupSize - 1) / groupSize; // +groupSize-1 to round up
                    context.Dispatch(threadGroupCount, 1, 1);

                    //Copy the result into the staging buffer and read it back on the CPU
                    context.CopyResource(buffer, staging);
                    DataStream ds;
                    context.MapSubresource(staging, MapMode.Read, MapFlags.None, out ds);
                    try
                    {
                        outputData = ds.ReadRange<int>(totalSize);
                    }
                    finally
                    {
                        //Every successful map must be paired with an unmap
                        context.UnmapSubresource(staging, 0);
                    }

                    //Unbind everything before the resources are released
                    context.ClearState();
                }
            }

            //Print values
            for (int i = 0; i < outputData.Length; i++)
            {
                Console.WriteLine(outputData[i]);
            }
            //Wait so it doesn't close the console immediately.
            Console.ReadKey();
        }

        /// <summary>
        /// Compiles the CSMain entry point of Shader.hlsl (profile cs_5_0) and creates the
        /// compute shader on <paramref name="device"/>. Shader.hlsl must be copied to the
        /// output directory for this to work.
        /// </summary>
        private static ComputeShader CreateComputeShader(Device device)
        {
            using (CompilationResult bytecode = ShaderBytecode.CompileFromFile("Shader.hlsl", "CSMain", "cs_5_0"))
            {
                return new ComputeShader(device, bytecode);
            }
        }
    }
}

HLSL

// Read/write buffer that is scaled in place.
RWStructuredBuffer<int> Result;

// Each thread multiplies the element at its x dispatch index by 4.
[numthreads(2, 1, 1)]
void CSMain(uint3 id : SV_DispatchThreadID)
{
    uint index = id.x;
    Result[index] *= 4;
}

输出：

0
1
2
3

0
4
8
12

【问题讨论】：

Compute Sharp 包仅适用于 .net Core >= 3.0 版的 Windows。

标签： c# visual-studio hlsl compute-shader

【解决方案1】：

通过 SharpDX 在 DirectX 中使用计算的简单示例（未优化，仅 POC）。实际的着色器应该是类似的......

关于使用 DirectX 和 HLSL（包括计算）进行编程的一个很好的来源 (imo) 是 'Introduction to 3D Game programming with DirectX' (Frank D. Luna, isbn 978-1-942270-06-5)

using SharpDX;
using SharpDX.D3DCompiler;
using SharpDX.Direct3D;
using SharpDX.Direct3D11;
using System;
using System.Diagnostics;
using Buffer = SharpDX.Direct3D11.Buffer;

namespace GpGpuDemo.Backend

{
  /// <summary>
  /// Evaluates c[i] = a[i] * b[i] + a[i] on the GPU with a Direct3D 11 compute shader
  /// (via SharpDX) and copies the result back to the CPU after every timed run.
  /// </summary>
  public class DirectComputeCalculatorWithReadBackSharpDx : IParallelCalculator
  {
    public string Description => "GPU-accelerated via SharpDX/DirectCompute (with readback) ";

    /// <summary>
    /// Runs the kernel over <paramref name="arrayA"/> and <paramref name="arrayB"/> and writes
    /// the result into <paramref name="arrayC"/>. Five timed runs are performed; each run
    /// dispatches the kernel ten times, reads the output back and reports the elapsed time.
    /// </summary>
    /// <param name="arrayA">First input; its length determines the number of elements processed.</param>
    /// <param name="arrayB">Second input; must hold at least as many elements as <paramref name="arrayA"/>.</param>
    /// <param name="arrayC">Receives the result; at most <c>arrayA.Length</c> elements are written.</param>
    /// <param name="Report">Callback that receives one timing message per run.</param>
    /// <exception cref="ArgumentNullException">Any argument is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="arrayB"/> is shorter than <paramref name="arrayA"/>.</exception>
    public unsafe void Calculate(float[] arrayA, float[] arrayB, float[] arrayC, Action<string> Report)
    {
        if (arrayA == null)
        {
            throw new ArgumentNullException(nameof(arrayA));
        }
        if (arrayB == null)
        {
            throw new ArgumentNullException(nameof(arrayB));
        }
        if (arrayC == null)
        {
            throw new ArgumentNullException(nameof(arrayC));
        }
        if (Report == null)
        {
            throw new ArgumentNullException(nameof(Report));
        }

        int count = arrayA.Length;
        if (arrayB.Length < count)
        {
            throw new ArgumentException("arrayB must contain at least as many elements as arrayA.", nameof(arrayB));
        }
        if (count == 0)
        {
            // Nothing to compute, and Direct3D cannot create zero-sized buffers.
            return;
        }

        var sw = new Stopwatch();

        const int threadGroupSize = 128;
        string DCShaderSource = @"
        
            StructuredBuffer<float> a;
            StructuredBuffer<float> b;
            RWStructuredBuffer<float> c;
            [numthreads(" + threadGroupSize.ToString() + @",1,1)]
            void VectorAdd(uint3 threadId : SV_DispatchThreadID)
            {
                    uint index = threadId.x;
                    c[index] = a[index] * b[index]+a[index];                                          
            }
        ";

        // Round up so a trailing partial group is still processed.
        int threadGroupCount = (count + threadGroupSize - 1) / threadGroupSize;
        int bufferByteSize = count * sizeof(float);
        // Never copy more than the staging buffer holds or than arrayC can take.
        int readBackByteSize = Math.Min(count, arrayC.Length) * sizeof(float);

        // description for input buffers
        var inputBufferDescription = new BufferDescription
        {
            BindFlags = BindFlags.ShaderResource,
            OptionFlags = ResourceOptionFlags.BufferStructured,
            Usage = ResourceUsage.Dynamic,
            CpuAccessFlags = CpuAccessFlags.Write,
            SizeInBytes = bufferByteSize,
            StructureByteStride = sizeof(float)
        };

        // Description for the output buffer itself, and the view required to bind it to the pipeline.
        var outputBufferDescription = new BufferDescription
        {
            BindFlags = BindFlags.UnorderedAccess,
            OptionFlags = ResourceOptionFlags.BufferStructured,
            Usage = ResourceUsage.Default,
            CpuAccessFlags = CpuAccessFlags.None,
            SizeInBytes = bufferByteSize,
            StructureByteStride = sizeof(float)
        };
        var outputViewDescription = new UnorderedAccessViewDescription
        {
            Buffer = new UnorderedAccessViewDescription.BufferResource() { FirstElement = 0, Flags = UnorderedAccessViewBufferFlags.None, ElementCount = count },

            Format = SharpDX.DXGI.Format.Unknown,
            Dimension = UnorderedAccessViewDimension.Buffer
        };

        // CPU-readable buffer the output is copied into before it is mapped.
        var stagingBufferDescription = new BufferDescription
        {
            BindFlags = BindFlags.None,
            CpuAccessFlags = CpuAccessFlags.Read,
            OptionFlags = ResourceOptionFlags.BufferStructured,
            SizeInBytes = bufferByteSize,
            StructureByteStride = sizeof(float),
            Usage = ResourceUsage.Staging,
        };

        // Every Direct3D object is released when this chain of using-statements exits,
        // including when an exception is thrown part-way through.
        using (var device = new Device(DriverType.Hardware, DeviceCreationFlags.None))
        using (var computeShaderCode = ShaderBytecode.Compile(DCShaderSource, "VectorAdd", "cs_5_0", ShaderFlags.None, EffectFlags.None))
        using (var computeShader = new ComputeShader(device, computeShaderCode))
        using (var stagingBuffer = new Buffer(device, stagingBufferDescription))
        using (var outputBuffer = new Buffer(device, outputBufferDescription))
        using (var outputView = new UnorderedAccessView(device, outputBuffer, outputViewDescription))
        // Buffer.Create uploads straight from the managed array, so no pointer into the
        // array is held after the upload has completed.
        using (var inputA = Buffer.Create(device, arrayA, inputBufferDescription))
        using (var inputAView = new ShaderResourceView(device, inputA))
        using (var inputB = Buffer.Create(device, arrayB, inputBufferDescription))
        using (var inputBView = new ShaderResourceView(device, inputB))
        {
            var context = device.ImmediateContext;
            context.ComputeShader.Set(computeShader);
            context.ComputeShader.SetUnorderedAccessView(0, outputView);
            context.ComputeShader.SetShaderResource(0, inputAView);
            context.ComputeShader.SetShaderResource(1, inputBView);

            for (int i = 0; i < 5; i++)
            {
                sw.Restart();
                for (int teller = 0; teller < 10; teller++)
                {
                    context.Dispatch(threadGroupCount, 1, 1);
                }
                context.CopyResource(outputBuffer, stagingBuffer);
                DataStream result;
                context.MapSubresource(stagingBuffer, MapMode.Read, MapFlags.None, out result);
                try
                {
                    fixed (float* cAddress = arrayC)
                    {
                        result.Read((IntPtr)cAddress, 0, readBackByteSize);
                    }
                }
                finally
                {
                    // Every successful map must be paired with an unmap.
                    context.UnmapSubresource(stagingBuffer, 0);
                }
                sw.Stop();
                var s = sw.Elapsed;

                Report($"Operation finished in {s.Minutes} minutes, {s.Seconds} seconds, {s.Milliseconds} milliseconds");
            }

            // Unbind everything before the resources are released.
            context.ClearState();
        }
    }
  }
}

您可以mail me 获取完整的工作解决方案，比较 CPU 上的执行情况（单线程和多线程，在 OpenCL 和 DirectCompute 中）

【讨论】：