【问题标题】:Cudafy code results different when BlockSize 7 and BlockSize 8(当 BlockSize 为 7 和 8 时,Cudafy 代码的结果不同)
【发布时间】:2016-11-04 16:18:58
【问题描述】:

我正在使用 Cudafy.NET,但在 BlockSize 上遇到了一些困难:在某些情况下,它会产生不同的结果。简而言之,区别如下:

// Block size 7 (second argument): correct results when using this line
gpu.Launch(1, 7, "kernelfx_alldata", 10, devdata, devnmin, devnmax, devgmin, devgmax, devtest);

// Block size 8 (second argument): incorrect results when using this line
gpu.Launch(1, 8, "kernelfx_alldata", 10, devdata, devnmin, devnmax, devgmin, devgmax, devtest);

问题详细说明:

我有 10 个项目要循环。 GridSize 为 1。

案例 1:当 CudafyModes.Target = eGPUType.OpenCL 且 BlockSize 为 1、2、3、4、5、6 或 7 时,结果正确。

案例 2:当 CudafyModes.Target = eGPUType.OpenCL 且 BlockSize 为 8、9、10、11 或更大时,结果不正确。

案例 3:当 CudafyModes.Target = eGPUType.Emulator 且 BlockSize 为 1、2、3、4、5、6、7、8、9、10、11 或更大时,结果正确。

示例代码如下所示。 初始化变量:

double[,] data;
double[] nmin, nmax, gmin, gmax;

// Builds the host-side inputs: a 10x10 `data` matrix whose columns are simple
// linear functions of the row index, plus four 10-element coefficient arrays
// (nmin/nmax are constant -1/+1, gmin/gmax depend on the index).
void initializeVars()
{
    const int size = 10;

    data = new double[size, size];
    nmin = new double[size];
    nmax = new double[size];
    gmin = new double[size];
    gmax = new double[size];

    for (int row = 0; row < size; row++)
    {
        // Matrix row: one fixed formula per column.
        data[row, 0] = 100 + row;
        data[row, 1] = 32 + row;
        data[row, 2] = 22 + row;
        data[row, 3] = -20 - row;
        data[row, 4] = 5522 + 10 * row;
        data[row, 5] = 40 + row;
        data[row, 6] = 14 - row;
        data[row, 7] = 12 + row;
        data[row, 8] = -10 + row;
        data[row, 9] = 10 + 10 * row;

        // Coefficient arrays for the same index.
        nmin[row] = -1;
        nmax[row] = 1;
        gmin[row] = row;
        gmax[row] = 11 * row * row + 1;
    }
}

gpu 启动代码:

// Click handler: translates the [Cudafy] kernels for OpenCL, loads them on device 0,
// uploads the host arrays filled by initializeVars(), launches "kernelfx_alldata"
// over 10 items, and copies the per-item results back into `test`.
private void button1_Click(object sender, EventArgs e)
{
    CudafyModes.Target = eGPUType.OpenCL;
    CudafyModes.DeviceId = 0;
    CudafyTranslator.Language = eLanguage.OpenCL;
    CudafyModule km = CudafyTranslator.Cudafy();
    Cudafy.Host.GPGPU gpu = Cudafy.Host.CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);
    gpu.LoadModule(km);
    initializeVars();

    try
    {
        // Allocate a device buffer for each input and upload the host contents.
        double[,] devdata = gpu.Allocate<double>(data);
        gpu.CopyToDevice(data, devdata);
        double[] devnmin = gpu.Allocate<double>(nmin);
        gpu.CopyToDevice(nmin, devnmin);
        double[] devnmax = gpu.Allocate<double>(nmax);
        gpu.CopyToDevice(nmax, devnmax);
        double[] devgmin = gpu.Allocate<double>(gmin);
        gpu.CopyToDevice(gmin, devgmin);
        double[] devgmax = gpu.Allocate<double>(gmax);
        gpu.CopyToDevice(gmax, devgmax);

        // Output buffer: one result per item.
        double[] test = new double[10];
        double[] devtest = gpu.Allocate<double>(test);

        // Grid size 1, block size 8: the configuration reported as giving
        // incorrect results on the OpenCL target.
        gpu.Launch(1, 8, "kernelfx_alldata", 10, devdata, devnmin,
               devnmax, devgmin, devgmax,  devtest);
        gpu.CopyFromDevice(devtest, test);
    }
    finally
    {
        // Release device allocations even if an allocation, copy or launch throws.
        gpu.FreeAll();
    }
}

Cudafy 内核

// Kernel exactly as posted in the question; it reproduces the reported problem.
// For every index tid < N it copies data[tid, 0..8] into tmp[1..9], linearly maps
// each value from the range [gmin, gmax] to [nmin, nmax], and stores tmp[1] in test[tid].
[Cudafy]
public static void kernelfx_alldata(GThread thread, int N, double[,] data, double[] nmin, double[] nmax, double[] gmin, double[] gmax, double[] test)
{
    // Index of this thread across the whole launch.
    int tid = thread.threadIdx.x + thread.blockIdx.x * thread.blockDim.x;
    while (tid < N)
    {
        // NOTE(review): AllocateShared presumably hands every thread of the block the
        // same 10-element "tmp" buffer. All threads then read and write the same slots
        // with no synchronization, which would explain the wrong results reported for
        // larger block sizes - confirm against the Cudafy shared-memory documentation.
        double[] tmp = thread.AllocateShared<double>("tmp", 10);
        tmp[0] = 1; 
        // Stage row `tid` of data, shifted by one slot.
        for (int i = 1; i < 10; i++)
        {
            tmp[i] = data[tid, i - 1];
        }
        // Rescale: (nmax - nmin) / (gmax - gmin) * (x - gmin) + nmin, per column.
        for (int i = 1; i < 10; i++)
        {
            tmp[i] = (nmax[i - 1] - nmin[i - 1]) / (gmax[i - 1] - gmin[i - 1]) * (tmp[i] - gmin[i - 1]) + nmin[i - 1];
        }
        // Only the first rescaled value is written out.
        test[tid] = tmp[1];

        // Advance by the total number of launched threads.
        tid = tid + thread.blockDim.x * thread.gridDim.x;
    }
}

正确的(案例 1 和案例 3)结果是:

test[0]=199.0
test[1]=201.0
test[2]=203.0
test[3]=205.0
test[4]=207.0
test[5]=209.0
test[6]=211.0
test[7]=213.0
test[8]=215.0
test[9]=217.0

不正确的(CASE 2)结果是:

test[0]=213.0
test[1]=213.0
test[2]=213.0
test[3]=213.0
test[4]=213.0
test[5]=213.0
test[6]=213.0
test[7]=213.0
test[8]=217.0
test[9]=217.0

当 BlockSize 小于 8 时,结果是正确的;但当 BlockSize 大于或等于 8 时,结果是不正确的。而为了有效地利用 GPU,BlockSize 需要大于 8。

这段代码有什么问题?

最好的问候...

【问题讨论】:

    标签: c# opencl gpgpu gpu cudafy.net


    【解决方案1】:

    将 tmp 声明为二维数组,并以线程索引作为第一维(这样每个线程在共享内存中使用各自独立的一行,不会互相覆盖),即可解决问题。工作代码如下:

    /// <summary>
    /// For every index tid &lt; N: copies data[tid, 0..8] into a scratch row, linearly maps
    /// each value from [gmin, gmax] to [nmin, nmax], and writes the first mapped value
    /// to test[tid].
    /// </summary>
    /// <remarks>
    /// The scratch buffer is a shared 10x10 array with one row per thread of the block,
    /// so threads never touch each other's row. Rows are selected by threadIdx.x, not by
    /// the global index, which keeps the access in bounds for any N and any grid size.
    /// The block size used at launch must not exceed 10 (the number of scratch rows).
    /// </remarks>
    [Cudafy]
    public static void kernelfx_alldata(GThread thread, int N, double[,] data, double[] nmin,
                                    double[] nmax, double[] gmin, double[] gmax, double[] test)
    {
        int tid = thread.threadIdx.x + thread.blockIdx.x * thread.blockDim.x;
        double[,] tmp = thread.AllocateShared<double>("tmp", 10, 10);

        // Private scratch row of this thread. Using tid here would index past the
        // 10 rows as soon as N > 10 or more than one block is launched.
        int row = thread.threadIdx.x;

        while (tid < N)
        {
            tmp[row, 0] = 1;
            // Stage row `tid` of data, shifted by one slot.
            for (int i = 1; i < 10; i++)
            {
                tmp[row, i] = data[tid, i - 1];
            }
            // Rescale: (nmax - nmin) / (gmax - gmin) * (x - gmin) + nmin, per column.
            for (int i = 1; i < 10; i++)
            {
                tmp[row, i] = (nmax[i - 1] - nmin[i - 1]) / (gmax[i - 1] - gmin[i - 1]) * (tmp[row, i] - gmin[i - 1]) + nmin[i - 1];
            }
            test[tid] = tmp[row, 1];

            // Advance by the total number of launched threads.
            tid = tid + thread.blockDim.x * thread.gridDim.x;
        }
    }
    

    【讨论】:

      猜你喜欢
      • 2019-11-10
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2020-02-15
      • 2014-08-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多