一个关于 CUDA Warps 的简单代码答案

【问题标题】：A simple Code about CUDA Warps一个关于 CUDA Warps 的简单代码
【发布时间】：2014-02-03 09:55:33
【问题描述】：

我在 Cuda Documentaion 中读到，在每个块中，线程以 32 个称为 warp 的批次执行，每个线程指向相同的指令但可以访问多个数据，我的任务是测试语句的真实性.

现在我所做的是我启动了一个有 256 个线程和一个块的内核，所以 8 个批次经线必须执行。

我将创建一个大小为 32 的共享变量，并将其分配给

sharedVariable [ threadIdx.x % 32 ] = threadIdx.x /32;

然后将该变量分配给 256 字节长度的全局变量：

outputPointer[ threadIdx.x  ] = sharedVariable [ threadIdx.x % 32 ];

理想情况下，根据假设我应该得到输出

0,0,0,0,0,0,0,0,直到 32 1,1,1,1,1,1直到32.. 2,2,2,2,2,直到 32

但我得到的输出只是 4,4,4,4,4

Cuda 代码：

__global__ void addKernel(int *inputPointer, int *outputPointer)
{
    __shared__ int sharedVariable[ 32 ];

    sharedVariable [ threadIdx.x % 32 ] = 0 ;
    sharedVariable [ threadIdx.x % 32 ] = threadIdx.x /32;


    outputPointer[ threadIdx.x  ] = sharedVariable [ threadIdx.x % 32 ];
}

int main () { 
......
 addKernel<<<1, 256>>>(device_inputPointer, device_outputPointer);
......
/**Print output here */
//I am getting 4 ,4,4,4,4,4,4,4,4 as output
}

完整代码：

#include "cuda_runtime.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <conio.h>
#define SIZE  256 * sizeof(int)


__global__ void addKernel(int *inputPointer, int *outputPointer)
{
    __shared__ int sharedVariable[ 32 ];


    sharedVariable [ threadIdx.x % 32 ] = 0;

    sharedVariable [ threadIdx.x % 32 ] = threadIdx.x /32;


    outputPointer[ threadIdx.x  ] = sharedVariable [ threadIdx.x % 32 ];
}



int main()
{





    // Copy input vectors from host memory to GPU buffers.
    int *inputPointer = (int * ) malloc (SIZE);
    int *outputPointer= (int * ) malloc (SIZE);


    int *device_inputPointer;
    int *device_outputPointer;

    cudaMalloc((void**)&device_inputPointer, SIZE);
    cudaMalloc((void**)&device_outputPointer, SIZE);

    memset (inputPointer  , 0  ,  SIZE);
    cudaMemcpy(device_inputPointer , inputPointer,  SIZE , cudaMemcpyHostToDevice);

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, 256>>>(device_inputPointer, device_outputPointer);

    cudaMemcpy(outputPointer, device_outputPointer, SIZE , cudaMemcpyDeviceToHost);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.

    for ( int i = 0 ; i < 256  ; i ++ ) {
        printf ( " %d  " , outputPointer[i] );
    }

    cudaDeviceReset();



    getch();
    return 0;
}

#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <conio.h>
#define SIZE  256 * sizeof(int)


__global__ void addKernel(int *inputPointer, int *outputPointer)
{
    __shared__ int sharedVariable[ 32 ];


    sharedVariable [ threadIdx.x % 32 ] = 0;

    sharedVariable [ threadIdx.x % 32 ] = threadIdx.x /32;


    outputPointer[ threadIdx.x  ] = sharedVariable [ threadIdx.x % 32 ];
}



int main()
{





    // Copy input vectors from host memory to GPU buffers.
    int *inputPointer = (int * ) malloc (SIZE);
    int *outputPointer= (int * ) malloc (SIZE);


    int *device_inputPointer;
    int *device_outputPointer;

    cudaMalloc((void**)&device_inputPointer, SIZE);
    cudaMalloc((void**)&device_outputPointer, SIZE);

    memset (inputPointer  , 0  ,  SIZE);
    cudaMemcpy(device_inputPointer , inputPointer,  SIZE , cudaMemcpyHostToDevice);

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, 256>>>(device_inputPointer, device_outputPointer);

    cudaMemcpy(outputPointer, device_outputPointer, SIZE , cudaMemcpyDeviceToHost);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.

    for ( int i = 0 ; i < 256  ; i ++ ) {
        printf ( " %d  " , outputPointer[i] );
    }

    cudaDeviceReset();



    getch();
    return 0;
}

我在不同的硬件上测试过在 K20 上（特斯拉架构，它工作正常）

【问题讨论】：

我想不出一种方法来完成你想做的事情，所以我在这里重新提出了这个问题：stackoverflow.com/q/21558351/442006

标签： cuda

【解决方案1】：

我认为您并不完全清楚 cuda 代码是如何并行执行的。 sharedVariable [ threadIdx.x % 32 ] = 0 ; 行完全没用，因为在下一行你用threadIdx.x /32 覆盖它。

您对经线计数的假设也是错误的。您的线程块由 256 个线程组成。所以有 8 条经线 (256/32)。

您不能期望您的代码有任何结果，因为没有明确的行为！ threadIdx.x /32 的结果将在 0 到 7 的范围内，具体取决于范围为 0..255 的 threadIdx.x。因为每 32 个线程中有 8 个 warp，所以将有 8 次写入 sharedVariable [ threadIdx.x % 32 ] 而您无法控制，哪个 warp 将首先执行和最后执行。

在您的情况下，warp 4 最终被执行，因此您的结果只有 4 个。

要从内核中获得您期望的结果，可以将其更改为：

__global__ void addKernel(int *outputPointer)
{
    outputPointer[ threadIdx.x  ] = threadIdx.x /32;
}

我认为没有机会像您希望的那样使用共享内存。

【讨论】：

我认为在上面的代码中执行warp的顺序不是问题，只要一次执行一个warp，上面的代码就必须正常工作？
当然，正如@sgar91 所说，不同经线的线之间也有一个退出条件。如果我在我的系统上启动代码，我只会得到 6 个。

【解决方案2】：

代码在以下几行中有未定义的行为：

 sharedVariable [ threadIdx.x % 32 ] = 0;
 sharedVariable [ threadIdx.x % 32 ] = threadIdx.x /32;

多个线程可以具有相同的threadIdx.x % 32 值，并且这些线程会尝试同时写入相同的共享内存位置。这将导致这些线程之间出现竞争条件。

例如，考虑具有threadIdx.x 0、32、64、96...等的线程。所有这些线程都将尝试访问sharedVariable 的索引0，从而导致未定义的行为。所有线程之间的偏移量为32 的情况也是如此（仅在当前示例中）。

【讨论】：

但是一次只能在硬件中安排一个warp，对吗？因此不会有覆盖。
计划warp 的时间无关紧要。从大局来看，所有线程只是相互独立地执行。正如@hubs 所指出的，线程之间会存在竞争条件，并且内存将具有最后一个线程分配的值。在我的机器上，我在输出中得到5。
对于threadIdx.x = 0、32、64、96、128、160、192 和 224 threadIdx.x % 32 是每次0！这些是每个经线的第一个线程。除以 32 写入sharedVariable [0] 的结果可以是 0,1,2,3,4,5,6 或 7。
@hubs... 没错，但我们不知道最后一个访问内存的 warp。
@sgar91 当然可以，因此我写的可以是 0,1,2.. 抱歉，如果我的意思不完全清楚。