【发布时间】:2014-02-03 09:55:33
【问题描述】:
我在 Cuda Documentaion 中读到,在每个块中,线程以 32 个称为 warp 的批次执行,每个线程指向相同的指令但可以访问多个数据,我的任务是测试语句的真实性.
现在我所做的是我启动了一个有 256 个线程和一个块的内核,所以 8 个批次 经线必须执行。
我将创建一个大小为 32 的共享变量,并将其分配给
sharedVariable [ threadIdx.x % 32 ] = threadIdx.x /32;
然后将该变量分配给 256 字节长度的全局变量:
outputPointer[ threadIdx.x ] = sharedVariable [ threadIdx.x % 32 ];
理想情况下,根据假设我应该得到输出
0,0,0,0,0,0,0,0,直到 32 1,1,1,1,1,1直到32.. 2,2,2,2,2,直到 32
但我得到的输出只是 4,4,4,4,4
Cuda 代码:
__global__ void addKernel(int *inputPointer, int *outputPointer)
{
__shared__ int sharedVariable[ 32 ];
sharedVariable [ threadIdx.x % 32 ] = 0 ;
sharedVariable [ threadIdx.x % 32 ] = threadIdx.x /32;
outputPointer[ threadIdx.x ] = sharedVariable [ threadIdx.x % 32 ];
}
int main () {
......
addKernel<<<1, 256>>>(device_inputPointer, device_outputPointer);
......
/**Print output here */
//I am getting 4 ,4,4,4,4,4,4,4,4 as output
}
完整代码:
#include "cuda_runtime.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <conio.h>
#define SIZE 256 * sizeof(int)
__global__ void addKernel(int *inputPointer, int *outputPointer)
{
__shared__ int sharedVariable[ 32 ];
sharedVariable [ threadIdx.x % 32 ] = 0;
sharedVariable [ threadIdx.x % 32 ] = threadIdx.x /32;
outputPointer[ threadIdx.x ] = sharedVariable [ threadIdx.x % 32 ];
}
int main()
{
// Copy input vectors from host memory to GPU buffers.
int *inputPointer = (int * ) malloc (SIZE);
int *outputPointer= (int * ) malloc (SIZE);
int *device_inputPointer;
int *device_outputPointer;
cudaMalloc((void**)&device_inputPointer, SIZE);
cudaMalloc((void**)&device_outputPointer, SIZE);
memset (inputPointer , 0 , SIZE);
cudaMemcpy(device_inputPointer , inputPointer, SIZE , cudaMemcpyHostToDevice);
// Launch a kernel on the GPU with one thread for each element.
addKernel<<<1, 256>>>(device_inputPointer, device_outputPointer);
cudaMemcpy(outputPointer, device_outputPointer, SIZE , cudaMemcpyDeviceToHost);
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
for ( int i = 0 ; i < 256 ; i ++ ) {
printf ( " %d " , outputPointer[i] );
}
cudaDeviceReset();
getch();
return 0;
}
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <conio.h>
#define SIZE 256 * sizeof(int)
__global__ void addKernel(int *inputPointer, int *outputPointer)
{
__shared__ int sharedVariable[ 32 ];
sharedVariable [ threadIdx.x % 32 ] = 0;
sharedVariable [ threadIdx.x % 32 ] = threadIdx.x /32;
outputPointer[ threadIdx.x ] = sharedVariable [ threadIdx.x % 32 ];
}
int main()
{
// Copy input vectors from host memory to GPU buffers.
int *inputPointer = (int * ) malloc (SIZE);
int *outputPointer= (int * ) malloc (SIZE);
int *device_inputPointer;
int *device_outputPointer;
cudaMalloc((void**)&device_inputPointer, SIZE);
cudaMalloc((void**)&device_outputPointer, SIZE);
memset (inputPointer , 0 , SIZE);
cudaMemcpy(device_inputPointer , inputPointer, SIZE , cudaMemcpyHostToDevice);
// Launch a kernel on the GPU with one thread for each element.
addKernel<<<1, 256>>>(device_inputPointer, device_outputPointer);
cudaMemcpy(outputPointer, device_outputPointer, SIZE , cudaMemcpyDeviceToHost);
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
for ( int i = 0 ; i < 256 ; i ++ ) {
printf ( " %d " , outputPointer[i] );
}
cudaDeviceReset();
getch();
return 0;
}
我在不同的硬件上测试过 在 K20 上(特斯拉架构,它工作正常)
【问题讨论】:
-
我想不出一种方法来完成你想做的事情,所以我在这里重新提出了这个问题:stackoverflow.com/q/21558351/442006
标签: cuda