Avoiding Branch Divergence
有时,控制流依赖于thread索引。同一个warp中,一个条件分支可能导致很差的性能。通过重新组织数据获取模式可以减少或避免warp divergence(该问题的解释请查看warp解析篇)。
The Parallel Reduction Problem
我们现在要计算一个数组N个元素的和。这个过程用CPU编程很容易实现:
int sum = 0; for (int i = 0; i < N; i++) sum += array[i];
那么如果Array的元素非常多呢?应用并行计算可以大大提升这个过程的效率。鉴于加法的交换律等性质,这个求和过程可以以元素的任意顺序来进行:
- 将输入数组切割成很多小的块。
- 用thread来计算每个块的和。
- 对这些块的结果再求和得最终结果。
数组的切割主旨是,用thread求数组中按一定规律配对的的两个元素和,然后将所有结果组合成一个新的数组,然后再次求配对两元素和,多次迭代,直到数组中只有一个结果。
比较直观的两种实现方式是:
- Neighbored pair:每次迭代都是相邻两个元素求和。
- Interleaved pair:按一定跨度配对两个元素。
下图展示了两种方式的求解过程,对于有N个元素的数组,这个过程需要N-1次求和,log(N)步。Interleaved pair的跨度是半个数组长度。
下面是用递归实现的interleaved pair代码(host):
int recursiveReduce(int *data, int const size) { // terminate check if (size == 1) return data[0]; // renew the stride int const stride = size / 2; // in-place reduction for (int i = 0; i < stride; i++) { data[i] += data[i + stride]; } // call recursively return recursiveReduce(data, stride); }
上述讲的这类问题术语叫reduction problem。Parallel reduction(并行规约)是指迭代减少操作,是并行算法中非常关键的一种操作。
Divergence in Parallel Reduction
这部分以neighbored pair为参考研究:
在这个kernel里面,有两个global memory array,一个用来存放数组所有数据,另一个用来存放部分和。所有block独立的执行求和操作。__syncthreads(关于同步,请看前文)用来保证每次迭代,所有的求和操作都做完,然后进入下一步迭代。
__global__ void reduceNeighbored(int *g_idata, int *g_odata, unsigned int n) { // set thread ID unsigned int tid = threadIdx.x; // convert global data pointer to the local pointer of this block int *idata = g_idata + blockIdx.x * blockDim.x; // boundary check if (idx >= n) return; // in-place reduction in global memory for (int stride = 1; stride < blockDim.x; stride *= 2) { if ((tid % (2 * stride)) == 0) { idata[tid] += idata[tid + stride]; } // synchronize within block __syncthreads(); } // write result for this block to global mem if (tid == 0) g_odata[blockIdx.x] = idata[0]; }
因为没有办法让所有的block同步,所以最后将所有block的结果送回host来进行串行计算,如下图所示:
main代码:
int main(int argc, char **argv) { // set up device int dev = 0; cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); printf("%s starting reduction at ", argv[0]); printf("device %d: %s ", dev, deviceProp.name); cudaSetDevice(dev); bool bResult = false; // initialization int size = 1<<24; // total number of elements to reduce printf(" with array size %d ", size); // execution configuration int blocksize = 512; // initial block size if(argc > 1) { blocksize = atoi(argv[1]); // block size from command line argument } dim3 block (blocksize,1); dim3 grid ((size+block.x-1)/block.x,1); printf("grid %d block %d\n",grid.x, block.x); // allocate host memory size_t bytes = size * sizeof(int); int *h_idata = (int *) malloc(bytes); int *h_odata = (int *) malloc(grid.x*sizeof(int)); int *tmp = (int *) malloc(bytes); // initialize the array for (int i = 0; i < size; i++) { // mask off high 2 bytes to force max number to 255 h_idata[i] = (int)(rand() & 0xFF); } memcpy (tmp, h_idata, bytes); size_t iStart,iElaps; int gpu_sum = 0; // allocate device memory int *d_idata = NULL; int *d_odata = NULL; cudaMalloc((void **) &d_idata, bytes); cudaMalloc((void **) &d_odata, grid.x*sizeof(int)); // cpu reduction iStart = seconds (); int cpu_sum = recursiveReduce(tmp, size); iElaps = seconds () - iStart; printf("cpu reduce elapsed %d ms cpu_sum: %d\n",iElaps,cpu_sum); // kernel 1: reduceNeighbored cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice); cudaDeviceSynchronize(); iStart = seconds (); warmup<<<grid, block>>>(d_idata, d_odata, size); cudaDeviceSynchronize(); iElaps = seconds () - iStart; cudaMemcpy(h_odata, d_odata, grid.x*sizeof(int), cudaMemcpyDeviceToHost); gpu_sum = 0; for (int i=0; i<grid.x; i++) gpu_sum += h_odata[i]; printf("gpu Warmup elapsed %d ms gpu_sum: %d <<<grid %d block %d>>>\n", iElaps,gpu_sum,grid.x,block.x); // kernel 1: reduceNeighbored cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice); cudaDeviceSynchronize(); iStart = seconds (); reduceNeighbored<<<grid, block>>>(d_idata, d_odata, size); cudaDeviceSynchronize(); iElaps = seconds () - iStart; cudaMemcpy(h_odata, d_odata, grid.x*sizeof(int), cudaMemcpyDeviceToHost); gpu_sum = 0; for (int i=0; i<grid.x; i++) gpu_sum += h_odata[i]; printf("gpu Neighbored elapsed %d ms gpu_sum: %d <<<grid %d block %d>>>\n", iElaps,gpu_sum,grid.x,block.x); cudaDeviceSynchronize(); iElaps = seconds() - iStart; cudaMemcpy(h_odata, d_odata, grid.x/8*sizeof(int), cudaMemcpyDeviceToHost); gpu_sum = 0; for (int i = 0; i < grid.x / 8; i++) gpu_sum += h_odata[i]; printf("gpu Cmptnroll elapsed %d ms gpu_sum: %d <<<grid %d block %d>>>\n", iElaps,gpu_sum,grid.x/8,block.x); /// free host memory free(h_idata); free(h_odata); // free device memory cudaFree(d_idata); cudaFree(d_odata); // reset device cudaDeviceReset(); // check the results bResult = (gpu_sum == cpu_sum); if(!bResult) printf("Test failed!\n"); return EXIT_SUCCESS; }