参考: Shane Cook. CUDA Programming: A developer’s guide to parallel computing with GPUs
共享内存
共享内存是一块特殊的内存, 因为它存在于芯片上并且存取速度比全局内存快.
可以在共享内存上创建一个包含256个bin的局部统计直方图,
最后将所有共享内存上计算得到的统计直方图通过原子操作汇总到全局内存.
这样可以把对全局内存的原子操作从"每个输入字节一次"减少到"每个线程块每个bin一次", 从而显著节省时间.
下图是GTX1050 的内存容量.
分治
// Shared-memory histogram kernel (after Shane Cook, "CUDA Programming",
// listing myhistogram256Kernel_07).
//
// Expected launch layout (see cudaHist_07):
//   blockDim = (256, 1)  -- exactly one thread per histogram bin
//   gridDim  = 2-D grid; each thread reads N packed 32-bit words, so the
//   input must hold gridDim.x * gridDim.y * 256 * N words (4 bytes each).
//
// Each block accumulates a private 256-bin histogram in on-chip shared
// memory, then merges it into the global histogram with a single
// atomicAdd per bin -- far fewer global atomics than one per input byte.
__global__ void myhistogram256Kernel_07(const Cuda32u *d_hist_data, Cuda32u * d_bin_data, Cuda32u N)
{
	// Per-block private histogram; shared memory is on-chip and much
	// faster than global memory for the heavily contended bin counters.
	__shared__ Cuda32u d_bin_data_shared[256];

	// Thread id. Each thread consumes N words, so a whole block covers
	// blockDim.x * N words and the block/row offsets must be scaled by N
	// as well (this is Cook's original indexing; dropping the factor N
	// makes the blocks overlap and double-count part of the input while
	// never touching the rest).
	const Cuda32u idx = blockIdx.x * (blockDim.x * N) + threadIdx.x;
	const Cuda32u idy = blockIdx.y * blockDim.y + threadIdx.y;
	const Cuda32u tid = idx + idy * (blockDim.x * N) * gridDim.x;

	// Clear this block's private histogram (shared memory starts
	// uninitialized); one bin per thread.
	d_bin_data_shared[threadIdx.x] = 0;
	// All 256 bins must be zeroed before any thread starts counting.
	__syncthreads();

	// Each iteration loads one 32-bit word (4 packed input bytes) and
	// bumps the four corresponding bins. Consecutive threads read
	// consecutive words, so the global loads coalesce.
	for (Cuda32u i = 0, tid_offset = 0; i < N; i++, tid_offset += 256)
	{
		const Cuda32u value_u32 = d_hist_data[tid + tid_offset];
		atomicAdd(&(d_bin_data_shared[((value_u32 & 0x000000FF))]), 1);
		atomicAdd(&(d_bin_data_shared[((value_u32 & 0x0000FF00) >> 8)]), 1);
		atomicAdd(&(d_bin_data_shared[((value_u32 & 0x00FF0000) >> 16)]), 1);
		atomicAdd(&(d_bin_data_shared[((value_u32 & 0xFF000000) >> 24)]), 1);
	}
	// Wait until every thread in the block has finished counting.
	__syncthreads();

	// Merge: one global atomic per bin per block.
	atomicAdd(&(d_bin_data[threadIdx.x]), d_bin_data_shared[threadIdx.x]);
}
// Host-side launcher for the shared-memory histogram kernel.
// Launches a fixed 16x16 grid of uBinSize-thread blocks; every thread
// consumes N packed 32-bit words, so the input buffer must contain
// 16 * 16 * uBinSize * N words.
void cudaHist_07(Cuda32u *d_hist_data,Cuda32u * d_bin_data, Cuda32u N, Cuda32u uBinSize=256)
{
	// One thread per histogram bin (must equal uBinSize).
	const dim3 threadsPerBlock(uBinSize, 1);
	// Hard-coded 2-D grid; caller is responsible for sizing the input
	// to match -- TODO confirm against the call site.
	const dim3 blocksPerGrid(16, 16);
	myhistogram256Kernel_07<<<blocksPerGrid, threadsPerBlock>>>(d_hist_data, d_bin_data, N);
}
调用:
// Host-side driver: build random byte data, time a CPU reference
// histogram, then time the CUDA shared-memory histogram and compare.
// (cpuHist, cudaAdd, StartTimer/GetTimer, checkCudaErrors and the
// CudaXXu typedefs are project helpers defined elsewhere.)
// 256^3 input bytes = 16 MiB; chosen so the fixed 16x16 grid of 256
// threads, each reading N = 64 words, covers the buffer exactly.
const Cuda32u uArraySize = 256*256*256;
const Cuda32u uBinSize = 256;
Cuda8u *h_puchData = (Cuda8u *)malloc(uArraySize*sizeof(Cuda8u));
for (int i = 0; i < uArraySize; i++)
{
h_puchData[i] = rand() % uBinSize;
}
Cuda32u h_puHist[uBinSize] = { 0 };
// Words processed per GPU thread (uArraySize / 4 bytes / (16*16*256) threads).
Cuda32u N = 64;
// Number of timed repetitions for averaging.
Cuda32u iIterNum = 10;
// --- CPU reference histogram (timed) ---
// NOTE(review): h_puHist is not reset between iterations, so counts
// accumulate across all iIterNum runs; the report divides by iIterNum.
StartTimer();
for (Cuda32u i = 0; i < iIterNum;i++)
{
cpuHist(h_puchData, h_puHist, uArraySize, uBinSize);
}
double dblTimeElps = GetTimer();
// Sanity check: total counts per iteration should equal uArraySize.
Cuda32u iSumC = 0;
for (Cuda32u i = 0; i < uBinSize; i++)
{
iSumC += h_puHist[i];
}
printf("\n%%%%%%%%%%%%%% CPU 计算直方图:%%%%%%%%%%%%%%\n");
printf("序列长度 = %d\n", uArraySize);
printf("重复次数 = %d\n", iIterNum);
printf("Hist累计 = %d\n", iSumC / iIterNum);
printf("平均用时 = %fms\n", dblTimeElps / (Cuda64f)iIterNum);
printf("%%%%%%%%%%%%%% CPU 计算直方图:%%%%%%%%%%%%%%\n");
// --- Copy host data to the GPU ---
// Reset the host histogram first; it is copied over as the initial
// (zeroed) device histogram.
memset((void*)h_puHist, 0, uBinSize*sizeof(Cuda32u));
Cuda8u * d_puchData = NULL;
Cuda32u * d_puHist = NULL;
checkCudaErrors(cudaMalloc((void**)&d_puchData, uArraySize*sizeof(Cuda8u)));
checkCudaErrors(cudaMalloc((void**)&d_puHist, uBinSize*sizeof(Cuda32u)));
checkCudaErrors(cudaMemcpy((void*)d_puchData, (void*)h_puchData, uArraySize*sizeof(Cuda8u), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy((void*)d_puHist, (void*)h_puHist, uBinSize*sizeof(Cuda32u), cudaMemcpyHostToDevice));
// Warm-up launch so context/JIT overhead is excluded from the timing.
cudaAdd();
// --- GPU histogram (timed with CUDA events) ---
cudaEvent_t start, stop;
Cuda32f elapsedTime = 0.0;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
for (Cuda32u i = 0; i < iIterNum;i++)
{
// The byte buffer is reinterpreted as 32-bit words: the kernel reads
// 4 packed bytes per load (coalesced access).
cudaHist_07((Cuda32u*)d_puchData, d_puHist, N);
//cudaHist_01(d_puchData, d_puHist);
}
// Stop timing; cudaEventSynchronize blocks until the kernels finish.
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
// Copy the accumulated device histogram back to the host.
checkCudaErrors(cudaMemcpy((void*)h_puHist, (void*)d_puHist, uBinSize*sizeof(Cuda32u), cudaMemcpyDeviceToHost));
// Same sanity check as the CPU path: per-iteration total should be uArraySize.
iSumC = 0;
for (Cuda32u i = 0; i < uBinSize; i++)
{
iSumC += h_puHist[i];
}
printf("\n%%%%%%%%%%%%%% CUDA 计算直方图:%%%%%%%%%%%%%%\n");
printf("序列长度 = %d\n", uArraySize);
printf("重复次数 = %d\n", iIterNum);
printf("Hist累计 = %d\n", iSumC / iIterNum);
printf("平均用时 = %f ms\n", elapsedTime / (Cuda32u)iIterNum);
printf("%%%%%%%%%%%%%% CUDA 计算直方图:%%%%%%%%%%%%%%\n\n");
// --- Release resources ---
checkCudaErrors(cudaFree((void*)d_puchData));
checkCudaErrors(cudaFree((void*)d_puHist));
cudaDeviceReset();
输出结果:
使用共享内存的提升比单纯原子操作的提升要高很多.
顺便说一句, 这里用到了合并访存(memory coalescing): 将4个unsigned char合并为一个unsigned int读取.
但是在这一个问题中, 此方法加速不明显.