CUDA 对于 32 位和 64 位类型都有 population count intrinsics。 (__popc() 和__popcll())
这些可以直接在 CUDA 内核中使用,或者通过推力(在函子中)可能传递给thrust::transform_reduce。
如果这是您希望在 GPU 上执行的唯一功能,则可能很难获得净“胜利”,因为向/从 GPU 传输数据的“成本”。您的整个输入数据集的大小似乎约为 1GB(100000 个位长为 65536 的向量),但根据我的计算,输出数据集的大小似乎为 10-40GB(100000 * 100000 * 每个结果 1-4 个字节) .
无论是 CUDA 内核还是推力函数和数据布局都应该仔细设计,目标是让代码运行仅受内存带宽的限制。数据传输的成本也可以通过复制和计算操作的重叠(主要是在输出数据集上)在很大程度上降低。
乍一看,这个问题似乎有点类似于计算向量集之间的欧几里得距离的问题,所以从 CUDA 的角度来看,this question/answer 可能很有趣。
编辑:添加一些我用来调查此问题的代码。我能够在一个简单的单线程 CPU 实现上获得显着的加速(约 25 倍,包括数据复制时间),但我不知道 CPU 版本使用“分块多线程和 SSE/AVX”的速度有多快,所以它看到更多你的实现或获得一些性能数据会很有趣。我也不认为我这里的 CUDA 代码是高度优化的,它只是“第一次切割”。
在这种情况下,为了进行概念验证,我专注于一个小问题,N=2048, 10000 个位集。对于这个小问题大小,我可以在共享内存中容纳足够多的位集向量,对于“小”线程块大小,以利用共享内存。因此,必须针对更大的N 修改此特定方法。
$ cat t581.cu
#include <iostream>
#include <vector>
#include <bitset>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#define nTPB 128
#define OUT_CHUNK 250
#define N_bits 2048
#define N_vecs 10000
const size_t N = N_bits;
__global__ void comp_dist(unsigned *in, unsigned *out, unsigned numvecs, unsigned start_idx, unsigned end_idx){
__shared__ unsigned sdata[(N/32)*nTPB];
int idx = threadIdx.x+blockDim.x*blockIdx.x;
if (idx < numvecs)
for (int i = 0; i < (N/32); i++)
sdata[(i*nTPB)+threadIdx.x] = in[(i*numvecs)+idx];
__syncthreads();
int vidx = start_idx;
if (idx < numvecs)
while (vidx < end_idx) {
unsigned sum = 0;
for (int i = 0; i < N/32; i++)
sum += __popc(sdata[(i*nTPB)+ threadIdx.x] & in[(i*numvecs)+vidx]);
out[((vidx-start_idx)*numvecs)+idx] = sum;
vidx++;}
}
void cpu_test(std::vector<std::bitset<N> > &in, std::vector<std::vector<unsigned> > &out){
for (int i=0; i < in.size(); i++)
for (int j=0; j< in.size(); j++)
out[i][j] = (in[i] & in[j]).count();
}
int check_data(unsigned *d1, unsigned start_idx, std::vector<std::vector<unsigned> > &d2){
for (int i = start_idx; i < start_idx+OUT_CHUNK; i++)
for (int j = 0; j<N_vecs; j++)
if (d1[((i-start_idx)*N_vecs)+j] != d2[i][j]) {std::cout << "mismatch at " << i << "," << j << " was: " << d1[((i-start_idx)*N_vecs)+j] << " should be: " << d2[i][j] << std::endl; return 1;}
return 0;
}
unsigned long long get_time_usec(){
timeval tv;
gettimeofday(&tv, 0);
return (unsigned long long)(((unsigned long long)tv.tv_sec*1000000ULL)+(unsigned long long)tv.tv_usec);
}
int main(){
unsigned long long t1, t2;
std::vector<std::vector<unsigned> > distances;
std::vector<std::bitset<N> > bits;
for (int i = 0; i < N_vecs; i++){
std::vector<unsigned> dist_row(N_vecs, 0);
distances.push_back(dist_row);
std::bitset<N> data;
for (int j =0; j < N; j++) data[j] = rand() & 1;
bits.push_back(data);}
t1 = get_time_usec();
cpu_test(bits, distances);
t1 = get_time_usec() - t1;
unsigned *h_data = new unsigned[(N/32)*N_vecs];
memset(h_data, 0, (N/32)*N_vecs*sizeof(unsigned));
for (int i = 0; i < N_vecs; i++)
for (int j = 0; j < N; j++)
if (bits[i][j]) h_data[(i)+((j/32)*N_vecs)] |= 1U<<(31-(j&31));
unsigned *d_in, *d_out1, *d_out2, *h_out1, *h_out2;
cudaMalloc(&d_in, (N/32)*N_vecs*sizeof(unsigned));
cudaMalloc(&d_out1, N_vecs*OUT_CHUNK*sizeof(unsigned));
cudaMalloc(&d_out2, N_vecs*OUT_CHUNK*sizeof(unsigned));
cudaStream_t stream1, stream2;
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
h_out1 = new unsigned[N_vecs*OUT_CHUNK];
h_out2 = new unsigned[N_vecs*OUT_CHUNK];
t2 = get_time_usec();
cudaMemcpy(d_in, h_data, (N/32)*N_vecs*sizeof(unsigned), cudaMemcpyHostToDevice);
for (int i = 0; i < N_vecs; i += 2*OUT_CHUNK){
comp_dist<<<(N_vecs + nTPB - 1)/nTPB, nTPB, 0, stream1>>>(d_in, d_out1, N_vecs, i, i+OUT_CHUNK);
cudaStreamSynchronize(stream2);
if (i > 0) if (check_data(h_out2, i-OUT_CHUNK, distances)) return 1;
comp_dist<<<(N_vecs + nTPB - 1)/nTPB, nTPB, 0, stream2>>>(d_in, d_out2, N_vecs, i+OUT_CHUNK, i+2*OUT_CHUNK);
cudaMemcpyAsync(h_out1, d_out1, N_vecs*OUT_CHUNK*sizeof(unsigned), cudaMemcpyDeviceToHost, stream1);
cudaMemcpyAsync(h_out2, d_out2, N_vecs*OUT_CHUNK*sizeof(unsigned), cudaMemcpyDeviceToHost, stream2);
cudaStreamSynchronize(stream1);
if (check_data(h_out1, i, distances)) return 1;
}
cudaDeviceSynchronize();
t2 = get_time_usec() - t2;
std::cout << "cpu time: " << ((float)t1)/(float)1000 << "ms gpu time: " << ((float) t2)/(float)1000 << "ms" << std::endl;
return 0;
}
$ nvcc -O3 -arch=sm_20 -o t581 t581.cu
$ ./t581
cpu time: 20324.1ms gpu time: 753.76ms
$
CUDA 6.5、Fedora20、至强 X5560、Quadro5000 (cc2.0) GPU。上述测试用例包括在 CPU 与 GPU 上产生的距离数据之间的结果验证。我还将其分解为一个分块算法,其中结果数据传输(和验证)与计算操作重叠,以使其更容易扩展到有大量输出数据(例如 100000 个位集)的情况。但是,我实际上还没有通过探查器运行它。
编辑 2:这是代码的“Windows 版本”:
#include <iostream>
#include <vector>
#include <bitset>
#include <stdlib.h>
#include <time.h>
#define nTPB 128
#define OUT_CHUNK 250
#define N_bits 2048
#define N_vecs 10000
const size_t N = N_bits;
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__global__ void comp_dist(unsigned *in, unsigned *out, unsigned numvecs, unsigned start_idx, unsigned end_idx){
__shared__ unsigned sdata[(N/32)*nTPB];
int idx = threadIdx.x+blockDim.x*blockIdx.x;
if (idx < numvecs)
for (int i = 0; i < (N/32); i++)
sdata[(i*nTPB)+threadIdx.x] = in[(i*numvecs)+idx];
__syncthreads();
int vidx = start_idx;
if (idx < numvecs)
while (vidx < end_idx) {
unsigned sum = 0;
for (int i = 0; i < N/32; i++)
sum += __popc(sdata[(i*nTPB)+ threadIdx.x] & in[(i*numvecs)+vidx]);
out[((vidx-start_idx)*numvecs)+idx] = sum;
vidx++;}
}
void cpu_test(std::vector<std::bitset<N> > &in, std::vector<std::vector<unsigned> > &out){
for (unsigned i=0; i < in.size(); i++)
for (unsigned j=0; j< in.size(); j++)
out[i][j] = (in[i] & in[j]).count();
}
int check_data(unsigned *d1, unsigned start_idx, std::vector<std::vector<unsigned> > &d2){
for (unsigned i = start_idx; i < start_idx+OUT_CHUNK; i++)
for (unsigned j = 0; j<N_vecs; j++)
if (d1[((i-start_idx)*N_vecs)+j] != d2[i][j]) {std::cout << "mismatch at " << i << "," << j << " was: " << d1[((i-start_idx)*N_vecs)+j] << " should be: " << d2[i][j] << std::endl; return 1;}
return 0;
}
unsigned long long get_time_usec(){
return (unsigned long long)((clock()/(float)CLOCKS_PER_SEC)*(1000000ULL));
}
int main(){
unsigned long long t1, t2;
std::vector<std::vector<unsigned> > distances;
std::vector<std::bitset<N> > bits;
for (int i = 0; i < N_vecs; i++){
std::vector<unsigned> dist_row(N_vecs, 0);
distances.push_back(dist_row);
std::bitset<N> data;
for (int j =0; j < N; j++) data[j] = rand() & 1;
bits.push_back(data);}
t1 = get_time_usec();
cpu_test(bits, distances);
t1 = get_time_usec() - t1;
unsigned *h_data = new unsigned[(N/32)*N_vecs];
memset(h_data, 0, (N/32)*N_vecs*sizeof(unsigned));
for (int i = 0; i < N_vecs; i++)
for (int j = 0; j < N; j++)
if (bits[i][j]) h_data[(i)+((j/32)*N_vecs)] |= 1U<<(31-(j&31));
unsigned *d_in, *d_out1, *d_out2, *h_out1, *h_out2;
cudaMalloc(&d_in, (N/32)*N_vecs*sizeof(unsigned));
cudaMalloc(&d_out1, N_vecs*OUT_CHUNK*sizeof(unsigned));
cudaMalloc(&d_out2, N_vecs*OUT_CHUNK*sizeof(unsigned));
cudaCheckErrors("cudaMalloc fail");
cudaStream_t stream1, stream2;
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
cudaCheckErrors("cudaStrem fail");
h_out1 = new unsigned[N_vecs*OUT_CHUNK];
h_out2 = new unsigned[N_vecs*OUT_CHUNK];
t2 = get_time_usec();
cudaMemcpy(d_in, h_data, (N/32)*N_vecs*sizeof(unsigned), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy fail");
for (int i = 0; i < N_vecs; i += 2*OUT_CHUNK){
comp_dist<<<(N_vecs + nTPB - 1)/nTPB, nTPB, 0, stream1>>>(d_in, d_out1, N_vecs, i, i+OUT_CHUNK);
cudaCheckErrors("cuda kernel loop 1 fail");
cudaStreamSynchronize(stream2);
if (i > 0) if (check_data(h_out2, i-OUT_CHUNK, distances)) return 1;
comp_dist<<<(N_vecs + nTPB - 1)/nTPB, nTPB, 0, stream2>>>(d_in, d_out2, N_vecs, i+OUT_CHUNK, i+2*OUT_CHUNK);
cudaCheckErrors("cuda kernel loop 2 fail");
cudaMemcpyAsync(h_out1, d_out1, N_vecs*OUT_CHUNK*sizeof(unsigned), cudaMemcpyDeviceToHost, stream1);
cudaMemcpyAsync(h_out2, d_out2, N_vecs*OUT_CHUNK*sizeof(unsigned), cudaMemcpyDeviceToHost, stream2);
cudaCheckErrors("cuda kernel loop 3 fail");
cudaStreamSynchronize(stream1);
if (check_data(h_out1, i, distances)) return 1;
}
cudaDeviceSynchronize();
cudaCheckErrors("cuda kernel loop 4 fail");
t2 = get_time_usec() - t2;
std::cout << "cpu time: " << ((float)t1)/(float)1000 << "ms gpu time: " << ((float) t2)/(float)1000 << "ms" << std::endl;
return 0;
}
我已在此代码中添加了 CUDA 错误检查。请务必在 Visual Studio 中构建 release 项目,而不是调试。当我在配备 Quadro1000M GPU 的 Windows 7 笔记本电脑上运行此程序时,CPU 执行大约需要 35 秒,GPU 大约需要 1.5 秒。