【发布时间】:2017-08-08 06:31:33
【问题描述】:
我对自己编写的 CUDA 代码的行为感到困惑。我正在为一个名为DimmedGridGPU 的类中的__device__ 函数编写测试。此类以int DIM 为模板,我遇到问题的函数旨在返回最接近输入值x 的点的网格值。我有这个内核命名空间用于单元测试,单独调用每个__device__ 函数。
此代码的期望行为是从do_get_value(x, grid_) 调用返回值3.0,并将d_target[0] 设置为此值,然后将其传输回主机端以进行单元测试断言。整个内核似乎运行正常,但是当我最终传输回主机端时,我收到cudaErrorInvalidValue 错误,我不明白为什么。
这是代码的最小示例,保留了类的结构及其特征:
#include <cuda_runtime.h>
#include <fstream>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: \"%s\": %s %s %d\n", cudaGetErrorName(code), cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
template <int DIM>
class DimmedGridGPU{
public:
size_t grid_size_;//total size of grid
int b_derivatives_;//if derivatives are going to be used
int b_interpolate_;//if interpolation should be used on the grid
double* grid_;//the grid values
double* grid_deriv_;//derivatives
double dx_[DIM];//grid spacing
double min_[DIM];//grid minimum
double max_[DIM];//maximum
int grid_number_[DIM];//number of points on grid
int b_periodic_[DIM];//if a dimension is periodic
int* d_b_interpolate_;
int* d_b_derivatives_;
DimmedGridGPU(const double* min,
const double* max,
const double* bin_spacing,
const int* b_periodic,
int b_derivatives,
int b_interpolate) : b_derivatives_(b_derivatives), b_interpolate_(b_interpolate), grid_(NULL), grid_deriv_(NULL){
size_t i;
for(i = 0; i < DIM; i++) {
min_[i] = min[i];
max_[i] = max[i];
b_periodic_[i] = b_periodic[i];
grid_number_[i] = (int) ceil((max_[i] - min_[i]) / bin_spacing[i]);
dx_[i] = (max_[i] - min_[i]) / grid_number_[i];
//add one to grid points if
grid_number_[i] = b_periodic_[i] ? grid_number_[i] : grid_number_[i] + 1;
//increment dx to compensate
if(!b_periodic_[i])
max_[i] += dx_[i];
}
grid_size_ = 1;
for(i = 0; i < DIM; i++)
grid_size_ *= grid_number_[i];
gpuErrchk(cudaMallocManaged(&grid_, grid_size_ * sizeof(double)));
if(b_derivatives_) {
gpuErrchk(cudaMallocManaged(&grid_deriv_, DIM * grid_size_ * sizeof(double)));
if(!grid_deriv_) {
printf("Out of memory!! gpugrid.cuh:initialize");
}
}
gpuErrchk(cudaMalloc((void**)&d_b_interpolate_, sizeof(int)));
gpuErrchk(cudaMemcpy(d_b_interpolate_, &b_interpolate, sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMalloc((void**)&d_b_derivatives_, sizeof(int)));
gpuErrchk(cudaMemcpy(d_b_derivatives_, &b_derivatives, sizeof(int), cudaMemcpyHostToDevice));
}
~DimmedGridGPU(){
gpuErrchk(cudaDeviceSynchronize());
if(grid_ != NULL){
gpuErrchk(cudaFree(grid_));
grid_ = NULL;//need to do this so DimmedGrid's destructor functions properly
}
if(grid_deriv_ != NULL){
gpuErrchk(cudaFree(grid_deriv_));
grid_deriv_ = NULL;
}
gpuErrchk(cudaDeviceReset());
}
//gets the value of the grid closest to x
__host__ __device__ double do_get_value( double* x, double* grid_) {
size_t index[DIM];
get_index(x, index);
printf("do_get_value was called on the GPU!, and index[0] is now %d\n", index[0]);
printf("but multi2one(index) gives us %d\n", multi2one(index));
double value = grid_[multi2one(index)];
printf("and value to be returned is %f\n", value);
return value;
}
//gets grid's 1D index from an array of coordinates
__host__ __device__ void get_index(const double* x, size_t result[DIM]) const {
size_t i;
double xi;
printf("get_index was called on the GPU in %i dimension(s)\n", DIM);
for(i = 0; i < DIM; i++) {
xi = x[i];
printf("xi is now %f, min_[i] is %f and dx_[i] is %f\n",xi, min_[i], dx_[i]);
if(b_periodic_[i]){
xi -= (max_[i] - min_[i]) * gpu_int_floor((xi - min_[i]) / (max_[i] - min_[i]));
}
result[i] = (size_t) floor((xi - min_[i]) / dx_[i]);
}
}
//takes a multidimensional index to a 1D index
__host__ __device__ size_t multi2one(const size_t index[DIM]) const {
size_t result = index[DIM-1];
size_t i;
for(i = DIM - 1; i > 0; i--) {
result = result * grid_number_[i-1] + index[i-1];
}
return result;
}
};
__host__ __device__ int gpu_int_floor(double number) {
return (int) number < 0.0 ? -ceil(fabs(number)) : floor(number);
}
namespace kernels{
template <int DIM>
__global__ void get_value_kernel(double* x, double* target_arr, double* grid_, DimmedGridGPU<DIM> g){
target_arr[0] = g.do_get_value(x, grid_);
printf("get_value_kernel has set target[0] to be %f\n", target_arr[0]);//check if the value is set correctly
return;
}
}
int main(){
using namespace kernels;
double min[] = {0};
double max[] = {10};
double bin_spacing[] = {1};
int periodic[] = {0};
DimmedGridGPU<1> g (min, max, bin_spacing, periodic, 0, 0);
for(int i = 0; i < 11; i++){
g.grid_[i] = i;
printf("g.grid_[%d] is now %f\n", i, g.grid_[i]);
}
gpuErrchk(cudaDeviceSynchronize());
double x[] = {3.5};
double* d_x;
gpuErrchk(cudaMalloc(&d_x, sizeof(double)));
gpuErrchk(cudaMemcpy(d_x, x, sizeof(double), cudaMemcpyHostToDevice));
double target[] = {5.0};
double* d_target;
gpuErrchk(cudaMalloc((void**)&d_target, sizeof(double)));
gpuErrchk(cudaMemcpy(d_target, target, sizeof(double), cudaMemcpyHostToDevice));
gpuErrchk(cudaDeviceSynchronize());
get_value_kernel<1><<<1,1>>>(d_x, d_target, g.grid_, g);
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(target, d_target, sizeof(double), cudaMemcpyDeviceToHost));
printf("and after GPU stuff, target[0] is now %f\n", target[0]);
return(0);
}
那么,为什么这行(最后一个cudaMemcpy)会抛出错误“CudaErrorInvalidValue”,而我包含的打印语句清楚地表明设备上正在使用正确的值,并且返回的值由do_get_value(x, grid_) 调用正确吗?
我已经尝试过使用cudaMemcpyFromSymbol,我认为也许赋值是创建一个符号,而不是以某种方式传递和更改一个值,但事实并非如此,因为d_target 不是一个有效的符号。
这是我的代码的示例输出:
g.grid_[0] is now 0.000000
g.grid_[1] is now 1.000000
g.grid_[2] is now 2.000000
g.grid_[3] is now 3.000000
g.grid_[4] is now 4.000000
g.grid_[5] is now 5.000000
g.grid_[6] is now 6.000000
g.grid_[7] is now 7.000000
g.grid_[8] is now 8.000000
g.grid_[9] is now 9.000000
g.grid_[10] is now 10.000000
get_index was called on the GPU in 1 dimension(s)
xi is now 3.500000, min_[i] is 0.000000 and dx_[i] is 1.000000
do_get_value was called on the GPU!, and index[0] is now 3
but multi2one(index) gives us 3
and value to be returned is 3.000000
get_value_kernel has set target[0] to be 3.000000
GPUassert: "cudaErrorInvalidValue": invalid argument gpugrid.cu 166
【问题讨论】:
标签: c++ class templates cuda gpu