【问题标题】:Cuda GPUassert: an illegal memory access was encountered(Cuda GPUassert:遇到非法内存访问)
【发布时间】:2021-03-25 09:06:15
【问题描述】:

我尝试写一个玩具程序,使用 __device__ 变量,而不是用 cudaMalloc 动态分配内存,但它一直报错 "GPUassert: an illegal memory access was encountered"(遇到非法内存访问),出错位置在倒数第三行调用 cudaDeviceSynchronize() 处。我已经试过使用 cudaMalloc 的版本,它运行正常。

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <iostream>
#include <cmath>
#include <stdio.h>
#include <stdlib.h>
// Wraps a CUDA runtime call and forwards its status together with the
// call site (file and line) to gpuAssert.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

// Reports a failed CUDA status on stderr (error string, file, line) and,
// unless `abort` is false, terminates the process using the status as the
// exit code. A cudaSuccess status is a no-op.
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code == cudaSuccess)
        return;

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort)
        exit(code);
}

// Matrix dimensions: A is M x K, B is K x N, C is M x N (see the array
// sizes declared below).
#define M 3
#define N 3
#define K 3

using namespace std;

// Statically declared arrays in device memory (__device__ qualifier),
// used here in place of buffers allocated with cudaMalloc.
__device__ double A_dev[M * K];
__device__ double B_dev[K * N];
__device__ double C_dev[M * N];

// Computes C = A * B for row-major matrices, one thread per element of C.
//
// A is m x k, B is k x n and C is m x n. In the 2D launch, thread (x, y)
// produces C[x][y], so the launch must provide at least m threads along x
// and n threads along y. Threads outside the m x n output return without
// touching memory, which lets the launch be larger than the matrix.
//
// A, B and C must be valid device addresses.
__global__ void gemm(double* A, double* B, double* C, int m, int n, int k)
{
    int x = blockDim.x * blockIdx.x + threadIdx.x;  // row of C
    int y = blockDim.y * blockIdx.y + threadIdx.y;  // column of C

    // Without this guard any surplus thread would read past A/B and write
    // past C whenever the launch does not match the matrix shape exactly.
    if (x >= m || y >= n)
        return;

    double sum = 0.0;
    for (int j = 0; j < k; j++)
    {
        sum += A[x * k + j] * B[n * j + y];
    }

    int i = x * n + y;  // flat row-major index of C[x][y]
    C[i] = sum;
    printf("The value is %f", C[i]);
}

// Host driver for the failing version: fills A and B with 0..8, copies the
// host arrays into the __device__ arrays, launches gemm on one 3x3 block,
// and copies C back to the host.
int main(void)
{
    double A_h[M * K];
    double B_h[K * N];
    double C_h[M * N];
    
    // NOTE(review): a single loop bounded by M*K indexes all three arrays;
    // this stays in range only because M, N and K are equal here.
    for (int i = 0; i < M*K; i++)
    {
        A_h[i] = (double)i;
        B_h[i] = (double)i;
        C_h[i] = 0.0;
    }

    // Copy the host data into the statically declared device arrays.
    gpuErrchk(cudaMemcpyToSymbol(A_dev, A_h, M * K * sizeof(double), 0, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpyToSymbol(B_dev, B_h, K * N * sizeof(double), 0, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpyToSymbol(C_dev, C_h, M * N * sizeof(double), 0, cudaMemcpyHostToDevice));

    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    dim3 dimGrid(1, 1, 1);
    dim3 dimBlock(3, 3, 1);
    // NOTE(review): A_dev, B_dev and C_dev are __device__ symbols named from
    // host code here, not device pointers obtained from the CUDA runtime.
    // This launch is the source of the "illegal memory access" reported by
    // the cudaDeviceSynchronize() below. The kernel should reference the
    // __device__ variables directly (or the host should obtain a pointer
    // with cudaGetSymbolAddress) instead of passing the symbols as arguments.
    gemm <<<dimGrid, dimBlock >>> (A_dev, B_dev, C_dev, 3, 3, 3);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    // Copy the result back from the device array.
    gpuErrchk(cudaMemcpyFromSymbol(C_h, C_dev, M * N * sizeof(double), 0, cudaMemcpyDeviceToHost));

    return 0;
}

【问题讨论】:

    标签: cuda gpu memory-access


    【解决方案1】:

    使用 __device__ 变量时,它们本质上具有全局作用域,因此不应把它们作为内核参数传递:在主机代码中直接使用这些符号名并不能得到有效的设备地址,这正是出现非法内存访问的原因。您可以直接在内核代码中使用这些变量,而不必为它们提供内核参数。

    如果您对代码进行以下更改,它将正常运行:

    #include <iostream>
    #include <cmath>
    #include <stdio.h>
    #include <stdlib.h>
    // Wraps a CUDA runtime call and forwards its status together with the
    // call site (file and line) to gpuAssert.
    #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

    // Reports a failed CUDA status on stderr (error string, file, line) and,
    // unless `abort` is false, terminates the process using the status as the
    // exit code. A cudaSuccess status is a no-op.
    inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
    {
        if (code == cudaSuccess)
            return;

        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort)
            exit(code);
    }
    
    // Matrix dimensions: A is M x K, B is K x N, C is M x N (see the array
    // sizes declared below).
    #define M 3
    #define N 3
    #define K 3
    
    using namespace std;
    
    // Statically declared arrays in device memory (__device__ qualifier).
    // The kernel below reads and writes them directly by name.
    __device__ double A_dev[M * K];
    __device__ double B_dev[K * N];
    __device__ double C_dev[M * N];
    
    // Computes C_dev = A_dev * B_dev for row-major matrices, one thread per
    // element of the result.
    //
    // A_dev is treated as m x k, B_dev as k x n and C_dev as m x n; the
    // caller must keep m*k, k*n and m*n within the declared array sizes.
    // In the 2D launch, thread (x, y) produces element (x, y) of the result,
    // so the launch must provide at least m threads along x and n along y.
    // Threads outside the m x n output return without touching memory.
    //
    // The matrices are the file-scope __device__ arrays and are accessed by
    // name, so no pointer arguments are needed.
    __global__ void gemm(int m, int n, int k)
    {
        int x = blockDim.x * blockIdx.x + threadIdx.x;  // row of the result
        int y = blockDim.y * blockIdx.y + threadIdx.y;  // column of the result

        // Without this guard any surplus thread would index past the arrays
        // whenever the launch does not match the matrix shape exactly.
        if (x >= m || y >= n)
            return;

        double sum = 0.0;
        for (int j = 0; j < k; j++)
        {
            sum += A_dev[x * k + j] * B_dev[n * j + y];
        }

        int i = x * n + y;  // flat row-major index of element (x, y)
        C_dev[i] = sum;
        printf("The value is %f", C_dev[i]);
    }
    
    // Host driver: fills A and B with 0, 1, 2, ..., zeroes C, copies all three
    // into the __device__ arrays, launches gemm with one thread per element of
    // the M x N result, and copies the result back into C_h.
    // Every CUDA call is checked through gpuErrchk. Returns 0 on success.
    int main(void)
    {
        double A_h[M * K];
        double B_h[K * N];
        double C_h[M * N];

        // Each array is initialised over its own extent. A single loop
        // bounded by M*K would run past B_h or C_h as soon as the three
        // dimensions stop being equal.
        for (int i = 0; i < M * K; i++)
        {
            A_h[i] = (double)i;
        }
        for (int i = 0; i < K * N; i++)
        {
            B_h[i] = (double)i;
        }
        for (int i = 0; i < M * N; i++)
        {
            C_h[i] = 0.0;
        }

        // Copy the host data into the statically declared device arrays.
        gpuErrchk(cudaMemcpyToSymbol(A_dev, A_h, M * K * sizeof(double), 0, cudaMemcpyHostToDevice));
        gpuErrchk(cudaMemcpyToSymbol(B_dev, B_h, K * N * sizeof(double), 0, cudaMemcpyHostToDevice));
        gpuErrchk(cudaMemcpyToSymbol(C_dev, C_h, M * N * sizeof(double), 0, cudaMemcpyHostToDevice));

        // Surface any pending error before launching the kernel.
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

        // One block with one thread per output element: x spans the M rows
        // and y spans the N columns. The dimensions are taken from the
        // macros so the launch stays in step with the array sizes.
        dim3 dimGrid(1, 1, 1);
        dim3 dimBlock(M, N, 1);
        gemm <<<dimGrid, dimBlock >>> (M, N, K);
        gpuErrchk(cudaPeekAtLastError());   // launch-configuration errors
        gpuErrchk(cudaDeviceSynchronize()); // errors raised while the kernel ran

        // Copy the result back from the device array.
        gpuErrchk(cudaMemcpyFromSymbol(C_h, C_dev, M * N * sizeof(double), 0, cudaMemcpyDeviceToHost));

        return 0;
    }
    

    【讨论】:

    • 它现在按预期工作。谢谢罗伯特的帮助!!
    猜你喜欢
    • 2020-12-27
    • 2015-05-02
    • 2022-01-18
    • 2015-07-23
    • 2021-08-12
    • 1970-01-01
    • 2021-11-22
    • 2015-06-28
    • 2017-01-29
    相关资源
    最近更新 更多