CUDA cudaMemcpy，遇到非法内存访问答案

【问题标题】：CUDA cudaMemcpy, an illegal memory access was encountered（CUDA cudaMemcpy，遇到非法内存访问）
【发布时间】：2022-01-18 21:32:14
【问题描述】：

我对 cuda 还很陌生，我想使用常量内存的概念，但是在运行代码时遇到了非法内存访问。

我的内核是这样的

/*
 * Tries one candidate nonce per thread.
 *
 * Each thread builds a 2-byte nonce from (global thread index + size), hashes
 * device_input_data (inLen bytes) followed by the nonce, and compares the last
 * shaTermLength bytes of the digest with device_sha1_term. On a full match the
 * digest and nonce are copied to outSha1 / outNonce and *finishedFlag is cleared.
 *
 *   inLen          number of bytes of device_input_data fed to the hash
 *   shaTermLength  number of trailing digest bytes that must equal device_sha1_term
 *   outSha1        receives the 20-byte digest of the matching candidate
 *   outNonce       receives nonceLen bytes of the matching nonce
 *   nonceLen       nonce byte count; the local nonce buffer only holds 2 bytes
 *   finishedFlag   non-zero while searching; written to 0 when a match is stored
 *   mutex          passed to lock()/unlock() around the result write
 *                  (presumably a mutual-exclusion helper defined elsewhere)
 *   size           offset added to the thread index to form the candidate value
 */
__global__ void nonceKernel(int inLen, int shaTermLength, BYTE* outSha1, BYTE* outNonce, int nonceLen, int* finishedFlag, int *mutex, int size)
{
        // A cleared flag means a match was already recorded; skip all work.
        if (*finishedFlag == 0) {
                return;
        }

        // Candidate value for this thread: flat 1D global index shifted by size.
        const unsigned int candidate = (blockDim.x * blockIdx.x + threadIdx.x) + size;

        // Low byte first, then the next-higher byte.
        BYTE nonceBytes[2];
        nonceBytes[0] = candidate & 0x000000FF;
        nonceBytes[1] = (candidate >> 8) & 0x000000FF;

        // SHA-1 over the input buffer followed by the nonce.
        BYTE digest[20];
        CUDA_SHA1 sha;
        cuda_sha1_init(&sha);
        cuda_sha1_update(&sha, device_input_data, inLen);
        cuda_sha1_update(&sha, nonceBytes, nonceLen);
        cuda_sha1_final(&sha, digest);

        // Compare the digest tail with the wanted term, walking both from the end.
        // Every position is inspected; there is deliberately no early exit.
        bool matches = true;
        int offset = 0;
        while (offset < shaTermLength) {
                if (digest[19 - offset] != device_sha1_term[shaTermLength - 1 - offset]) {
                        matches = false;
                }
                ++offset;
        }

        if (!matches) {
                return;
        }

        // Publish the result under the lock, then signal completion.
        lock(mutex);
        memcpy(outSha1, digest, 20);             // 20 bytes for sha1
        memcpy(outNonce, nonceBytes, nonceLen);  // 2 bytes for nonce
        *finishedFlag = 0;
        unlock(mutex);
}

我的中间函数（主机端的包装函数）是这样的：

/*
 * Host-side driver for nonceKernel.
 *
 * Selects device 0, creates a device copy of the finished flag, then launches
 * nonceKernel repeatedly (128 blocks x 1024 threads) until the flag read back
 * from the device becomes 0. Returns the last cudaError_t that was recorded.
 *
 *   intlen, shaTermLength, nonceLen  forwarded unchanged to the kernel
 *   outSha1, outNonce                host result buffers; not touched in the visible
 *                                    code (presumably filled in an elided section)
 *   finishFlag                       host flag; uploaded once, refreshed from the
 *                                    device after every launch, and used as the
 *                                    loop condition
 *   size                             forwarded to the kernel and incremented after
 *                                    each launch
 *
 * NOTE(review): the "...." lines are elisions in the original snippet. The
 * allocations of gpuSha1Out, gpuNonceOut and mutex are presumably among them.
 */
cudaError_t nonceWithCuda(int intlen, int shaTermLength, BYTE* outSha1, BYTE* outNonce, int *finishFlag, int nonceLen, int size)
{
        BYTE *gpuSha1Out;
        BYTE *gpuNonceOut;
        int *gpuFinishedFlag;
        cudaError_t cudaStatus;
        int *mutex;

        cudaStatus= cudaSetDevice(0);
        if(cudaStatus != cudaSuccess) {
                fprintf(stderr, "cudaSetDevice failed! Do you have a cuda gpu installed?");
                // NOTE(review): jumping to Error here reaches the cudaFree calls while
                // the three device pointers above are still uninitialized.
                goto Error;
        }
        ....
        // Device-side copy of the single int flag.
        cudaStatus=cudaMalloc((void**)&gpuFinishedFlag, 1*sizeof(int));
        if(cudaStatus != cudaSuccess) {
                fprintf(stderr, "cudaMalloc for gpuFinishedFlag failed");
                goto Error;

        }

        // Seed the device flag with the caller's current value.
        cudaStatus=cudaMemcpy(gpuFinishedFlag, finishFlag, sizeof(int), cudaMemcpyHostToDevice);
        if(cudaStatus!=cudaSuccess) {
                fprintf(stderr, "cudamemcpy 0 to gpuFinishedFlag failed!");
                goto Error;
        }
        ....
        // Keep launching until the flag copied back from the device is 0.
        while(*finishFlag) {
                // The launch itself returns no status. A fault raised inside the kernel
                // is reported by the next blocking call, which is the cudaMemcpy below.
                nonceKernel<<<128, 1024>>>(intlen, shaTermLength, gpuSha1Out, gpuNonceOut, nonceLen, gpuFinishedFlag, mutex, size);
                size++;
                // Blocking copy: waits for the kernel, then refreshes the host flag.
                cudaStatus=cudaMemcpy(finishFlag, gpuFinishedFlag, sizeof(int), cudaMemcpyDeviceToHost);
                if(cudaStatus!=cudaSuccess) {
                        fprintf(stderr, "cudaMemcpy from gpuFinishedFlag failed, with code: %s!", cudaGetErrorString(cudaStatus));
                        goto Error;
                }
        }
        ......

// Common exit path: releases the device buffers and returns the last status.
// NOTE(review): mutex is not freed here; confirm whether an elided section owns it.
Error:
        cudaFree(gpuSha1Out);
        cudaFree(gpuNonceOut);
        cudaFree(gpuFinishedFlag);

        return cudaStatus;
}

我也这样声明常量变量：

// NOTE(review): each symbol below is a single pointer stored in constant memory,
// not a byte buffer. Copying raw bytes into it with cudaMemcpyToSymbol overwrites
// the pointer value itself. An array declaration such as
// `__constant__ BYTE device_input_data[5];` reserves storage for the bytes instead.
__constant__ BYTE* device_input_data;
__constant__ BYTE* device_sha1_term;

其中 BYTE 被定义为无符号字符类型：typedef unsigned char BYTE;

最后是 main 函数。

// Entry point (snippet; the "..." lines are elisions in the original post).
// Builds a 5-byte input buffer and a 2-byte SHA-1 term on the host, uploads both
// to the constant-memory symbols, then calls nonceWithCuda.
int main(int argc, char** argv) {

        size_t input_block_size=5; //bytes
        int nonceLen=2;
        int finishedFlag=1; // non-zero: the search loop in nonceWithCuda keeps running

        ...

        BYTE* inputData = (BYTE*) malloc(input_block_size * sizeof(BYTE)); //input byte buffer
        inputData[0]=0x23; //#
        inputData[1]=0x30; //0
        inputData[2]=0x42; //B
        inputData[3]=0x69; //i
        inputData[4]=0x61; //a
        // Bytes the tail of the digest is compared against in the kernel.
        BYTE* shaTerm = (BYTE*) malloc(nonceLen * sizeof(BYTE));
        shaTerm[0]=0x7E;
        shaTerm[1]=0x46;
        // NOTE(review): shaTerm is a BYTE*, so sizeof(shaTerm) is the size of a pointer,
        // not the nonceLen bytes allocated above.
        int shaTermLength = sizeof(shaTerm)/sizeof(shaTerm[0]);//ouput sha buffer
        // Copies input_block_size raw bytes into the symbol device_input_data.
        cudaStatus=cudaMemcpyToSymbol(device_input_data, inputData, input_block_size * sizeof(BYTE), 0, cudaMemcpyHostToDevice);
        fprintf(stderr, "MemcpyToSymbol: %s\n", cudaGetErrorString(cudaStatus));
        // Copies shaTermLength raw bytes into the symbol device_sha1_term.
        cudaStatus=cudaMemcpyToSymbol(device_sha1_term, shaTerm, shaTermLength * sizeof(BYTE), 0, cudaMemcpyHostToDevice);
        fprintf(stderr, "MemcpyToSymbol: %s\n", cudaGetErrorString(cudaStatus));
        ...
        // The cudaError_t returned by nonceWithCuda is not checked here.
        nonceWithCuda(input_block_size, shaTermLength, outputSha1Buffer, outputNonceBuffer, &finishedFlag, 2, size);

错误发生在 nonceWithCuda 函数的 while 循环中，也就是把值从 GPU 复制回主机的时候，即下面这段代码：

cudaStatus=cudaMemcpy(finishFlag, gpuFinishedFlag, sizeof(int), cudaMemcpyDeviceToHost);
if(cudaStatus!=cudaSuccess) {
       fprintf(stderr, "cudaMemcpy from gpuFinishedFlag failed, with code: %s!", cudaGetErrorString(cudaStatus));
       goto Error;
}

输出：

$ ./nonce_v3
MemcpyToSymbol: no error
MemcpyToSymbol: no error
cudaMemcpy from gpuFinishedFlag failed, with code: an illegal memory access was encountered!

请注意，当我不对这两个变量使用常量内存时，相同的代码可以正常工作，而我无法理解其中的原因。有人能给我指出正确的方向吗？

感谢您的帮助！！！

【问题讨论】：

device_input_data 被声明为指针。但是，您复制给它的并不是一个指针，而是一个数组。

标签： c++ c cuda

【解决方案1】：

我假设您想将inputData 的 5 个元素存储在常量内存中。

__constant__ BYTE* device_input_data; 这一行只会保留用于存储单个指针的常量内存，而不会为 5 个 BYTE 值保留常量内存。

然后，通过下面这个调用：

cudaMemcpyToSymbol(device_input_data, inputData, input_block_size * sizeof(BYTE), 0, cudaMemcpyHostToDevice);

被改写的是这个指针自身保存的地址：它被设置成了 inputData 的元素，也就是说，传输之后指针的值可能是 0x2330426961000000。这很可能不是设备内存中的有效地址，因此在内核中尝试访问该内存位置时，就会出现所观察到的内存错误。

要解决此问题，您需要将常量内存声明为大小为 5 的 BYTE 数组。

__constant__ BYTE device_input_data[5];

【讨论】：

非常感谢！除了您建议的修改之外，我还做了一些改动：我把 device_sha1_term 也改成了 __constant__ BYTE device_sha1_term[2];，并把 BYTE* shaTerm = (BYTE*) malloc(nonceLen * sizeof(BYTE)); shaTerm[0]=0x7E; shaTerm[1]=0x46; 改成了 BYTE shaTerm[]={0x7e, 0x46};。之前运行代码时会报错，做了上述更改之后代码工作正常，非常感谢！