【发布时间】:2022-01-18 21:32:14
【问题描述】:
我对 cuda 还很陌生,我想使用常量内存的概念,但是在运行代码时遇到了非法内存访问。
我的内核是这样的
__global__ void nonceKernel(int inLen, int shaTermLength, BYTE* outSha1, BYTE* outNonce, int nonceLen, int* finishedFlag, int *mutex, int size)
{
if(!*finishedFlag) return;
unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
bool found = true;
BYTE tempNonce[2];
BYTE tempSha1[20];
tempNonce[1]=((tid+size) >> 8) & 0x000000FF;
tempNonce[0]=(tid+size) & 0x000000FF;
CUDA_SHA1 ctx;
cuda_sha1_init(&ctx); //init context
cuda_sha1_update(&ctx, device_input_data, inLen); // add input buffer
cuda_sha1_update(&ctx, tempNonce, nonceLen); //add nonce
cuda_sha1_final(&ctx, tempSha1); //compute sha1
for(int i=0; i<shaTermLength; i++) {
if(tempSha1[19 - i] != device_sha1_term[shaTermLength - 1 - i])
found=false;
}
if(found == true) {
lock(mutex);
memcpy(outSha1, tempSha1, 20); //20 bytes for sha1
memcpy(outNonce, tempNonce, nonceLen); //2 bytes for nonce
*finishedFlag = 0;
unlock(mutex);
}
}
我的中介功能是这样的:
cudaError_t nonceWithCuda(int intlen, int shaTermLength, BYTE* outSha1, BYTE* outNonce, int *finishFlag, int nonceLen, int size)
{
BYTE *gpuSha1Out;
BYTE *gpuNonceOut;
int *gpuFinishedFlag;
cudaError_t cudaStatus;
int *mutex;
cudaStatus= cudaSetDevice(0);
if(cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a cuda gpu installed?");
goto Error;
}
....
cudaStatus=cudaMalloc((void**)&gpuFinishedFlag, 1*sizeof(int));
if(cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc for gpuFinishedFlag failed");
goto Error;
}
cudaStatus=cudaMemcpy(gpuFinishedFlag, finishFlag, sizeof(int), cudaMemcpyHostToDevice);
if(cudaStatus!=cudaSuccess) {
fprintf(stderr, "cudamemcpy 0 to gpuFinishedFlag failed!");
goto Error;
}
....
while(*finishFlag) {
nonceKernel<<<128, 1024>>>(intlen, shaTermLength, gpuSha1Out, gpuNonceOut, nonceLen, gpuFinishedFlag, mutex, size);
size++;
cudaStatus=cudaMemcpy(finishFlag, gpuFinishedFlag, sizeof(int), cudaMemcpyDeviceToHost);
if(cudaStatus!=cudaSuccess) {
fprintf(stderr, "cudaMemcpy from gpuFinishedFlag failed, with code: %s!", cudaGetErrorString(cudaStatus));
goto Error;
}
}
......
Error:
cudaFree(gpuSha1Out);
cudaFree(gpuNonceOut);
cudaFree(gpuFinishedFlag);
return cudaStatus;
}
我也这样声明常量变量:
__constant__ BYTE* device_input_data;
__constant__ BYTE* device_sha1_term;
其中 BYTE 定义为无符号字符 typedef unsigned char BYTE;。
最后是 main 函数。
int main(int argc, char** argv) {
size_t input_block_size=5; //bytes
int nonceLen=2;
int finishedFlag=1;
...
BYTE* inputData = (BYTE*) malloc(input_block_size * sizeof(BYTE)); //input byte buffer
inputData[0]=0x23; //#
inputData[1]=0x30; //0
inputData[2]=0x42; //B
inputData[3]=0x69; //i
inputData[4]=0x61; //a
BYTE* shaTerm = (BYTE*) malloc(nonceLen * sizeof(BYTE));
shaTerm[0]=0x7E;
shaTerm[1]=0x46;
int shaTermLength = sizeof(shaTerm)/sizeof(shaTerm[0]);//ouput sha buffer
cudaStatus=cudaMemcpyToSymbol(device_input_data, inputData, input_block_size * sizeof(BYTE), 0, cudaMemcpyHostToDevice);
fprintf(stderr, "MemcpyToSymbol: %s\n", cudaGetErrorString(cudaStatus));
cudaStatus=cudaMemcpyToSymbol(device_sha1_term, shaTerm, shaTermLength * sizeof(BYTE), 0, cudaMemcpyHostToDevice);
fprintf(stderr, "MemcpyToSymbol: %s\n", cudaGetErrorString(cudaStatus));
...
nonceWithCuda(input_block_size, shaTermLength, outputSha1Buffer, outputNonceBuffer, &finishedFlag, 2, size);
错误发生在nonceWithCuda函数的while中,当我将值从gpu复制回主机时,我的意思是这段代码:
cudaStatus=cudaMemcpy(finishFlag, gpuFinishedFlag, sizeof(int), cudaMemcpyDeviceToHost);
if(cudaStatus!=cudaSuccess) {
fprintf(stderr, "cudaMemcpy from gpuFinishedFlag failed, with code: %s!", cudaGetErrorString(cudaStatus));
goto Error;
}
输出:
$ ./nonce_v3
MemcpyToSymbol: no error
MemcpyToSymbol: no error
cudaMemcpy from gpuFinishedFlag failed, with code: an illegal memory access was encountered!
请注意,当我没有对这两个变量使用常量并且无法理解原因时,相同的代码可以正常工作。有人能指出我正确的方向吗?
感谢您的帮助!!!
【问题讨论】:
-
device_input_data被声明为指针。但是,您不会复制指向它的指针,而是复制一个数组。