cudaFree - 无效的设备指针错误答案

【问题标题】：cudaFree - Invalid device pointer errorcudaFree - 无效的设备指针错误
【发布时间】：2014-10-22 22:52:48
【问题描述】：

我正在尝试取消分配我在 CUDA + OpenGL 互操作代码中分配的设备内存 dev_inp。经过错误检查，我收到了Invalid Device Pointer 错误，并且程序在我的renderScene() 函数结束时的cudaFree(dev_inp); 调用处停止执行。一切都很好，但我担心内存泄漏。

问题：

一个。为什么我无法释放已分配的本地设备内存？我从像素缓冲区对象中取消映射cuda_resource，并取消注册资源。

来自 CUDA C 编程指南中的 B.17 节：

Memory allocated via malloc() cannot be freed using the runtime (i.e. by calling any of the free memory functions from Sections 3.2.2).

所以，这让我想到另外两个问题：

b.我在内核中没有malloced 内存，因为我没有。那么，利用cudaFree 函数应该（技术上？）在这里工作对吗？是否由程序员解除分配给本地定义的指针的内存，或者 nvcc 编译器是否在程序退出或超出本地范围时处理解除分配？我不希望我的代码中出现内存泄漏，因此通过处理释放我之前分配的内存我感觉更安全。

c。在 renderScene() 函数的末尾调用 cudaDeviceReset() 是否谨慎，以便销毁主要的 CUDA 上下文（以及它的变量和指针，根据 CUDA C 编程指南）？我看到 NVidia Visual Profiler 文档也提到了这一点：cudaDeviceReset() 当我调用它时，渲染似乎比平时慢。如果我可以在这里简单地cudaFree 内存，那就太好了，但我似乎无法让它工作。

完整代码：

#define GET_PROC_ADDRESS( str ) wglGetProcAddress( str )

GLuint tex; 
GLuint pbo;
struct cudaGraphicsResource *cuda_resource;    

PFNGLBINDBUFFERARBPROC    glBindBuffer     = NULL;
PFNGLDELETEBUFFERSARBPROC glDeleteBuffers  = NULL;
PFNGLGENBUFFERSARBPROC    glGenBuffers     = NULL;
PFNGLBUFFERDATAARBPROC    glBufferData     = NULL;

// ==========================================================================================
// CUDA ERROR CHECKING CODE
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) getchar();
   }
}

// ==========================================================================================

void initCUDADevice() { 

    gpuErrchk(cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() ));    

}

// ==========================================================================================

void changeSize(int w, int h) {

    //cudaDeviceReset();
    //initCUDADevice();

    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
    glLoadIdentity();

    // Prevent a divide by zero, when window is too short
    // (you cant make a window of zero width).
    if (h == 0)
        h = 1;

    float ratio =  w * 1.0 / h;

    // Use the Projection Matrix
    glMatrixMode(GL_PROJECTION);

    // Reset Matrix
    //glLoadIdentity();

    //// Set the viewport to be the entire window
    glViewport(0, 0, w, h);

    //// Get Back to the Modelview
    glMatrixMode(GL_MODELVIEW);
}

// ==========================================================================================

void renderScene(void) {

    // Clear Color and Depth Buffers
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
    // Reset transformations
    glLoadIdentity();

    // ====================================================================================
    // initiate GPU by setting it correctly 
    //initCUDADevice(); 

    // ====================================================================================
    // read the image that needs to be textured 

    Mat image, flipped;
    image = imread("K:/Ultrasound experiment images/PA_175.png", CV_LOAD_IMAGE_GRAYSCALE);   // Read the file from disk

    if(!image.data)                              // Check for invalid input
    {
        cout <<  "Could not open or find the image" << std::endl ;


    }

    cv::flip(image, flipped, 0);

    imshow("OpenCV - image", image);    // displays output

    // ====================================================================================
    // allocate the PBO, texture, and CUDA resource

    glBindBuffer    = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
    glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
    glGenBuffers    = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
    glBufferData    = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData");

    // ====================================================================================
    // generate the pixel buffer object (PBO)

    // Generate a buffer ID called a PBO (Pixel Buffer Object)
    glGenBuffers(1, &pbo);

    // Make this the current UNPACK buffer (OpenGL is state-based)
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);

    // Allocate data for the buffer. 4-channel 8-bit image
    glBufferData(GL_PIXEL_UNPACK_BUFFER, sizeof(unsigned char) * flipped.rows * flipped.cols, NULL, GL_STREAM_DRAW);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

    gpuErrchk(cudaGraphicsGLRegisterBuffer(&cuda_resource, pbo, cudaGraphicsMapFlagsNone)); 

    // ====================================================================================
    // create the texture object 

    // enable 2D texturing
    glEnable(GL_TEXTURE_2D);

    // generate and bind the texture    
    glGenTextures(1, &tex);
    glBindTexture(GL_TEXTURE_2D, tex);

    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);

    // put flipped.data at the end for cpu rendering 
    glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE,  image.cols, image.rows,  0, GL_LUMINANCE, GL_UNSIGNED_BYTE, 0 );

    // put tex at the end for cpu rendering 
    glBindTexture(GL_TEXTURE_2D, 0);

    // ====================================================================================
    // copy OpenCV flipped image data into the device pointer

    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);

    unsigned char *dev_inp; 

    gpuErrchk( cudaMalloc((void**)&dev_inp, sizeof(unsigned char)*flipped.rows*flipped.cols) );

    gpuErrchk( cudaGraphicsMapResources(1, &cuda_resource, 0) );

    size_t size; 
    gpuErrchk( cudaGraphicsResourceGetMappedPointer((void **)&dev_inp, &size, cuda_resource) );

    gpuErrchk( cudaMemcpy(dev_inp, flipped.data, sizeof(unsigned char)*flipped.rows*flipped.cols, cudaMemcpyHostToDevice) );

    gpuErrchk( cudaGraphicsUnmapResources(1, &cuda_resource, 0) ); 

    // ====================================================================================
    // bind pbo and texture to render data now 

    glBindBuffer( GL_PIXEL_UNPACK_BUFFER, pbo);
    //
    glBindTexture(GL_TEXTURE_2D, tex);

    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, flipped.cols, flipped.rows, GL_LUMINANCE, GL_UNSIGNED_BYTE, NULL);

    gpuErrchk( cudaGraphicsUnregisterResource(cuda_resource));
    gpuErrchk( cudaThreadSynchronize());

    //gpuErrchk(cudaFree(dev_inp));

    // ====================================================================================
    // map the texture coords to the vertex coords 

    glBegin(GL_QUADS);
    // Front Face
    glTexCoord2f(0.0f, 0.0f); glVertex3f(-1.0f, -1.0f,  1.0f);  // Bottom Left Of The Texture and Quad
    glTexCoord2f(1.0f, 0.0f); glVertex3f( 1.0f, -1.0f,  1.0f);  // Bottom Right Of The Texture and Quad
    glTexCoord2f(1.0f, 1.0f); glVertex3f( 1.0f,  1.0f,  1.0f);  // Top Right Of The Texture and Quad
    glTexCoord2f(0.0f, 1.0f); glVertex3f(-1.0f,  1.0f,  1.0f);  // Top Left Of The Texture and Quad

    glEnd();

    glFlush();  // force rendering

    glDisable(GL_TEXTURE_2D);

    //glutSwapBuffers();
    gpuErrchk(cudaFree(dev_inp));        // <--- Error here
    //cudaGraphicsUnregisterResource(cuda_resource);

}


// ==========================================================================================


int main(int argc, char **argv) {


    // init GLUT and create window
    glutInit(&argc, argv);
    glutInitDisplayMode(GLUT_DEPTH | GLUT_RGB );
    glutInitWindowPosition(100,100);
    glutInitWindowSize(1024,256);
    glutCreateWindow("CUDA + OpenGL interop");


    // register callbacks
    glutDisplayFunc(renderScene);
    glutReshapeFunc(changeSize);
    //glutIdleFunc(renderScene);

    // enter GLUT event processing cycle
    glutMainLoop();

    return 1;
}

【问题讨论】：

标签： opengl cuda

【解决方案1】：

此行不是必需的，应从您的代码中删除：

gpuErrchk( cudaMalloc((void**)&dev_inp, sizeof(unsigned char)*flipped.rows*flipped.cols) );

这一行创建一个设备分配，并将该分配的指针分配给dev_inp。

问题出现在这里：

gpuErrchk( cudaGraphicsResourceGetMappedPointer((void **)&dev_inp, &size, cuda_resource) );

此行获取一个 new 指针，该指针源自 cuda_resource 对象，指向另一个不同的分配，并将该指针放入 dev_inp，覆盖您以前的分配的指针（来自cudaMalloc）。在这一行中获取的新指针已经有一个底层设备分配。此时您不需要单独/额外分配它。

此时，如果你尝试释放dev_inp：

gpuErrchk(cudaFree(dev_inp));        // <--- Error here

您正在尝试释放您的程序未明确分配的数据（通过cudaMalloc），并且是持久性（此时）cuda_resource 对象的必要组件。你不想那样做。不幸的是，放置在dev_inp 中的原始指针现在丢失（被覆盖），因此无法在程序中“释放”它，只要程序正在执行，就会发生内存泄漏。

解决方案是不执行额外的、不需要的分配：

gpuErrchk( cudaMalloc((void**)&dev_inp, sizeof(unsigned char)*flipped.rows*flipped.cols) );

这意味着相应的cudaFree 操作也应该被消除：

gpuErrchk(cudaFree(dev_inp));        // <--- Error here

在程序实际退出之前，我不会在 CUDA 代码的任何地方使用cudaDeviceReset，尤其是 CUDA/OpenGL 代码。在其他一些非常特殊的情况下，您可能希望在实际打算退出程序之前使用cudaDeviceReset，但它们不适用于此处。

【讨论】：

我明白了，现在说得通了，谢谢！但是，如果cudaGraphicsResourceGetMappedPointer() 根据文档here 返回指向 cuda_resource 的指针，我可以释放该内存吗？调用cudaGraphicsUnregisterResource()会释放内存吗？
您不想释放内存。 underlying 资源是一个 OpenGL 资源（在这种情况下），您不想在 CUDA 代码中释放它。它应该由 OpenGL 管理。研究cudaGraphicsMapResources 的定义和行为具体来说，由于指针（由cudaGraphicsResourceGetMappedPointer 放置在dev_inp 中）不是通过调用cudaMalloc 分配的，因此您不能通过以下方式释放它致电cudaFree。
哦，对了！ OpenGL 是实际内存的“所有者”，CUDA 只是共享对该内存位置开始处存在的数据的访问。感谢您清除它！