循环中的 PyOpenCl 内核使 GPU 崩溃答案

【问题标题】：PyOpenCl Kernel in Loop Crashes GPU循环中的 PyOpenCl 内核使 GPU 崩溃
【发布时间】：2017-03-01 20:25:55
【问题描述】：

我正在编写一个使用 pypopencl 蛮力的邻居查找例程。稍后它将适合我的平滑粒子水力代码。蛮力当然不是有效的，但它很简单，也是一个起点。我一直在测试我的查找内核，我发现当我在循环中运行它时它会崩溃。我在 python 中没有收到任何错误消息，但屏幕闪烁，然后返回显示图形驱动程序失败但已恢复的注释。奇怪的是，如果搜索的粒子数量很小（~1000 或更少），它就可以了。如果我增加计数（~10k）它会崩溃。我尝试添加障碍和等待命令以及完成命令，但无济于事。我检查了我是否有一个数组溢出，但我找不到它。我包括了相关代码，并为它的大小预先道歉，但我想把它全部公开，以便人们可以查看它。我希望有人可以运行它并重新创建错误，或者告诉我哪里出错了。我的设置是使用 spyder 的 python 3.5 并安装了 pyopencl 2016.1。

谢谢，

赛斯

首先是主文件

import numpy as np
import gpuParameters as gpuParameters
import pyopencl as cl
import pyopencl.array as ar
from BruteForceSearch import BruteForceSearch
import time as time

dim = 3  # dimensions of the problem
n = 15000  # number of particles
nbs = 50   # number of neighbors
x = np.random.rand(n)  # randomly choose some x
y = np.random.rand(n)  # randomly choose some y
z = np.random.rand(n)  # randomly choose some z
h = np.ones(n)  # smoothing parameter for the b spline

# setup gpu context
gpu = gpuParameters.gpuParameters()
# neighbor list
nlist = -1*np.ones(n*nbs, dtype=np.int32)

# data to gpu
xg = ar.to_device(gpu.queue, x)  # x pos on gpu
yg = ar.to_device(gpu.queue, y)  # y pos on gpu
zg = ar.to_device(gpu.queue, z)  # z pos on gpu
hg = ar.to_device(gpu.queue, h)  # h pos on gpu
num_p = ar.to_device(gpu.queue, np.array(n, dtype=np.int32))  # num of particles
nb = ar.to_device(gpu.queue, np.array(nbs, dtype=np.int32))  # num of neighbors
nlst = ar.to_device(gpu.queue, nlist)  # neighbor list on gpu
dg = ar.to_device(gpu.queue, np.array(dim, dtype=np.int32))  # dimension on gpu
out = ar.zeros(gpu.queue, n, np.float64)  # debug parameter

# call the Brute force neighbor search and h parameter set class
srch = BruteForceSearch(gpu)  # instatiate
s = time.time()  # timer start
for ii in range(100):
    # set a marker I really didn't think this would be necessary
    mark = cl.enqueue_marker(gpu.queue)  # set a marker for kernel complete
    srch.search.search(gpu.queue, x.shape, None,
                       num_p.data, nb.data, dg.data, xg.data, yg.data, zg.data,
                       hg.data, nlst.data, out.data)  # run the kernel
    cl.Event.wait(mark)  # wait for complete run of kernel before next iteration
    # gpu.queue.finish()
    print('iteration: ', ii)  # print iteration time to show me its running
e = time.time()  # end the timer
cs = time.time()  # clock the time it takes to return the array
nlist = nlst.get()
ce = time.time()
# output the times
print('time to calculate: ', e-s)
print('time to copy back: ', ce - cs)

GPU 上下文类

import pyopencl as cl

class gpuParameters:
    def __init__(self, dType  = []):
        #will setup the proper context based on given device preference
        #if no device perference given will default to first value
        if dType == []:
            pltfrms = cl.get_platforms()[0]
            devices = pltfrms.get_devices(cl.device_type.GPU)
            context = cl.Context(devices) #create a device context
        print(context)
        print(devices)
        self.cntxt = context#keep this context in motion
        self.queue = cl.CommandQueue(self.cntxt) #create a command que for this context
        self.mF = cl.mem_flags

邻居循环

import numpy as np
import pyopencl as cl
import gpu_sph_assistance_functions as gsaf

class BruteForceSearch:
    def __init__(self, gpu):
        # instantiation of the search routine primarilly for pre compiling of
        #  the function
        self.gpu = gpu  # save the gpu context

        # setup and compile the search
        self.bruteSearch()

def bruteSearch(self):
    W = gsaf.gpu_sph_kernel()
    self.search = cl.Program(
        self.gpu.cntxt,
        W + '''__kernel void search(__global int *nP, __global int *nN,
                                __global int *dim,
                            __global double *x, __global double *y,
                            __global double *z, __global double *h,
                            __global int *nlist, __global double *out)
        {
            // indices
            int gid = get_global_id(0);  // current particle
            int idv = 0;  // unrolled array id
            int count = 0;  // count
            int dm = *dim;  // problem dimension
            int itr = 0;  // start iteration
            int mxitr = 25;  // max number of iterations
            // calculate variables
            double dms = 1.0/(*dim);  // 1 over dimension for pow
            double xi = x[gid];  // current x position
            double yi = y[gid];  // current y position
            double zi = z[gid];  // current z position
            double dx = 0;  // difference in x
            double dy = 0;  // difference in y
            double dz = 0; // difference in z
            double r = 0;  // radius
            double hg = h[gid];  // smoothing parametre
            double Wsum = 0; // sum of weights
            double W = 0;  // current weight
            double dwdx = 0;  // derivative of weight in x direction
            double dwdy = 0;  // derivative of weight in y direction
            double dwdz = 0;  // derivative of weight in z direction
            double dwdr = 0;  // derivative of weight in r direction
            double V = 0;  // Volume of particle
            double hn = 0;  // holding value for comparison
            double err = 10;  // error
            double tol = 1e-7; // tolerance
            double diff = 0;  // difference

            // first clean the array of neighbors
            for (int ii = 0; ii < *nN; ii++)  // length of num of neighbors
            {
                idv = *nN*gid + ii;  // unrolled index
                nlist[idv] = -1; // this is a trigger for excluding values
            }
            // Next calculate the h parameter
           while (err > tol)
            {
                Wsum = 0; // clean summation
                for (int jj = 0; jj < *nP; jj++)  // loop over all particles
                {
                    dx = xi - x[jj];
                    dy = yi - y[jj];
                    dz = zi - z[jj];
                    // spline for weights
                    quintic_spline(dm, hg, dx, dy, dz, &W,
                                   &dwdx, &dwdy, &dwdz, &dwdr);
                    Wsum += W;  // add to store
                 }
                V = 1.0/Wsum;  /// volume
                hn = pow(V, dms);  // new h parameter
                diff = hn - hg;  // difference
                err = fabs(diff);  // error
                out[gid] = err;  // store error for debug
                hg = hn; // reset h
                itr ++;  // update iter
                if (itr > mxitr)  // break out
                {   break; }
            }
           h[gid] = hg;  // store h

            /*  // get all neighbors in vicinity of particle not
             // currently assessed
            for(int ii = 0; ii < *nP; ii++)
            {
                dx = xi - x[ii];
                dy = yi - y[ii];
                dz = zi - z[ii];
                r = sqrt(dx*dx + dy*dy + dz*dz);
                if (r < 3.25*hg & count < *nN)
                {
                    idv = *nN*gid + count;
                    nlist[idv] = ii;
                    count++;
                }
            }
            */

    }
        ''').build()

用于加权的样条函数

W = '''void quintic_spline(
        int dim, double h, double dx, double dy, double dz, double *W,
        double *dWdx, double *dWdy, double *dWdz, double *dWdrO)
        {
            double pi = 3.141592654; // pi
            double m3q = 0; // prefix values
            double m2q = 0; // prefix values
            double m1q = 0; // prefix values
            double T1 = 0; // prefix values
            double T2 = 0; // prefix values
            double T3 = 0; // prefix values
            double D1 = 0; // prefix values
            double D2 = 0; // prefix values
            double D3 = 0; // prefix values
            double Ch = 0; // normalizing parameter for kernel
            double C = 0; // normalizing prior to h
            double r = sqrt(dx*dx + dy*dy + dz*dz);
            double q = r/h; // normalized radius
            double dqdr = 1.0/h; // intermediate derivative
            double dWdq = 0; // intermediate derivative
            double dWdr = 0; // intermediate derivative
            double drdx = dx/r; // intermediate derivative
            double drdy = dy/r; // intermediate derivative
            double drdz = dz/r; // intermediate derivative
            if (dim == 1)
            {
                C = 1.0/120.0;
            }
            else if (dim == 2)
            {
                C = 7.0/(pi*478.0);
            }
            else if (dim == 3)
            {
                C = 1.0/(120.0*pi);
            }
            Ch = C/pow(h, dim);
            if (r <= 0)
            {
                drdx = 0.0;
                drdy = 0.0;
                drdz = 0.0;
            }

            // local prefix constants
            m1q = 1.0 - q;
            m2q = 2.0 - q;
            m3q = 3.0 - q;

            // smoothing parameter constants
            T1 = Ch*pow(m3q, 5);
            T2 = -6.0*Ch*pow(m2q, 5);
            T3 = 15.0*Ch*pow(m1q, 5);

            //derivative of spline coefficients
            D1 = -5.0*Ch*pow(m3q,4);
            D2 = 30.0*Ch*pow(m2q,4);
            D3 = -75.0*Ch*pow(m1q,4);

            // W calculation
            if (q < 1.0)
            {
                *W = T1  + T2 + T3;
                dWdq = D1 + D2 + D3;
            }
            else if (q >= 1.0 && q < 2.0)
            {
                *W = T1  + T2;
                dWdq = D1 + D2;
            }
            else if (q >= 2.0 && q < 3.0)
            {
                *W = T1;
                dWdq = D1;
            }
            else
            {
                *W = 0.0;
                dWdq = 0.0;
            }
            dWdr = dWdq*dqdr;
            // assign the derivatives
            *dWdx = dWdr*drdx;
            *dWdy =  dWdr*drdy;
            *dWdz =  dWdr*drdz;
            *dWdrO = dWdr;
        }'''

【问题讨论】：

标签： python-3.x pyopencl

【解决方案1】：

我在采用 AMD 加速并行处理的 Intel i7-4790K CPU 上测试了代码。它不会在 n=150000 时崩溃（我只运行一次迭代）。我在快速查看代码时发现的唯一奇怪的事情是内核正在对数组 h 进行读写。这应该不是问题，但我通常还是尽量避免这种情况。

【讨论】：

我将返回并尝试将其作为两个单独的数组，一个用于输入，一个用于输出。可能会解决问题。感谢您的关注。