【问题标题】:Launch Bounds not doing its job?(Launch Bounds 没有发挥作用?)
【发布时间】:2014-07-26 21:38:34
【问题描述】:

这是我在下面发布的内核的 Visual Profiler 分析结果。注意网格 (1) 和块 (1024) 的大小,以及它本应使用 64 个寄存器,实际却只使用了 43 个寄存器。我使用的是 Tesla K40c。

#include <stdio.h>
#include <stdlib.h>
#include "cuda.h"
#include "curand.h"
#include <cuda_runtime.h>
#include "math.h"
#include <curand_kernel.h>
#include <time.h>
#include <algorithm>
#include <iostream>

// Number of outer time steps the kernel simulates; also the length of each output array.
#define iterations 159744
#define transMatrixSize 2592 // Length of the transition table in floats (6*6*6*2*6). Just for clarity. Do not change. 
#define reps 1024 // Size of the shared reduction buffers; is equal to blocksize. Do not change 
#define integralStep 13125  // Number of time steps to be averaged at the tail (not referenced by the code shown here) 

/*
 * bufferleech
 *
 * Each thread advances its own chain of 24 integer site states (RU[1..24];
 * RU[0] and RU[25] are never written and stay 0) for `iterations` time steps.
 * Per the original author's note the decimal digits of a site encode
 * "[100% state][50% binary][50% state]": RU/100, (RU%100)/10 and RU%10.
 * For every site one uniform random number is drawn and compared against
 * thresholds read from transMatrix; the matching increment is added to the
 * hundreds digit (first pass) and to the low digits (second pass).
 *
 * After every time step the block sums, over all threads and sites, how many
 * sites are in state 4 or 5, and thread 0 stores that count divided by
 * 24576 (= 24 sites * 1024 threads) into the output arrays.
 *
 * Launch requirements: exactly ONE block of `reps` (1024) threads. The shared
 * buffers are indexed with the global thread index and the reduction assumes
 * `reps` is a power of two.
 *
 * Parameters:
 *   masterForces    out, `iterations` floats: per-step fraction from the hundreds digit.
 *   masterForces50  out, `iterations` floats: per-step fraction from the ones digit.
 *   transMatrix     in, read-only table laid out in groups of six floats:
 *                   (threshold, increment) x 3.
 *   rands           in, one value per thread, used as the cuRAND seed.
 *   r_max           unused; kept so existing callers keep compiling.
 */
__global__ void
__launch_bounds__(1024,1)
bufferleech(float *masterForces, float *masterForces50, const float * __restrict__ transMatrix, const float *rands, const int r_max)
{
    // Flattened thread index for a (possibly 2D) grid of (possibly 2D) blocks.
    const int globalIdx = ((blockIdx.x + (blockIdx.y * gridDim.x)) * (blockDim.x * blockDim.y)) + (threadIdx.x + (threadIdx.y * blockDim.x));

    // Per-thread generator, seeded from the host-supplied value.
    curandState s;
    curand_init(rands[globalIdx], 0, 0, &s);
    float r = 0.0f;

    // Scratch space for the block-wide sums. `volatile` is not needed because
    // every cross-thread access below is separated by __syncthreads().
    __shared__ float buffer[reps];
    __shared__ float buffer50[reps];

    int RU[26] = {0};

    // Random initial condition: each site gets its tens digit set with probability 1/2.
    for (int e = 1; e < 25; e++)
    {
        r = curand_uniform(&s);
        if (r < 0.5f)
        {
            RU[e] += 10;
        }
    }

    int index = 0;

    int RUsnapshot = 0;
    int leftsnap = 0;
    int RUsnapshot50 = 0;
    int leftsnap50 = 0;

    for (int i = 0; i < iterations; i++)
    {
        leftsnap = 0;
        leftsnap50 = 0;

        /////////////////////////////////////////XYZ: [100% state][50%Binary][50% state]//////////////////////////////////////
        for (int j = 1; j < 25; j++)
        {
            r = curand_uniform(&s);
            // Remember the pre-update hundreds digit: it is the "left neighbour" of site j+1.
            RUsnapshot = int(RU[j]/100);

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4) * dimen5) ;
            index =  ((((leftsnap * 6 + int(RU[j+1]/100)) * 6  + int(RU[j]/100)) * 2) * 6) ;

            // Branch-free selection of the first threshold that r falls below.
            RU[j]+= 100 * (( r < transMatrix[index]) * (transMatrix[index + 1]) +
                (! (r < transMatrix[index])) * ( r < transMatrix[index + 2]) * (transMatrix[index + 3]) +
                (! ( r < transMatrix[index + 2])) * (r < transMatrix[index + 4]) * (transMatrix[index + 5])) ;

            leftsnap = RUsnapshot;
            //-------------------------------------xTnC 50----------------------------
            // Same scheme for the ones digit; the same random number r is reused.
            RUsnapshot50 = (RU[j] % 10);

            //index =  ((((left[j] * dimen2 + right[j]) * dimen3  + RU[j +1 ]) * dimen4 + xTnC?) * dimen5) ;
            index =  ((((leftsnap50 * 6 + (RU[j+1] % 10)) * 6  + (RU[j] % 10) ) * 2 + int((RU[j] % 100)/10)) * 6) ;

            RU[j]+= ( r < transMatrix[index]) * (transMatrix[index + 1]) +
                (! (r < transMatrix[index])) * ( r < transMatrix[index + 2]) * (transMatrix[index + 3]) +
                (! ( r < transMatrix[index + 2])) * (r < transMatrix[index + 4]) * (transMatrix[index + 5]) ;

            leftsnap50 = RUsnapshot50;
        }

        ///////////////////////////////////////////////////////////

        // Count this thread's sites that are in state 4 or 5. Integer division
        // truncates exactly like the former int(RU[z]/100.0) but avoids doubles.
        float temp = 0.0f;
        float temp50 = 0.0f;
        for (int z = 1; z < 25; z++)
        {
            const int state100 = RU[z] / 100;
            const int state50 = RU[z] % 10;
            temp += (state100 == 4) + (state100 == 5);
            temp50 += (state50 == 4) + (state50 == 5);
        }

        buffer[globalIdx] = temp;
        buffer50[globalIdx] = temp50;

        // Pairwise tree reduction over the block. The barrier sits outside the
        // divergent `if` so every thread reaches it, and it separates each
        // level's reads from the previous level's writes.
        for (int stride = 1; stride < reps; stride <<= 1)
        {
            __syncthreads();
            if ((globalIdx & (2 * stride - 1)) == 0)
            {
                buffer[globalIdx] += buffer[globalIdx + stride];
                buffer50[globalIdx] += buffer50[globalIdx + stride];
            }
        }

        // Thread 0 performed the final addition itself, so it can read slot 0 directly.
        if (globalIdx == 0)
        {
            masterForces[i] = buffer[0] / 24576.0f;
            masterForces50[i] = buffer50[0] / 24576.0f;
        }

        // Keep fast threads from overwriting the buffers for step i+1 while
        // slower threads are still reading them for step i.
        __syncthreads();
    }
}

如何让这个内核使用 64 个寄存器? 寄存器肯定还有富余,因为我编写的一个类似内核在使用 __launch_bounds__ 后可以毫无问题地用到多达 116 个寄存器。

谢谢

如果你想自己运行,这里是 main 函数:

    // Prints the failing step and the CUDA error string, then terminates the program.
    static void checkCuda(cudaError_t err, const char *what)
    {
        if (err != cudaSuccess)
        {
            fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }
    }

    // Host driver: uploads the transition table and one random seed per thread,
    // runs bufferleech as a single block of `reps` threads on its own stream,
    // copies both per-step result arrays back and releases every resource.
    int main()
    {
        srand((unsigned)time(NULL));
        checkCuda(cudaSetDevice(0), "cudaSetDevice");

        cudaStream_t s6;
        checkCuda(cudaStreamCreate(&s6), "cudaStreamCreate");

        // Transition table consumed by the kernel in groups of six floats.
        float tm[transMatrixSize] = {0.068571, 1, 0.069143, 2.000000, 0, 0, 0, 0, 0.069143, 2.000000, 0, 0, 0.003810, 2.000000, 0.004670, -1, 0, 0, 0.003810, 2.000000, 0, 0, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.143315, 1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.074872, -1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.069143, 2.000000, 
            0, 0, 0, 0, 0.069143, 2.000000, 0, 0, 0.003810, 2.000000, 0.004670, -1, 0, 0, 0.003810, 2.000000, 0, 0, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.143315, 1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.074872, -1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 
            2.000000, 0, 0, 0.020866, 2.000000, 0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0, 0.020866, 2.000000, 
            0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 
            2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0, 
            0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.069143, 2.000000, 0, 0, 0, 0, 0.069143, 2.000000, 0, 0, 0.003810, 2.000000, 0.004670, -1, 0, 0, 0.003810, 2.000000, 0, 0, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.143315, 
            1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.074872, -1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.069143, 2.000000, 0, 0, 0, 0, 0.069143, 2.000000, 0, 0, 0.003810, 2.000000, 0.004670, -1, 0, 0, 0.003810, 2.000000, 0, 0, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.143315, 1, 0, 2.000000, 0.074743, -2.000000, 
            0, 0, 0, 2.000000, 0.074743, -2.000000, 0.074872, -1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0, 0.020866, 2.000000, 0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, 
            -2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0, 0.020866, 2.000000, 0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000, 
            0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571, 
            1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 
            0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0, 0.020866, 2.000000, 0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 
            0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0, 0.020866, 2.000000, 0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 
            0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.085714, 2.000000, 0, 0, 0, 0, 0.085714, 2.000000, 0, 0, 0.114286, 2.000000, 0.115147, -1, 0, 0, 0.114286, 2.000000, 0, 0, 0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.071084, 1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.002642, -1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 
            0, 0, 0, 0.068571, 1, 0.085714, 2.000000, 0, 0, 0, 0, 0.085714, 2.000000, 0, 0, 0.114286, 2.000000, 0.115147, -1, 0, 0, 0.114286, 2.000000, 0, 0, 0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.071084, 1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.002642, -1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 
            0.113927, 2.000000, 0, 0, 0, 0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0, 0, 0, 
            0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0, 
            0.020866, 2.000000, 0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0, 0.020866, 2.000000, 0.021727, -1, 0, 
            0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.085714, 2.000000, 0, 0, 0, 0, 0.085714, 2.000000, 0, 0, 0.114286, 2.000000, 0.115147, -1, 0, 0, 0.114286, 2.000000, 0, 0, 
            0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.071084, 1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.002642, -1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.085714, 2.000000, 0, 0, 0, 0, 0.085714, 2.000000, 0, 0, 0.114286, 2.000000, 0.115147, -1, 0, 0, 0.114286, 2.000000, 0, 0, 0, 0, 0.000021, 2.000000, 0.002513, 
            -2.000000, 0.071084, 1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.002642, -1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0, 0, 0, 0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 
            0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0, 0, 0, 0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043, 
            2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 
            0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 
            0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0, 0, 0, 0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 
            0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0, 0, 0, 0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, 
            -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.188571, 2.000000, 0, 0, 0, 0, 0.188571, 2.000000, 0, 0, 0.800000, 2.000000, 0.800861, -1, 0, 0, 0.800000, 2.000000, 0, 0, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.069013, 1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.000570, -1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 
            0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.188571, 2.000000, 0, 0, 0, 0, 0.188571, 2.000000, 0, 0, 0.800000, 2.000000, 0.800861, -1, 0, 0, 0.800000, 2.000000, 0, 0, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.069013, 1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.000570, -1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, 
            -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 
            0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0, 
            0, 0, 0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0, 0, 0, 0, 0.113927, 2.000000, 
            0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.188571, 2.000000, 0, 0, 0, 0, 0.188571, 2.000000, 0, 0, 0.800000, 2.000000, 0.800861, 
            -1, 0, 0, 0.800000, 2.000000, 0, 0, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.069013, 1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.000570, -1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.188571, 2.000000, 0, 0, 0, 0, 0.188571, 2.000000, 0, 0, 0.800000, 2.000000, 0.800861, -1, 0, 0, 0.800000, 2.000000, 
            0, 0, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.069013, 1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.000570, -1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0};

        // Host-side result buffers, one float per simulated time step.
        float *h_F6 = new float[(iterations)];
        float *h_F50_6 = new float[(iterations)];

        // One seed per thread, taken from the C library generator.
        float h_S6[reps];
        for (int i = 0; i < reps; i++)
        {
            h_S6[i] = float(rand());
        }

        // Device input vectors
        float *d_F6 = NULL;
        float *d_F50_6 = NULL;
        float *d_S6 = NULL;
        float *d_TM6 = NULL;

        // Allocate memory for each vector on GPU
        checkCuda(cudaMalloc((void**)&d_F6, iterations * sizeof(float)), "cudaMalloc d_F6");
        checkCuda(cudaMalloc((void**)&d_F50_6, iterations * sizeof(float)), "cudaMalloc d_F50_6");
        checkCuda(cudaMalloc((void**)&d_S6, reps * sizeof(float)), "cudaMalloc d_S6");
        checkCuda(cudaMalloc((void**)&d_TM6, transMatrixSize * sizeof(float)), "cudaMalloc d_TM6");

        // Copy host vectors to device
        checkCuda(cudaMemcpyAsync(d_S6, h_S6, reps * sizeof(float), cudaMemcpyHostToDevice, s6), "copy seeds to device");
        checkCuda(cudaMemcpyAsync(d_TM6, tm, transMatrixSize * sizeof(float), cudaMemcpyHostToDevice, s6), "copy transition table to device");

        bufferleech<<<1, 1024, 0, s6 >>>( d_F6 , d_F50_6 , d_TM6 , d_S6 , reps);
        // A kernel launch returns nothing; configuration errors surface here.
        checkCuda(cudaGetLastError(), "bufferleech launch");

        checkCuda(cudaMemcpyAsync(h_F6, d_F6, iterations * sizeof(float), cudaMemcpyDeviceToHost, s6), "copy masterForces to host");
        checkCuda(cudaMemcpyAsync(h_F50_6, d_F50_6, iterations * sizeof(float), cudaMemcpyDeviceToHost, s6), "copy masterForces50 to host");

        // Wait for all queued work before touching or releasing the buffers;
        // kernel execution errors are also reported at this point.
        checkCuda(cudaStreamSynchronize(s6), "cudaStreamSynchronize");

        checkCuda(cudaFree(d_F6), "cudaFree d_F6");
        checkCuda(cudaFree(d_F50_6), "cudaFree d_F50_6");
        checkCuda(cudaFree(d_S6), "cudaFree d_S6");
        checkCuda(cudaFree(d_TM6), "cudaFree d_TM6");
        checkCuda(cudaStreamDestroy(s6), "cudaStreamDestroy");

        // Allocated with new[], so they must be released with delete[].
        delete[] h_F6;
        delete[] h_F50_6;

        checkCuda(cudaDeviceReset(), "cudaDeviceReset");
        return 0;
    }

【问题讨论】:

  • 你检查过生成的代码吗?使用更多的寄存器并不意味着算法运行得更快:如果 CUDA 编译器不使用它们,则意味着它不能或不需要那么多。您的代码中有很多循环,编译器可能已经能够优化一些东西并将一些东西保留在寄存器中。针对“另一个”分析您的内核,看看您是否有收获或损失。我会更关心低占用率,也许你在共享内存上有银行冲突。使用 Nsight 进行评估。
  • 感谢您的回复。如何检查生成的代码?如果你的意思是低级代码,我不知道该怎么做?
  • IIRC 有一个 cuobjdump 实用程序可以做到这一点。

标签: c++ cuda launch cpu-registers bounds


【解决方案1】:

launch bounds (间接)设置每个线程的寄存器上限。它不会强制编译器在每个线程中使用特定数量的寄存器。

如果编译器每个线程只需要 46 个寄存器,启动边界不会强制它使用更多。

您编写了使用更多的“相似”内核这一事实并不能说明什么。内核代码中的无害更改可能会导致寄存器使用情况大不相同。

如果您希望程序运行得更快,请使用 1 个以上的线程块。这是一个比努力增加每个线程的寄存器使用量更重要的优化目标。

如果您愿意,可以使用binary utilities 检查生成的机器代码。但是,除了确认之外,这对于理解每个线程的寄存器使用没有太大的指导意义。

【讨论】:

  • 感谢您的回复。增加线程块大小不是一种选择,因为代码需要保留在一个块中,以便线程可以通过共享内存进行通信。你知道在我的内核中增加寄存器的其他方法吗?
  • 这是一个奇怪的要求。专注于优化您的代码。如果编译器找到合理的理由这样做,它将使用更多的寄存器(直到启动边界的隐含最大值)。总而言之,拒绝使用多个线程块的想法是将您的代码限制在 GPU 中可用性能的一小部分。有许多使用共享内存的代码并不局限于一个线程块。您可能需要考虑如何对算法进行分区,以使各个部分可以并行运行,即使这意味着复制共享内存数据。
猜你喜欢
  • 1970-01-01
  • 1970-01-01
  • 2017-03-01
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 2022-01-09
  • 1970-01-01
  • 1970-01-01
相关资源
最近更新 更多