【发布时间】:2014-07-26 21:38:34
【问题描述】:
下面是我贴出的内核在 NVIDIA Visual Profiler 中的分析结果。请注意网格大小 (1) 和线程块大小 (1024),以及它本可以使用 64 个寄存器,实际却只使用了 43 个。我使用的是 Tesla K40c。
#include <stdio.h>
#include <stdlib.h>
#include "cuda.h"
#include "curand.h"
#include <cuda_runtime.h>
#include "math.h"
#include <curand_kernel.h>
#include <time.h>
#include <algorithm>
#include <iostream>
#define iterations 159744
#define transMatrixSize 2592 // Just for clarity. Do not change.
#define reps 1024 // Is equal to blocksize. Do not change
#define integralStep 13125 // Number of time steps to be averaged at the tail
// Evaluates one 6-float row of the transition table for the random draw r.
// The row is laid out as (threshold0, delta0, threshold1, delta1, threshold2, delta2);
// the returned value is the delta selected by comparing r against the thresholds,
// or 0 when no threshold is exceeded. The arithmetic (branch-free, summed left to
// right) is the same expression the kernel previously spelled out inline twice.
static __device__ __forceinline__ float transitionDelta(const float * __restrict__ row, const float r)
{
    const bool below0 = r < row[0];
    const bool below1 = r < row[2];
    const bool below2 = r < row[4];
    return below0 * row[1] +
           (!below0) * below1 * row[3] +
           (!below1) * below2 * row[5];
}

// Monte-Carlo kernel: every thread advances its own chain of 24 sites (RU[1..24],
// with RU[0] and RU[25] kept at 0 as boundaries) for `iterations` steps. After each
// step the block sums, over all threads, how many sites are in state 4 or 5 and
// stores the normalised totals in masterForces[i] / masterForces50[i].
//
// Each RU entry packs three decimal fields (see the "XYZ" note below): the hundreds
// digit is the "100%" state, the tens digit the binary flag and the ones digit the
// "50%" state.
//
// Launch requirements:
//   * one block of `reps` (1024) threads; the normalisation constant 24576 = 24 * 1024
//     assumes exactly that, and with more than one block every block would write the
//     same masterForces[i] element;
//   * rands must hold one seed per launched thread;
//   * transMatrix must hold transMatrixSize floats;
//   * masterForces / masterForces50 must hold `iterations` floats each.
// Static shared memory: 2 * reps floats.
// r_max is accepted for interface compatibility but is not used.
// Note: __launch_bounds__(1024,1) only caps the registers the compiler may use so
// that a 1024-thread block can launch; it does not force the kernel to use more.
__global__ void
__launch_bounds__(1024,1)
bufferleech(float *masterForces, float *masterForces50, const float * __restrict__ transMatrix, const float *rands, const int r_max)
{
    // Block-local linear index: used for shared memory so it can never exceed the
    // block size, unlike the global index.
    const int tid = threadIdx.x + (threadIdx.y * blockDim.x);
    const int threadsPerBlock = blockDim.x * blockDim.y;
    const int globalIdx = ((blockIdx.x + (blockIdx.y * gridDim.x)) * threadsPerBlock) + tid;

    curandState s;
    curand_init(rands[globalIdx], 0, 0, &s);

    __shared__ float buffer[reps];
    __shared__ float buffer50[reps];

    // Random initial condition: each interior site gets its tens digit set with
    // probability one half.
    int RU[26] = {0};
    for (int e = 1; e < 25; e++)
    {
        if (curand_uniform(&s) < 0.5f)
        {
            RU[e] += 10;
        }
    }

    for (int i = 0; i < iterations; i++)
    {
        int leftsnap = 0;    // pre-update "100%" state of the left neighbour
        int leftsnap50 = 0;  // pre-update "50%" state of the left neighbour

        /////////////////////////////////////////XYZ: [100% state][50%Binary][50% state]//////////////////////////////////////
        for (int j = 1; j < 25; j++)
        {
            // The same draw drives both the "100%" and the "50%" update of site j.
            const float r = curand_uniform(&s);

            const int RUsnapshot = RU[j] / 100;
            // index = ((((left * 6 + right) * 6 + self) * 2 + binary(=0)) * 6)
            int index = ((((leftsnap * 6 + (RU[j + 1] / 100)) * 6 + (RU[j] / 100)) * 2) * 6);
            RU[j] += 100 * transitionDelta(transMatrix + index, r);
            leftsnap = RUsnapshot;

            //-------------------------------------xTnC 50----------------------------
            const int RUsnapshot50 = RU[j] % 10;
            // index = ((((left * 6 + right) * 6 + self) * 2 + binary) * 6)
            index = ((((leftsnap50 * 6 + (RU[j + 1] % 10)) * 6 + (RU[j] % 10)) * 2 + ((RU[j] % 100) / 10)) * 6);
            RU[j] += transitionDelta(transMatrix + index, r);
            leftsnap50 = RUsnapshot50;
        }

        // Per-thread count of sites whose state is 4 or 5, for each of the two fields.
        float temp = 0.0f;
        float temp50 = 0.0f;
        for (int z = 1; z < 25; z++)
        {
            const int state = RU[z] / 100;
            const int state50 = RU[z] % 10;
            temp += (state == 4) + (state == 5);
            temp50 += (state50 == 4) + (state50 == 5);
        }

        buffer[tid] = temp;
        buffer50[tid] = temp50;
        __syncthreads();

        // Block-wide tree reduction. A barrier after every step guarantees that the
        // partial sums read in the next step have actually been written; integer
        // strides replace int(powf(2, b)), which is not guaranteed to be exact.
        // The barrier is outside the conditional so every thread reaches it.
        for (int stride = reps / 2; stride > 0; stride >>= 1)
        {
            if (tid < stride && (tid + stride) < threadsPerBlock)
            {
                buffer[tid] += buffer[tid + stride];
                buffer50[tid] += buffer50[tid + stride];
            }
            __syncthreads();
        }

        // Only thread 0 touches element 0 from here until the next reduction, so no
        // further barrier is needed before the next iteration refills the buffers.
        if (tid == 0)
        {
            masterForces[i] = buffer[0] / 24576.0f;     // 24 sites * 1024 threads
            masterForces50[i] = buffer50[0] / 24576.0f;
        }
    }
}
如何让这个内核使用 64 个寄存器?寄存器肯定还有富余,因为我写的另一个类似内核在使用 __launch_bounds__ 限定后,可以毫无问题地用到多达 116 个寄存器。
谢谢
如果你想自己运行,这里是 main 函数:
// Aborts with a readable message when a CUDA runtime call fails.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)
#endif

// Host driver: uploads the transition table and one random seed per thread,
// launches bufferleech as a single block of `reps` threads on its own stream,
// copies both result vectors back and releases every resource it acquired.
// Returns 0 on success; exits with EXIT_FAILURE on the first CUDA error.
int main()
{
    srand((unsigned)time(NULL));
    CUDA_CHECK(cudaSetDevice(0));
    cudaStream_t s6;
    CUDA_CHECK(cudaStreamCreate(&s6));
    // Transition table: rows of 6 floats (threshold, delta) x 3, indexed by the kernel.
    static const float tm[transMatrixSize] = {0.068571, 1, 0.069143, 2.000000, 0, 0, 0, 0, 0.069143, 2.000000, 0, 0, 0.003810, 2.000000, 0.004670, -1, 0, 0, 0.003810, 2.000000, 0, 0, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.143315, 1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.074872, -1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.069143, 2.000000,
    0, 0, 0, 0, 0.069143, 2.000000, 0, 0, 0.003810, 2.000000, 0.004670, -1, 0, 0, 0.003810, 2.000000, 0, 0, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.143315, 1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.074872, -1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701,
    2.000000, 0, 0, 0.020866, 2.000000, 0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0, 0.020866, 2.000000,
    0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205,
    2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0,
    0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.069143, 2.000000, 0, 0, 0, 0, 0.069143, 2.000000, 0, 0, 0.003810, 2.000000, 0.004670, -1, 0, 0, 0.003810, 2.000000, 0, 0, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.143315,
    1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.074872, -1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.069143, 2.000000, 0, 0, 0, 0, 0.069143, 2.000000, 0, 0, 0.003810, 2.000000, 0.004670, -1, 0, 0, 0.003810, 2.000000, 0, 0, 0, 0, 0, 2.000000, 0.074743, -2.000000, 0.143315, 1, 0, 2.000000, 0.074743, -2.000000,
    0, 0, 0, 2.000000, 0.074743, -2.000000, 0.074872, -1, 0, 2.000000, 0.074743, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0, 0.020866, 2.000000, 0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649,
    -2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0, 0.020866, 2.000000, 0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000,
    0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571,
    1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0,
    0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0, 0.020866, 2.000000, 0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0,
    0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0, 0.020866, 2.000000, 0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1,
    0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.085714, 2.000000, 0, 0, 0, 0, 0.085714, 2.000000, 0, 0, 0.114286, 2.000000, 0.115147, -1, 0, 0, 0.114286, 2.000000, 0, 0, 0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.071084, 1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.002642, -1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0,
    0, 0, 0, 0.068571, 1, 0.085714, 2.000000, 0, 0, 0, 0, 0.085714, 2.000000, 0, 0, 0.114286, 2.000000, 0.115147, -1, 0, 0, 0.114286, 2.000000, 0, 0, 0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.071084, 1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.002642, -1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1,
    0.113927, 2.000000, 0, 0, 0, 0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0, 0, 0,
    0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0,
    0.020866, 2.000000, 0.021727, -1, 0, 0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.071701, 2.000000, 0, 0, 0, 0, 0.071701, 2.000000, 0, 0, 0.020866, 2.000000, 0.021727, -1, 0,
    0, 0.020866, 2.000000, 0, 0, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.082221, 1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.000003, 2.000000, 0.013649, -2.000000, 0.013778, -1, 0.000003, 2.000000, 0.013649, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.085714, 2.000000, 0, 0, 0, 0, 0.085714, 2.000000, 0, 0, 0.114286, 2.000000, 0.115147, -1, 0, 0, 0.114286, 2.000000, 0, 0,
    0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.071084, 1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.002642, -1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.085714, 2.000000, 0, 0, 0, 0, 0.085714, 2.000000, 0, 0, 0.114286, 2.000000, 0.115147, -1, 0, 0, 0.114286, 2.000000, 0, 0, 0, 0, 0.000021, 2.000000, 0.002513,
    -2.000000, 0.071084, 1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.000021, 2.000000, 0.002513, -2.000000, 0.002642, -1, 0.000021, 2.000000, 0.002513, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0, 0, 0, 0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000,
    0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0, 0, 0, 0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043,
    2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1,
    0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0,
    0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0, 0, 0, 0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000,
    0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0, 0, 0, 0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660,
    -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.188571, 2.000000, 0, 0, 0, 0, 0.188571, 2.000000, 0, 0, 0.800000, 2.000000, 0.800861, -1, 0, 0, 0.800000, 2.000000, 0, 0, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.069013, 1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.000570, -1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000,
    0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.188571, 2.000000, 0, 0, 0, 0, 0.188571, 2.000000, 0, 0, 0.800000, 2.000000, 0.800861, -1, 0, 0, 0.800000, 2.000000, 0, 0, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.069013, 1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.000570, -1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088,
    -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0,
    0.068571, 1, 0.076852, 2.000000, 0, 0, 0, 0, 0.076852, 2.000000, 0, 0, 0.055205, 2.000000, 0.056066, -1, 0, 0, 0.055205, 2.000000, 0, 0, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.073735, 1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.000006, 2.000000, 0.005164, -2.000000, 0.005293, -1, 0.000006, 2.000000, 0.005164, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0,
    0, 0, 0, 0.113927, 2.000000, 0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.113927, 2.000000, 0, 0, 0, 0, 0.113927, 2.000000,
    0, 0, 0.302372, 2.000000, 0.303233, -1, 0, 0, 0.302372, 2.000000, 0, 0, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.069556, 1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.000043, 2.000000, 0.000984, -2.000000, 0.001113, -1, 0.000043, 2.000000, 0.000984, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.188571, 2.000000, 0, 0, 0, 0, 0.188571, 2.000000, 0, 0, 0.800000, 2.000000, 0.800861,
    -1, 0, 0, 0.800000, 2.000000, 0, 0, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.069013, 1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.000570, -1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0, 0.068571, 1, 0.188571, 2.000000, 0, 0, 0, 0, 0.188571, 2.000000, 0, 0, 0.800000, 2.000000, 0.800861, -1, 0, 0, 0.800000, 2.000000,
    0, 0, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.069013, 1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.000085, 2.000000, 0.000441, -2.000000, 0.000570, -1, 0.000085, 2.000000, 0.000441, -2.000000, 0, 0, 0.068571, 1, 0.068660, -2.000000, 0, 0, 0, 0, 0.068660, -2.000000, 0, 0, 0.000088, -2.000000, 0.000218, -1, 0, 0, 0.000088, -2.000000, 0, 0, 0, 0};

    // Host result buffers and per-thread seeds for curand_init.
    float *h_F6 = new float[(iterations)];
    float *h_F50_6 = new float[(iterations)];
    float h_S6[reps];
    for (int i = 0; i < reps; i++)
    {
        h_S6[i] = float(rand());
    }

    // Device input vectors
    float *d_F6 = NULL;
    float *d_F50_6 = NULL;
    float *d_S6 = NULL;
    float *d_TM6 = NULL;

    // Allocate memory for each vector on GPU
    CUDA_CHECK(cudaMalloc((void**)&d_F6, iterations * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_F50_6, iterations * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_S6, reps * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_TM6, transMatrixSize * sizeof(float)));

    // Copy host vectors to device
    CUDA_CHECK(cudaMemcpyAsync(d_S6, h_S6, reps * sizeof(float), cudaMemcpyHostToDevice, s6));
    CUDA_CHECK(cudaMemcpyAsync(d_TM6, tm, transMatrixSize * sizeof(float), cudaMemcpyHostToDevice, s6));

    // One block of 1024 threads, as the kernel requires.
    bufferleech<<<1, 1024, 0, s6>>>(d_F6, d_F50_6, d_TM6, d_S6, reps);
    CUDA_CHECK(cudaGetLastError());  // launch-configuration errors surface here

    CUDA_CHECK(cudaMemcpyAsync(h_F6, d_F6, iterations * sizeof(float), cudaMemcpyDeviceToHost, s6));
    CUDA_CHECK(cudaMemcpyAsync(h_F50_6, d_F50_6, iterations * sizeof(float), cudaMemcpyDeviceToHost, s6));

    // Wait for the kernel and both copies before the host buffers are released;
    // this is also where asynchronous kernel faults are reported.
    CUDA_CHECK(cudaStreamSynchronize(s6));

    CUDA_CHECK(cudaFree(d_F6));
    CUDA_CHECK(cudaFree(d_F50_6));
    CUDA_CHECK(cudaFree(d_S6));
    CUDA_CHECK(cudaFree(d_TM6));
    CUDA_CHECK(cudaStreamDestroy(s6));

    // Arrays allocated with new[] must be released with delete[].
    delete[] h_F6;
    delete[] h_F50_6;

    CUDA_CHECK(cudaDeviceReset());
    return 0;
}
【问题讨论】:
-
你检查过生成的代码吗?使用更多的寄存器并不意味着算法运行得更快:如果 CUDA 编译器没有用到它们,说明它无法利用或根本不需要那么多寄存器。您的代码中有很多循环,编译器可能已经做了一些优化,并把部分数据保留在寄存器中。请把这个内核和您提到的“另一个”内核放在一起做性能分析,看看到底是变快了还是变慢了。我会更关心占用率偏低的问题,另外共享内存上也可能存在存储体冲突 (bank conflict)。可以使用 Nsight 进行评估。
-
感谢您的回复。如何检查生成的代码?如果你的意思是低级代码,我不知道该怎么做?
-
如果我没记错 (IIRC),有一个 cuobjdump 实用程序可以查看生成的代码(例如 cuobjdump -sass 可以输出 SASS 汇编)。
标签: c++ cuda launch cpu-registers bounds