【发布时间】:2021-12-24 15:51:36
【问题描述】:
我正在尝试了解“cudaOccupancyMaxActiveBlocksPerMultiprocessor”方法的用法和好处。
我使用的是sample program present on NVIDIA developer forum 的略微修改版本。 基本上,我要求用户提供数组的大小。
我的 GPU: NVIDIA GeForce GTX 1070
问题:
- 程序返回的占用值非常随机。很多时候,相同的输入数组大小,程序返回不同的占用值,是不是程序有问题?
- 如截图所示,如果用户传入数组size=512,那么占用值为“13”而如果我直接在程序中设置N=512那么占用值为“47”。为什么?
- 为什么用户提供的数组 size=1024 的占用值 =0?
示例代码:
源.cpp
#include "kernel_header.cuh"
#include <algorithm>
#include <iostream>
using namespace std;
int main(int argc, char* argv[])
{
int N;
int userSize = 0;
//ask size to user
cout << "\n\nType the size of 1D Array: " << endl;
cin >> userSize;
N = userSize>0? userSize : 1024; //<<<<<<<<<<<<<<<-------PROBLEM
int* array = (int*)calloc(N, sizeof(int));
for (int i = 0; i < N; i++)
{
array[i] = i + 1;
//cout << "i = " << i << " is " << array[i]<<endl;
}
launchMyKernel(array, N);
free(array);
return 0;
}
kernel_header.cuh
#ifndef KERNELHEADER
#define KERNELHEADER
void launchMyKernel(int* array, int arrayCount);
#endif
kernel.cu
#include "stdio.h"
#include "cuda_runtime.h"
__global__ void MyKernel(int* array, int arrayCount)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < arrayCount)
{
array[idx] *= array[idx];
}
}
void launchMyKernel(int* array, int arrayCount)
{
int blockSize; // The launch configurator returned block size
int minGridSize; // The minimum grid size needed to achieve the
// maximum occupancy for a full device launch
int gridSize; // The actual grid size needed, based on input size
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize,MyKernel, 0, 0);
// Round up according to array size
gridSize = (arrayCount + blockSize - 1) / blockSize;
MyKernel << < gridSize, blockSize >> > (array, arrayCount);
cudaDeviceSynchronize();
// calculate theoretical occupancy
int maxActiveBlocks;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxActiveBlocks,
MyKernel, blockSize,
0);
int device;
cudaDeviceProp props;
cudaGetDevice(&device);
cudaGetDeviceProperties(&props, device);
float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
(float)(props.maxThreadsPerMultiProcessor /
props.warpSize);
printf("\n\nMax. Active blocks found: %d\nOur Kernel block size decided: %d\nWarp Size: %d\nNumber of threads per SM: %d\n\n\n\n", maxActiveBlocks
, blockSize,
props.warpSize,
props.maxThreadsPerMultiProcessor);
printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
blockSize, occupancy);
}
【问题讨论】: