CUDA-流 - 爱码网

页锁定内存：

cudaHostAlloc()分配页锁定内存,页锁定内存也称为固定内存或不可分页内存，它有一个重要的属性：操作系统将不会对这块内存分页并交换到磁盘上，从而确保了该内存始终驻留在物理内存中。

流：

cuda流用于任务的并行。任务并行性是指并行执行两个或多个不同的任务，而不是在大量数据上执行同一个任务的数据并行性。比如处理同一副图，你用一个流处理左边半张图片，再用第二个流处理右边半张图片，这两个流中的代码同时执行，加快了处理速度。

示例：

#include <stdio.h>
#include <cuda_runtime.h>

#define N (1024*1024)
#define DATA_SIZE (N*20)

__global__ void add(int *a,int *b,int *c){
 int idx=threadIdx.x+blockIdx.x*blockDim.x;
 if(idx<N){
  int idx1=(idx+1)%256;
  int idx2=(idx+2)%256;
  float as=(a[idx]+a[idx1]+a[idx2])/3.0f;
  float bs=(b[idx]+b[idx1]+b[idx2])/3.0f;
  c[idx]=(as+bs)/2;
 }
}

int main(){

 cudaDeviceProp prop;
 int whichDevice;
 cudaGetDevice(&whichDevice);
 cudaGetDeviceProperties(&prop,whichDevice);
 if(!prop.deviceOverlap){
   printf("Device not overlap....");
   return 0;
 }

 int *a,*b,*c;
 int *a1,*b1,*c1;
 int *a_host,*b_host,*c_host;
 cudaEvent_t start,end;
 float elapsedTime;
 cudaEventCreate(&start);
 cudaEventCreate(&end);
 cudaEventRecord(start,0);

 cudaStream_t stream0,stream1;
 cudaStreamCreate(&stream0);
cudaStreamCreate(&stream1);

 cudaMalloc((void **)&a,N*sizeof(int));
 cudaMalloc((void **)&b,N*sizeof(int));
 cudaMalloc((void **)&c,N*sizeof(int));

 cudaMalloc((void **)&a1,N*sizeof(int));
 cudaMalloc((void **)&b1,N*sizeof(int));
 cudaMalloc((void **)&c1,N*sizeof(int));


 cudaHostAlloc((void **)&a_host,DATA_SIZE*sizeof(int),cudaHostAllocDefault);
 cudaHostAlloc((void **)&b_host,DATA_SIZE*sizeof(int),cudaHostAllocDefault);
 cudaHostAlloc((void **)&c_host,DATA_SIZE*sizeof(int),cudaHostAllocDefault);

 for(int i=0;i<DATA_SIZE;i++){
  a_host[i]=i;
  b_host[i]=i;
 }

 for(int i=0;i<DATA_SIZE;i+=N*2){
  cudaMemcpyAsync(a,a_host+i,N*sizeof(int),cudaMemcpyHostToDevice,stream0);
  cudaMemcpyAsync(a1,a_host+i+N,N*sizeof(int),cudaMemcpyHostToDevice,stream1);


  cudaMemcpyAsync(b,b_host+i,N*sizeof(int),cudaMemcpyHostToDevice,stream0);
  cudaMemcpyAsync(b1,b_host+i+N,N*sizeof(int),cudaMemcpyHostToDevice,stream1);

  add<<<N/256,256,0,stream0>>>(a,b,c);
  add<<<N/256,256,0,stream1>>>(a1,b1,c1);

  cudaMemcpyAsync(c_host+i,c,N*sizeof(int),cudaMemcpyDeviceToHost,stream0);
  cudaMemcpyAsync(c_host+i+N,c,N*sizeof(int),cudaMemcpyDeviceToHost,stream1);

 }

 cudaStreamSynchronize(stream0);
 cudaStreamSynchronize(stream1);
 cudaEventRecord(end,0);
 cudaEventSynchronize(end);

 cudaEventElapsedTime(&elapsedTime,start,end);
 printf("tie===%3.1f ms\n",elapsedTime);

 cudaFreeHost(a_host);
 cudaFreeHost(b_host);
 cudaFreeHost(c_host);
 cudaFree(a);
 cudaFree(b);
 cudaFree(c);
 cudaFree(a1);
 cudaFree(b1);
 cudaFree(c1);

 cudaStreamDestroy(stream0);
 cudaStreamDestroy(stream1);

return 0;
}

注：硬件在处理内存复制和核函数执行时分别采用了不同的引擎，因此我们需要知道，将操作放入流中队列中的顺序将影响着CUDA驱动程序调度这些操作以及执行的方式。因此在将操作放入流的队列时应该采用宽度优先方式，而非深度优先方式。

CUDA-流

参考：《GPU高性能编程CUDA实战》