【问题标题】:Recipe to copy 1D strided data with cudaMemcpy2D(使用 cudaMemcpy2D 复制一维跨步数据的方法)
【发布时间】:2019-10-29 00:57:13
【问题描述】:

如果一个设备内存有两个连续范围,则可以使用 cudaMemcpy 将内存从一个范围复制到另一个范围。

   double* source = ...
   double* dest = ...
   cudaMemcpy(dest, source, N, cudaMemcpyDeviceToDevice);

现在假设我想将 source 复制到 dest,但源和目标的步幅不同:源每 2 个元素取一个,目标每 3 个元素写一个。也就是 dest[0] = source[0], dest[3] = source[2], dest[6] = source[4], ...。 当然,普通的 cudaMemcpy 无法做到这一点。

直观地说,cudaMemcpy2D 应该能够完成这项工作,因为“跨步元素可以看作是更大数组中的一列”。 但是 cudaMemcpy2D 有很多输入参数,在这种情况下很难理解它们的含义,例如 pitch。

例如,我设法用 cudaMemcpy2D 重现了两个步幅都是 1 的情况:

    cudaMemcpy2D(dest, 1, source, 1, 1, n*sizeof(T), cudaMemcpyDeviceToHost);

但我无法弄清楚一般情况,即 dest_stride 和 source_stride 不等于 1 的情况。

有没有办法用 cudaMemcpy2D 将跨步数据复制到跨步数据? 我应该按什么顺序填入已知的布局信息(即两个步幅和 sizeof(T))?

    cudaMemcpy2D(dest, ??, source, ???, ????, ????, cudaMemcpyDeviceToHost);

【问题讨论】:

    标签: arrays multidimensional-array cuda stride


    【解决方案1】:

    是的,这可以做到。用代码比文字更容易说明:

    #include <iostream>
    
    // Demonstrates a strided copy with cudaMemcpy2D: the source is treated as a
    // 4x2 array of int and the destination as a 4x3 array of int, and only the
    // first column (one int per row) of the first three rows is copied.
    // Returns 0 on success and 1 if any CUDA runtime call fails.
    int main()
    {
        // Source layout: rows of 2 ints, so consecutive rows are spitch bytes apart.
        const size_t swidth = 2;
        const size_t sheight = 4;
        const size_t spitch = swidth * sizeof(int);
        int source[swidth * sheight] = { 0, 1, 2, 3, 4, 5, 6, 7 };

        // Destination layout: rows of 3 ints, pre-filled with -1 so that the
        // elements written by the copy are easy to spot in the output.
        const size_t dwidth = 3;
        const size_t dheight = 4;
        const size_t dpitch = dwidth * sizeof(int);
        int dest[dwidth * dheight] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };

        // Copy region: one int wide (width is given in bytes), three rows high.
        const size_t cwidth = 1 * sizeof(int);
        const size_t cheight = 3;

        // Reports a failed CUDA runtime call on stderr; true means the call succeeded.
        auto ok = [](cudaError_t err, const char* what) -> bool {
            if (err != cudaSuccess) {
                std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
                return false;
            }
            return true;
        };

        int* source_d = nullptr;
        if (!ok(cudaMalloc(&source_d, spitch * sheight), "cudaMalloc")) return 1;

        // Upload the whole source contiguously, then do the strided copy back
        // to the host: row r of the region lands at byte offset r * dpitch in dest.
        bool success =
            ok(cudaMemcpy(source_d, &source[0], spitch * sheight, cudaMemcpyHostToDevice), "cudaMemcpy") &&
            ok(cudaMemcpy2D(&dest[0], dpitch, source_d, spitch, cwidth, cheight, cudaMemcpyDeviceToHost), "cudaMemcpy2D");

        // Release the device buffer on every path, including after a failed copy.
        if (!ok(cudaFree(source_d), "cudaFree")) success = false;
        if (!success) return 1;

        for (size_t i = 0; i < dwidth * dheight; i++) std::cout << i << " " << dest[i] << std::endl;

        return 0;
    }
    

    它的编译和运行结果如下:

    $ nvcc -std=c++11 -arch=sm_52 -o strided_copy strided_copy.cu 
    $ cuda-memcheck ./strided_copy
    ========= CUDA-MEMCHECK
    0 0
    1 -1
    2 -1
    3 2
    4 -1
    5 -1
    6 4
    7 -1
    8 -1
    9 -1
    10 -1
    11 -1
    ========= ERROR SUMMARY: 0 errors
    

    本质上,您正在将一个宽度为 4 字节(一个 int)、跨度为 8 个字节(两个 int)的源复制到一个跨度为 12 个字节(三个 int)的目标中。我只复制了三行,以便清楚地看到行数(height)参数是如何工作的。请根据需要调整复制元素的大小和步幅等。

    【讨论】:

      【解决方案2】:

      此类跨步复制的通用函数大致如下所示:

      /*
       * Copies numElements elements of elementSize bytes each from src to dst,
       * reading every srcStride-th element and writing every dstStride-th element.
       *
       * The strided layout is expressed as a 2D copy: each element is one "row"
       * that is elementSize bytes wide, and the row pitch is the stride in bytes.
       *
       *   dst, src     base pointers of the destination and source ranges
       *   dstStride,
       *   srcStride    distance between consecutive elements, counted in elements
       *   numElements  number of elements to copy
       *   elementSize  size of one element in bytes
       *   kind         copy direction, a cudaMemcpyKind value passed as int
       *
       * NOTE(review): the function returns void, so the status of cudaMemcpy2D
       * is not propagated. Callers presumably can still query it through
       * cudaGetLastError(); confirm against the CUDA runtime documentation.
       */
      void cudaMemcpyStrided(
              void *dst, int dstStride, 
              void *src, int srcStride, 
              int numElements, int elementSize, int kind) {
          /* Do the byte arithmetic in size_t: stride * elementSize can exceed
             the range of int for large strides. */
          size_t srcPitchInBytes = (size_t)srcStride * (size_t)elementSize;
          size_t dstPitchInBytes = (size_t)dstStride * (size_t)elementSize;
          size_t width = (size_t)elementSize;   /* one element per row */
          size_t height = (size_t)numElements;  /* one row per element */
          /* The explicit cast is required when this is compiled as C++ (as nvcc
             does for .cu files): int does not convert implicitly to an enum. */
          (void)cudaMemcpy2D(
              dst, dstPitchInBytes, 
              src, srcPitchInBytes, 
              width, height,
              (enum cudaMemcpyKind)kind);
      }
      

      对于你的例子,它可以被称为

      // Destination stride 3, source stride 2, 3 elements of sizeof(double) bytes each.
      cudaMemcpyStrided(dest, 3, source, 2, 3, sizeof(double), cudaMemcpyDeviceToDevice);
      

      “大致”,因为我只是从我测试它的(基于 Java/JCuda 的)代码中即时翻译它:

      import static jcuda.runtime.JCuda.cudaMemcpy2D;
      
      import java.util.Arrays;
      import java.util.Locale;
      
      import jcuda.Pointer;
      import jcuda.Sizeof;
      import jcuda.runtime.cudaMemcpyKind;
      
      /**
       * Small JCuda test program for strided copies done with cudaMemcpy2D.
       *
       * Each example copies a number of doubles between two host arrays, reading
       * every srcStride-th source element and writing every dstStride-th
       * destination element, and then prints both arrays.
       */
      public class JCudaStridedMemcopy {
          /**
           * Runs three examples with different array lengths, strides and
           * element counts.
           *
           * @param args unused
           */
          public static void main(String[] args) {

              // Arguments: dstLength, dstStride, srcLength, srcStride, numElements
              runExample(9, 3, 6, 2, 3);
              runExample(9, 3, 12, 4, 3);
              runExample(18, 3, 12, 2, 6);

          }

          /**
           * Fills a source array with 0, 1, 2, ..., performs one strided copy into
           * a zero-initialized destination array and prints the parameters and
           * both arrays (as a 2D view with one stride per row, and flat).
           *
           * @param dstLength   length of the destination array
           * @param dstStride   destination stride, in elements
           * @param srcLength   length of the source array
           * @param srcStride   source stride, in elements
           * @param numElements number of elements to copy
           */
          private static void runExample(int dstLength, int dstStride, int srcLength, int srcStride, int numElements) {
              double dst[] = new double[dstLength];
              double src[] = new double[srcLength];
              for (int i = 0; i < src.length; i++) {
                  src[i] = i;
              }

              cudaMemcpyStrided(dst, dstStride, src, srcStride, numElements);

              System.out.println("Copy " + numElements + " elements");
              System.out.println("  to   array with length " + dstLength + ", with a stride of " + dstStride);
              System.out.println("  from array with length " + srcLength + ", with a stride of " + srcStride);

              System.out.println("");

              System.out.println("Destination:");
              System.out.println(toString2D(dst, dstStride));
              System.out.println("Flat: " + Arrays.toString(dst));

              System.out.println("");

              System.out.println("Source:");
              System.out.println(toString2D(src, srcStride));
              System.out.println("Flat: " + Arrays.toString(src));

              System.out.println("");
              System.out.println("Done");
              System.out.println("");

          }

          /**
           * Copies numElements doubles from src to dst with the given strides by
           * describing each element as one row of a 2D copy: the row width is
           * one double and the row pitch is the stride in bytes.
           *
           * @param dst         destination host array
           * @param dstStride   destination stride, in elements
           * @param src         source host array
           * @param srcStride   source stride, in elements
           * @param numElements number of elements to copy
           * @throws IllegalStateException if cudaMemcpy2D returns a non-zero status
           */
          private static void cudaMemcpyStrided(double dst[], int dstStride, double src[], int srcStride, int numElements) {
              // Widen before multiplying so the byte pitch cannot overflow int.
              long srcPitchInBytes = (long) srcStride * Sizeof.DOUBLE;
              long dstPitchInBytes = (long) dstStride * Sizeof.DOUBLE;
              long width = 1 * Sizeof.DOUBLE;
              long height = numElements;
              int status = cudaMemcpy2D(
                      Pointer.to(dst), dstPitchInBytes, 
                      Pointer.to(src), srcPitchInBytes, 
                      width, height,
                      cudaMemcpyKind.cudaMemcpyHostToHost);
              // 0 is cudaSuccess; without this check a failed copy would go
              // unnoticed and the example would just print an all-zero destination.
              if (status != 0) {
                  throw new IllegalStateException("cudaMemcpy2D failed with error code " + status);
              }
          }

          /**
           * Formats a flat array as text with the given number of values per line.
           *
           * @param a       the values to format
           * @param columns number of values per line
           * @return the formatted text, with lines separated by "\n"
           */
          public static String toString2D(double[] a, long columns) {
              String format = "%4.1f ";
              StringBuilder sb = new StringBuilder();
              for (int i = 0; i < a.length; i++) {
                  // Start a new line before the first value of every row but the first.
                  if (i > 0 && i % columns == 0) {
                      sb.append("\n");
                  }
                  sb.append(String.format(Locale.ENGLISH, format, a[i]));
              }
              return sb.toString();
          }
      }
      

      为了了解函数的作用,根据示例/测试用例,以下是输出:

      Copy 3 elements
        to   array with length 9, with a stride of 3
        from array with length 6, with a stride of 2
      
      Destination:
       0.0  0.0  0.0 
       2.0  0.0  0.0 
       4.0  0.0  0.0 
      Flat: [0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 4.0, 0.0, 0.0]
      
      Source:
       0.0  1.0 
       2.0  3.0 
       4.0  5.0 
      Flat: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
      
      Done
      
      Copy 3 elements
        to   array with length 9, with a stride of 3
        from array with length 12, with a stride of 4
      
      Destination:
       0.0  0.0  0.0 
       4.0  0.0  0.0 
       8.0  0.0  0.0 
      Flat: [0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 8.0, 0.0, 0.0]
      
      Source:
       0.0  1.0  2.0  3.0 
       4.0  5.0  6.0  7.0 
       8.0  9.0 10.0 11.0 
      Flat: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0]
      
      Done
      
      Copy 6 elements
        to   array with length 18, with a stride of 3
        from array with length 12, with a stride of 2
      
      Destination:
       0.0  0.0  0.0 
       2.0  0.0  0.0 
       4.0  0.0  0.0 
       6.0  0.0  0.0 
       8.0  0.0  0.0 
      10.0  0.0  0.0 
      Flat: [0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 4.0, 0.0, 0.0, 6.0, 0.0, 0.0, 8.0, 0.0, 0.0, 10.0, 0.0, 0.0]
      
      Source:
       0.0  1.0 
       2.0  3.0 
       4.0  5.0 
       6.0  7.0 
       8.0  9.0 
      10.0 11.0 
      Flat: [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0]
      
      Done
      

      【讨论】:

      • 我猜反对票是因为里面有一些 Java 代码?请把它看作附加内容。无论如何,相关的函数是用 C 写的。
      • 遗憾的是,cudaMemcpy2D 不接受 0(零)作为 srcPitchInBytes 参数(返回错误),这将允许更理智的 memset 函数。
      猜你喜欢
      • 2013-02-05
      • 2019-02-21
      • 1970-01-01
      • 1970-01-01
      • 2011-09-23
      • 1970-01-01
      • 2016-10-22
      • 2013-04-08
      • 1970-01-01
      相关资源
      最近更新 更多