MPI：发送和接收动态分配的子矩阵答案

【问题标题】：MPI : Sending and receiving dynamically allocated sub-matrixMPI：发送和接收动态分配的子矩阵
【发布时间】：2021-06-13 04:21:15
【问题描述】：

我在向工作人员发送动态分配的子矩阵时遇到问题。我无法理解如何正确地做到这一点（以及我应该发送什么）。

这里是发送部分：

MPI_Send(&(a[offset][0]), rows * NCA, MPI_DOUBLE, dest, FROM_MASTER + 2, MPI_COMM_WORLD);
MPI_Send(&b, NCA * NCB, MPI_DOUBLE, dest, FROM_MASTER + 3, MPI_COMM_WORLD);

这里是接收部分：

MPI_Recv(&(a[0][0]), rows * NCA, MPI_DOUBLE, MASTER, FROM_MASTER + 2, MPI_COMM_WORLD, &status);
MPI_Recv(&(b[0][0]), NCA * NCB, MPI_DOUBLE, MASTER, FROM_MASTER + 3, MPI_COMM_WORLD, &status);

这里是错误信息：

[pop-os:29368] Read -1, expected 80000, errno = 14
[pop-os:29367] *** Process received signal ***
[pop-os:29367] Signal: Segmentation fault (11)
[pop-os:29367] Signal code: Address not mapped (1)
[pop-os:29367] Failing at address: 0x7fffc2ae8000

这里是所有代码：

#include <cstdio>
#include <cstdlib>

#include "mpi.h"
#define NRA 100
/* number of rows in matrix A */
#define NCA 100
/* number of columns in matrix A */
#define NCB 100
/* number of columns in matrix B */
#define MASTER 0
/* taskid of first task */
#define FROM_MASTER 1  /* setting a message type */
#define FROM_WORKER 10 /* setting a message type */

double **alloc_2d_int(int rows, int cols) {
  double **array= (double **)malloc(rows*sizeof(double*));
  for (int i=0; i<rows; i++)
    array[i] = (double *)malloc(rows*cols*sizeof(double));
  return array;
}


int main(int argc, char *argv[]) {
  int numtasks, taskid, numworkers, source, dest, rows,
      /* rows of matrix A sent to each worker */
      averow, extra, offset, i, j, k;

  double **a = alloc_2d_int(NRA, NCA);
  double **b = alloc_2d_int(NCA, NCB);
  double **c = alloc_2d_int(NRA, NCB);

  MPI_Init(&argc, &argv);

  MPI_Status status;

  MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
  MPI_Comm_rank(MPI_COMM_WORLD, &taskid);

  if (numtasks < 2) {
    printf("Need at least two MPI tasks. Quitting...\n");
    MPI_Abort(MPI_COMM_WORLD, -1);
    exit(1);
  }

  numworkers = numtasks - 1;

  if (taskid == MASTER) {
    printf("mpi_mm has started with %d tasks (task1).\n", numtasks);

    for (i = 0; i < NRA; i++)
      for (j = 0; j < NCA; j++) a[i][j] = 10;
    for (i = 0; i < NCA; i++)
      for (j = 0; j < NCB; j++) b[i][j] = 10;

    double t1 = MPI_Wtime();

    averow = NRA / numworkers;
    extra = NRA % numworkers;
    offset = 0;

    for (dest = 1; dest <= numworkers; dest++) {
      rows = (dest <= extra) ? averow + 1 : averow;
      printf("Sending %d rows to task %d offset=%d\n", rows, dest, offset);

      MPI_Send(&offset, 1, MPI_INT, dest, FROM_MASTER, MPI_COMM_WORLD);
      MPI_Send(&rows, 1, MPI_INT, dest, FROM_MASTER + 1, MPI_COMM_WORLD);
      MPI_Send(&(a[offset][0]), rows * NCA, MPI_DOUBLE, dest, FROM_MASTER + 2,
               MPI_COMM_WORLD);
      MPI_Send(&b, NCA * NCB, MPI_DOUBLE, dest, FROM_MASTER + 3,
               MPI_COMM_WORLD);

      offset = offset + rows;
    }

    /* Receive results from worker tasks */
    for (source = 1; source <= numworkers; source++) {
      MPI_Recv(&offset, 1, MPI_INT, source, FROM_WORKER, MPI_COMM_WORLD,
               &status);
      MPI_Recv(&rows, 1, MPI_INT, source, FROM_WORKER + 1, MPI_COMM_WORLD,
               &status);
      MPI_Recv(&(c[offset][0]), rows * NCB, MPI_DOUBLE, source, FROM_WORKER + 2,
               MPI_COMM_WORLD, &status);

      printf("Received results from task %d\n", source);
    }

    /* Print results */
    /*
    printf("****\n");
    printf("Result Matrix:\n");

    for (i = 0; i < NRA; i++)
     {
      printf("\n");
      for (j = 0; j < NCB; j++) printf("%6.2f ", c[i][j]);
    }*/

    printf("\n********\n");
    printf("Done.\n");

    t1 = MPI_Wtime() - t1;

    printf("\nExecution time: %.2f\n", t1);
  }
  /******** worker task *****************/
  else { /* if (taskid > MASTER) */
    MPI_Recv(&offset, 1, MPI_INT, MASTER, FROM_MASTER, MPI_COMM_WORLD, &status);
    MPI_Recv(&rows, 1, MPI_INT, MASTER, FROM_MASTER + 1, MPI_COMM_WORLD,
             &status);
    MPI_Recv(&(a[0][0]), rows * NCA, MPI_DOUBLE, MASTER, FROM_MASTER + 2,
             MPI_COMM_WORLD, &status);
    MPI_Recv(&(b[0][0]), NCA * NCB, MPI_DOUBLE, MASTER, FROM_MASTER + 3, MPI_COMM_WORLD,
             &status);

    for (k = 0; k < NCB; k++)
      for (i = 0; i < rows; i++) {
        c[i][k] = 0.0;
        for (j = 0; j < NCA; j++) c[i][k] = c[i][k] + a[i][j] * b[j][k];
      }

    MPI_Send(&offset, 1, MPI_INT, MASTER, FROM_WORKER, MPI_COMM_WORLD);
    MPI_Send(&rows, 1, MPI_INT, MASTER, FROM_WORKER + 1, MPI_COMM_WORLD);
    MPI_Send(&c, rows * NCB, MPI_DOUBLE, MASTER, FROM_WORKER + 2,
             MPI_COMM_WORLD);
  }

  for (i=0; i<NRA; i++)
    free(a[i]);
  free(a);

  for (i=0; i<NCA; i++)
    free(b[i]);
  free(b);

  for (i=0; i<NRA; i++)
    free(c[i]);
  free(c);

  MPI_Finalize();
}

解决方案：用正确的代码链接to GitHub

【问题讨论】：

我用解决方案的链接更新了问题。谢谢！！ @dreamcrash
不客气，很好，我看到你保留了 2D 语义并在内存中连续分配

标签： c performance parallel-processing mpi matrix-multiplication

【解决方案1】：

MPI_Send和MPI_recv第一个参数是const void *所以需要改一下：

 MPI_Send(&b, NCA * NCB, MPI_DOUBLE, dest, FROM_MASTER + 3, MPI_COMM_WORLD);

到

MPI_Send(b, NCA * NCB, MPI_DOUBLE, dest, FROM_MASTER + 3, MPI_COMM_WORLD);

和

MPI_Send(&c, rows * NCB, MPI_DOUBLE, MASTER, FROM_WORKER + 2, MPI_COMM_WORLD);

到

 MPI_Send(c, rows * NCB, MPI_DOUBLE, MASTER, FROM_WORKER + 2, MPI_COMM_WORLD);

您遇到的另一个问题是您正在分配一个指针数组：

double **alloc_2d_int(int rows, int cols) {
  double **array= (double **)malloc(rows*sizeof(double*));
  for (int i=0; i<rows; i++)
    array[i] = (double *)malloc(rows*cols*sizeof(double));
  return array;
}

但是MPI_Send 和MPI_Recv 上要发送/接收的数据假定是连续的。为了解决这个问题，您可以创建一个continuously 2D array，将矩阵简单地表示为一个数组，创建一个 MPI 自定义类型等等。

【讨论】：