前提(可以看了下面回來再看):
1. 矩陣A是一個n*n方陣
2. 有p個處理器,每個處理器得到一個 (n/sqrt(p)) × (n/sqrt(p)) 的分塊(注意:此處要求p是一個平方數,且n能被sqrt(p)整除,因爲cannon要求每個分塊大小一樣)
3. 一個高效的串行矩陣乘法算法(dgemm,sgemm),用於計算塊與塊之間相乘
看下面鏈接
https://blog.csdn.net/u013720726/article/details/70667697
關於對齊再補充一點方便理解:
...對齊的目的是讓每個處理器在開始計算時持有一對可以直接相乘的分塊 A(i,k) 和 B(k,j)(內側下標 k 相同);不對齊的話,本地的兩個分塊對不上,就沒法算
實現見下,缺陷有:
1. 沒有實現讀入數據和數據的分發
2. 沒有實現數據的收集和整合(或者好看的打印)
3. 使用的串行矩陣計算代碼效率較低
注意:processor數目要求是平方數,n要求是可以被sqrt(processor數量)整除的數
本代碼用課程ppt上的代碼改的,需要的隨便用
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "mpi.h"
/* Non-zero enables the per-rank progress messages printed by main; 0 silences them. */
#define DEBUG 1
/*
 * Accumulates the product of two n x n row-major matrices into c (c += a * b).
 * c is not cleared by the function, so the caller must initialize it.
 */
void MatrixMultiplyAgg(int n, double *a, double *b, double *c);
/* Print one nlocal x nlocal row-major block under the given heading. */
static void print_block(const char *title, const double *block, int nlocal)
{
    int i, j;

    puts(title);
    for (i = 0; i < nlocal; i++) {
        for (j = 0; j < nlocal; j++)
            printf("%6.3f ", block[i*nlocal + j]);
        printf("\n");
    }
}

/*
 * Cannon's algorithm driver.
 *
 * Usage: <program> <matrix dimension n>
 *
 * The processes are arranged in a periodic q x q Cartesian grid, where
 * q*q must equal the number of processes and n must be divisible by q.
 * Every rank owns one (n/q) x (n/q) block of A, B and C. The A and B blocks
 * are filled with the owner's rank number; C starts at zero. After the
 * multiplication every rank prints its blocks, one rank at a time.
 *
 * Returns 0 on success and also after a usage/argument error (the message is
 * printed by rank 0). An allocation failure aborts the whole MPI job.
 */
int main(int argc, char *argv[])
{
    int i;
    int n, nlocal, count;
    long parsed;
    char *endptr;
    double *a, *b, *c;
    int npes, dims[2], periods[2];
    int myrank, my2drank, mycoords[2];
    int shiftsource, shiftdest;
    int rightrank, leftrank, downrank, uprank;
    int rank;
    MPI_Status status;
    MPI_Comm comm_2d;

    /* Enter the parallel section. */
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &npes);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    if (myrank == 0)
        printf("%d processors\n", npes);
    if (argc != 2) {
        if (myrank == 0)
            printf("Usage: %s <the dimension of the matrix>\n", argv[0]);
        MPI_Finalize();
        exit(0);
    }

    /*
     * The grid side is sqrt(npes). Round to the nearest integer and verify by
     * squaring: comparing floor(sqrt) with npes / floor(sqrt) would wrongly
     * accept counts such as 5 or 10.
     */
    dims[0] = (int)(sqrt((double)npes) + 0.5);
    dims[1] = dims[0];
    if (dims[0] * dims[0] != npes) {
        if (myrank == 0)
            printf("The number of processes must be perfect square.\n");
        MPI_Finalize();
        exit(0);
    }

    /*
     * Validate n before any communicator is created. Every rank sees the same
     * argv, so all ranks take the same branch here.
     */
    parsed = strtol(argv[1], &endptr, 10);
    if (endptr == argv[1] || *endptr != '\0' || parsed <= 0 ||
        parsed > INT_MAX || parsed % dims[0] != 0) {
        if (myrank == 0)
            printf("The matrix dimension must be a positive integer divisible by %d.\n",
                   dims[0]);
        MPI_Finalize();
        exit(0);
    }
    n = (int)parsed;      /* side length of the full matrix */
    nlocal = n / dims[0]; /* side length of one local block */
    if (nlocal > INT_MAX / nlocal) {
        /* MPI element counts are int, so the block must fit in one. */
        if (myrank == 0)
            printf("The local block is too large for this matrix dimension.\n");
        MPI_Finalize();
        exit(0);
    }
    count = nlocal * nlocal;

    /* Both grid dimensions wrap around, which the circular shifts rely on. */
    periods[0] = periods[1] = 1;
    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &comm_2d);
    MPI_Comm_rank(comm_2d, &my2drank);
    MPI_Cart_coords(comm_2d, my2drank, 2, mycoords);

    a = malloc((size_t)count * sizeof *a);
    b = malloc((size_t)count * sizeof *b);
    c = malloc((size_t)count * sizeof *c);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "%d: failed to allocate local blocks\n", myrank);
        free(a);
        free(b);
        free(c);
        /* Other ranks may already be waiting in communication calls. */
        MPI_Abort(MPI_COMM_WORLD, 1);
        return 1;
    }

    if (DEBUG)
        printf("%d: init matrix\n", myrank);
    /* Fill the local A and B blocks with the rank number and clear C. */
    for (i = 0; i < count; i++) {
        a[i] = myrank;
        b[i] = myrank;
        c[i] = 0.0;
    }
    if (DEBUG)
        printf("%d: done initing matrix\n", myrank);
    MPI_Barrier(MPI_COMM_WORLD);

    /*
     * Initial alignment. A moves along dimension 0 by -mycoords[1] and B moves
     * along dimension 1 by -mycoords[0]. All ranks on one ring share the
     * coordinate used as displacement, so their shifts match each other.
     */
    if (DEBUG) {
        printf("%d: x:%d,y:%d\n", myrank, mycoords[0], mycoords[1]);
    }
    MPI_Cart_shift(comm_2d, 0, -mycoords[1], &shiftsource, &shiftdest);
    if (DEBUG) {
        printf("%d: dest:%d,source:%d\n", myrank, shiftdest, shiftsource);
    }
    MPI_Sendrecv_replace(a, count, MPI_DOUBLE, shiftdest, 1, shiftsource, 1,
                         comm_2d, &status);
    MPI_Barrier(comm_2d);
    MPI_Cart_shift(comm_2d, 1, -mycoords[0], &shiftsource, &shiftdest);
    if (DEBUG) {
        printf("%d: dest:%d,source:%d\n", myrank, shiftdest, shiftsource);
    }
    MPI_Sendrecv_replace(b, count, MPI_DOUBLE, shiftdest, 1, shiftsource, 1,
                         comm_2d, &status);
    if (DEBUG) {
        printf("%d: ready to start calculating\n", myrank);
    }
    MPI_Barrier(comm_2d);

    /* Neighbours for the single-step shifts of the main loop. */
    MPI_Cart_shift(comm_2d, 0, -1, &rightrank, &leftrank);
    MPI_Cart_shift(comm_2d, 1, -1, &downrank, &uprank);
    if (DEBUG) {
        printf("%d: right:%d, left:%d, up:%d, down:%d\n",
               myrank, rightrank, leftrank, uprank, downrank);
    }

    /* Multiply-accumulate, then pass A and B one step along their rings. */
    for (i = 0; i < dims[0]; i++) {
        MPI_Barrier(comm_2d);
        MatrixMultiplyAgg(nlocal, a, b, c);
        MPI_Sendrecv_replace(a, count, MPI_DOUBLE, leftrank, 1, rightrank, 1,
                             comm_2d, &status);
        MPI_Sendrecv_replace(b, count, MPI_DOUBLE, uprank, 1, downrank, 1,
                             comm_2d, &status);
    }
    MPI_Barrier(comm_2d);

    /*
     * Undo the alignment. The loop above performed dims[0] single-step shifts
     * on rings of length dims[0], so A and B are back in their aligned
     * positions; shifting by the opposite (positive) displacement returns
     * every block to its original owner. Repeating the negative shift would
     * only be correct when the grid side is 2.
     */
    MPI_Cart_shift(comm_2d, 0, mycoords[1], &shiftsource, &shiftdest);
    MPI_Sendrecv_replace(a, count, MPI_DOUBLE, shiftdest, 1, shiftsource, 1,
                         comm_2d, &status);
    MPI_Barrier(comm_2d);
    MPI_Cart_shift(comm_2d, 1, mycoords[0], &shiftsource, &shiftdest);
    MPI_Sendrecv_replace(b, count, MPI_DOUBLE, shiftdest, 1, shiftsource, 1,
                         comm_2d, &status);
    MPI_Barrier(comm_2d);
    MPI_Comm_free(&comm_2d);
    if (DEBUG)
        printf("%d: finish calculating\n", myrank);
    MPI_Barrier(MPI_COMM_WORLD);

    /* Let the ranks print one after another, separated by barriers. */
    for (rank = 0; rank < npes; rank++) {
        if (myrank == rank) {
            printf("my rank: %d\n", myrank);
            print_block("Random Matrix A", a, nlocal);
            print_block("Random Matrix B", b, nlocal);
            print_block("Matrix C = A*B", c, nlocal);
            /* Push the text out before the next rank starts printing. */
            fflush(stdout);
        }
        MPI_Barrier(MPI_COMM_WORLD);
    }

    free(a);
    free(b);
    free(c);
    MPI_Finalize();
    return 0;
}
/*
 * Multiply-accumulate for square row-major matrices: c += a * b.
 *
 * n  side length of all three matrices; nothing happens when n <= 0
 * a  left operand, n*n doubles
 * b  right operand, n*n doubles
 * c  accumulator, n*n doubles; existing contents are added to, not cleared
 */
void MatrixMultiplyAgg(int n, double *a, double *b, double *c)
{
    int row, col, inner;

    for (row = 0; row < n; row++) {
        double *a_row = a + row * n;
        double *c_row = c + row * n;

        for (col = 0; col < n; col++) {
            /* Walk down column `col` of b with a stride of n elements. */
            double *b_col = b + col;

            for (inner = 0; inner < n; inner++) {
                c_row[col] += a_row[inner] * b_col[inner * n];
            }
        }
    }
}
z@ubuntu:~/code/mpi$ mpiexec -n 4 ./cannon 4
4 processors
0: init matrix
1: init matrix
1: done initing matrix
2: init matrix
2: done initing matrix
3: init matrix
3: done initing matrix
0: done initing matrix
0: x:0,y:0
1: x:0,y:1
1: dest:3,source:3
2: x:1,y:0
2: dest:2,source:2
0: dest:0,source:0
3: x:1,y:1
3: dest:1,source:1
0: dest:0,source:0
1: dest:1,source:1
1: ready to start calculating
2: dest:3,source:3
0: ready to start calculating
2: ready to start calculating
3: dest:2,source:2
3: ready to start calculating
0: right:2, left:2, up:1, down:1
1: right:3, left:3, up:0, down:0
2: right:0, left:0, up:3, down:3
3: right:1, left:1, up:2, down:2
0: finish calculating
1: finish calculating
2: finish calculating
3: finish calculating
my rank: 0
Random Matrix A
0.000 0.000
0.000 0.000
Random Matrix B
0.000 0.000
0.000 0.000
Matrix C = A*B
4.000 4.000
4.000 4.000
my rank: 1
Random Matrix A
1.000 1.000
1.000 1.000
Random Matrix B
1.000 1.000
1.000 1.000
Matrix C = A*B
6.000 6.000
6.000 6.000
my rank: 2
Random Matrix A
2.000 2.000
2.000 2.000
Random Matrix B
2.000 2.000
2.000 2.000
Matrix C = A*B
12.000 12.000
12.000 12.000
my rank: 3
Random Matrix A
3.000 3.000
3.000 3.000
Random Matrix B
3.000 3.000
3.000 3.000
Matrix C = A*B
22.000 22.000
22.000 22.000