cannon算法的原理及MPI C語言實現

前提（可以看了下面回來再看）：

1. 矩陣A是一個n*n方陣

2. 有p個處理器，排成sqrt(p)*sqrt(p)的網格，每個處理器得到一個邊長爲n/sqrt(p)的方塊，即n*n/p個數據（注意：此處要求p是一個平方數，且n能被sqrt(p)整除。因爲cannon要求每個分塊大小一樣）

3. 一個高效的串行矩陣乘法算法（dgemm，sgemm），用於計算塊與塊之間相乘

看下面鏈接

https://blog.csdn.net/u013720726/article/details/70667697

關於對齊再補充一點方便理解：

...對齊的目的：把A的第i行塊循環左移i格、B的第j列塊循環上移j格，使每個處理器手上的A塊和B塊的內部下標k一致（即A(i,k)配B(k,j)），這樣局部相乘的結果才能直接累加到C(i,j)；不對齊就沒法算

實現見下，缺陷有：

1. 沒有實現讀入數據和數據的分發

2. 沒有實現數據的收集和整合（或者好看的打印）

3. 使用的串行矩陣計算代碼效率較低

注意：processor數目要求是平方數，n要求是可以被sqrt（processor數量）整除的數

本代碼用課程ppt上的代碼改的，需要的隨便用

#include <errno.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "mpi.h"

/* Non-zero enables the per-rank progress/trace printf calls guarded by `if (DEBUG)` in main. */
#define DEBUG 1
/* Multiply-accumulate for n x n row-major matrices: c += a * b (defined after main). */
void MatrixMultiplyAgg(int n, double *a, double *b, double *c);

/* Print one nlocal x nlocal row-major block under the given heading. */
static void print_block(const char *title, const double *blk, int nlocal)
{
	puts(title);
	for (int r = 0; r < nlocal; r++) {
		for (int col = 0; col < nlocal; col++)
			printf("%6.3f ", blk[r*nlocal + col]);
		printf("\n");
	}
}

/*
 * Cannon's algorithm driver.
 *
 * Usage: mpiexec -n <p> ./prog <n>
 *   p must be a perfect square; n must be a positive integer divisible by
 *   sqrt(p). Every rank owns one (n/sqrt(p)) x (n/sqrt(p)) block of A, B
 *   and C. A and B blocks are filled with the owning rank number, C with 0.
 *
 * Steps: build a periodic 2-D Cartesian grid, align A and B, run sqrt(p)
 * multiply/shift rounds, undo the alignment, then print each rank's blocks
 * in rank order.
 *
 * Returns 0 on success and EXIT_FAILURE on bad arguments or a bad process
 * count; aborts the whole job if a block cannot be allocated.
 */
int main(int argc, char *argv[])
{
	int i;
	int n, nlocal, count;
	double *a, *b, *c;
	int npes, dims[2], periods[2];
	int myrank, my2drank, mycoords[2];
	int shiftsource, shiftdest;
	int rightrank, leftrank, downrank, uprank;
	long nparsed;
	char *endptr;
	MPI_Status status;
	MPI_Comm comm_2d;

	/* Enter the parallel section. */
	MPI_Init(&argc, &argv);
	MPI_Comm_size(MPI_COMM_WORLD, &npes);
	MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

	if (myrank == 0)
		printf("%d processors\n", npes);

	if (argc != 2) {
		if (myrank == 0)
			printf("Usage: %s <the dimension of  the matrix>\n", argv[0]);
		MPI_Finalize();
		return EXIT_FAILURE;
	}

	/*
	 * The grid is sqrt(npes) x sqrt(npes). Round the square root and verify
	 * by squaring: comparing dims[0] with npes / dims[0] wrongly accepts
	 * counts such as 5 or 10, which leaves ranks outside the grid.
	 */
	dims[0] = (int)(sqrt((double)npes) + 0.5);
	dims[1] = dims[0];
	if (dims[0] * dims[0] != npes) {
		if (myrank == 0)
			printf("The number of processes must be perfect square.\n");
		MPI_Finalize();
		return EXIT_FAILURE;
	}

	/* Parse and validate n before any grid resources are created. */
	errno = 0;
	nparsed = strtol(argv[1], &endptr, 10);
	if (endptr == argv[1] || *endptr != '\0' || errno == ERANGE ||
	    nparsed <= 0 || nparsed > INT_MAX || nparsed % dims[0] != 0) {
		if (myrank == 0)
			printf("The matrix dimension must be a positive integer divisible by %d.\n",
				dims[0]);
		MPI_Finalize();
		return EXIT_FAILURE;
	}
	n = (int)nparsed;      /* side length of the full matrix */
	nlocal = n / dims[0];  /* side length of one block */

	/* The element count is passed to MPI as an int, so it must not overflow. */
	if (nlocal > INT_MAX / nlocal) {
		if (myrank == 0)
			printf("The matrix dimension is too large.\n");
		MPI_Finalize();
		return EXIT_FAILURE;
	}
	count = nlocal * nlocal;

	/* Both grid dimensions wrap around (periodic), as the circular shifts require. */
	periods[0] = periods[1] = 1;

	MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &comm_2d);
	MPI_Comm_rank(comm_2d, &my2drank);
	MPI_Cart_coords(comm_2d, my2drank, 2, mycoords);

	a = malloc((size_t)count * sizeof *a);
	b = malloc((size_t)count * sizeof *b);
	c = malloc((size_t)count * sizeof *c);
	if (a == NULL || b == NULL || c == NULL) {
		/* Other ranks would block in the collectives below, so abort the whole job. */
		fprintf(stderr, "%d: cannot allocate %d-element blocks\n", myrank, count);
		MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
	}

	if (DEBUG)
		printf("%d: init matrix\n", myrank);

	/* Fill the local blocks. */
	for (i = 0; i < count; i++) {
		a[i] = myrank;
		b[i] = myrank;
		c[i] = 0.0;
	}

	if (DEBUG)
		printf("%d: done initing matrix\n", myrank);

	MPI_Barrier(MPI_COMM_WORLD);

	/* Alignment: shift A along dimension 0 by coords[1], B along dimension 1 by coords[0]. */
	if (DEBUG)
		printf("%d: x:%d,y:%d\n", myrank, mycoords[0], mycoords[1]);

	MPI_Cart_shift(comm_2d, 0, -mycoords[1], &shiftsource, &shiftdest);
	if (DEBUG)
		printf("%d: dest:%d,source:%d\n", myrank, shiftdest, shiftsource);
	MPI_Sendrecv_replace(a, count, MPI_DOUBLE, shiftdest, 1, shiftsource, 1, comm_2d, &status);

	MPI_Barrier(comm_2d);

	MPI_Cart_shift(comm_2d, 1, -mycoords[0], &shiftsource, &shiftdest);
	if (DEBUG)
		printf("%d: dest:%d,source:%d\n", myrank, shiftdest, shiftsource);
	MPI_Sendrecv_replace(b, count, MPI_DOUBLE, shiftdest, 1, shiftsource, 1, comm_2d, &status);

	if (DEBUG)
		printf("%d: ready to start calculating\n", myrank);

	MPI_Barrier(comm_2d);

	/* Neighbours for the single-step shifts of the compute loop. */
	MPI_Cart_shift(comm_2d, 0, -1, &rightrank, &leftrank);
	MPI_Cart_shift(comm_2d, 1, -1, &downrank, &uprank);

	if (DEBUG)
		printf("%d: right:%d, left:%d, up:%d, down:%d\n",
			myrank, rightrank, leftrank, uprank, downrank);

	/*
	 * dims[0] rounds of: c += a * b, then pass A one step along dimension 0
	 * and B one step along dimension 1. After dims[0] single steps on a
	 * periodic grid every block is back at its aligned position.
	 */
	for (i = 0; i < dims[0]; i++) {
		MPI_Barrier(comm_2d);
		MatrixMultiplyAgg(nlocal, a, b, c);
		MPI_Sendrecv_replace(a, count, MPI_DOUBLE, leftrank, 1, rightrank, 1, comm_2d, &status);
		MPI_Sendrecv_replace(b, count, MPI_DOUBLE, uprank, 1, downrank, 1, comm_2d, &status);
	}

	MPI_Barrier(comm_2d);

	/*
	 * Undo the alignment. The displacement must be the opposite (+) of the
	 * one used to align; repeating the negative shift only restores the
	 * blocks when 2*coord is a multiple of the grid side (e.g. a 2x2 grid).
	 */
	MPI_Cart_shift(comm_2d, 0, +mycoords[1], &shiftsource, &shiftdest);
	MPI_Sendrecv_replace(a, count, MPI_DOUBLE, shiftdest, 1, shiftsource, 1, comm_2d, &status);

	MPI_Barrier(comm_2d);

	MPI_Cart_shift(comm_2d, 1, +mycoords[0], &shiftsource, &shiftdest);
	MPI_Sendrecv_replace(b, count, MPI_DOUBLE, shiftdest, 1, shiftsource, 1, comm_2d, &status);

	MPI_Barrier(comm_2d);

	MPI_Comm_free(&comm_2d);

	if (DEBUG)
		printf("%d: finish calculating\n", myrank);

	MPI_Barrier(MPI_COMM_WORLD);

	/* Ranks take turns printing; the barrier separates one rank's turn from the next. */
	for (int rank = 0; rank < npes; rank++) {
		if (myrank == rank) {
			printf("my rank: %d\n", myrank);
			print_block("Random Matrix A", a, nlocal);
			print_block("Random Matrix B", b, nlocal);
			print_block("Matrix C = A*B", c, nlocal);
			fflush(stdout);
		}
		MPI_Barrier(MPI_COMM_WORLD);
	}

	free(a);
	free(b);
	free(c);

	MPI_Finalize();
	return 0;
}

/*
 * Multiply-accumulate for square matrices stored row-major in flat arrays.
 *
 * n: side length of each matrix
 * a: left operand, n*n doubles
 * b: right operand, n*n doubles
 * c: accumulator, n*n doubles; on return c holds its previous contents
 *    plus the product a * b (it is NOT cleared first)
 */
void MatrixMultiplyAgg(int n, double *a, double *b, double *c)
{
	int row, col, inner;

	for (row = 0; row < n; row++) {
		double *a_row = a + row*n;
		double *c_row = c + row*n;

		for (col = 0; col < n; col++) {
			double *dst = &c_row[col];

			/* Dot product of row `row` of a with column `col` of b, added in place. */
			for (inner = 0; inner < n; inner++)
				*dst += a_row[inner] * b[inner*n + col];
		}
	}
}

z@ubuntu:~/code/mpi$ mpiexec -n 4 ./cannon 4
4 processors
0: init matrix
1: init matrix
1: done initing matrix
2: init matrix
2: done initing matrix
3: init matrix
3: done initing matrix
0: done initing matrix
0: x:0,y:0
1: x:0,y:1
1: dest:3,source:3
2: x:1,y:0
2: dest:2,source:2
0: dest:0,source:0
3: x:1,y:1
3: dest:1,source:1
0: dest:0,source:0
1: dest:1,source:1
1: ready to start calculating
2: dest:3,source:3
0: ready to start calculating
2: ready to start calculating
3: dest:2,source:2
3: ready to start calculating
0: right:2, left:2, up:1, down:1
1: right:3, left:3, up:0, down:0
2: right:0, left:0, up:3, down:3
3: right:1, left:1, up:2, down:2
0: finish calculating
1: finish calculating
2: finish calculating
3: finish calculating
my rank: 0
Random Matrix A
0.000 0.000
0.000 0.000
Random Matrix B
0.000 0.000
0.000 0.000
Matrix C = A*B
4.000 4.000
4.000 4.000
my rank: 1
Random Matrix A
1.000 1.000
1.000 1.000
Random Matrix B
1.000 1.000
1.000 1.000
Matrix C = A*B
6.000 6.000
6.000 6.000
my rank: 2
Random Matrix A
2.000 2.000
2.000 2.000
Random Matrix B
2.000 2.000
2.000 2.000
Matrix C = A*B
12.000 12.000
12.000 12.000
my rank: 3
Random Matrix A
3.000 3.000
3.000 3.000
Random Matrix B
3.000 3.000
3.000 3.000
Matrix C = A*B
22.000 22.000
22.000 22.000

cannon算法的原理及MPI C語言實現

linux安裝 openbabel pybel

Representation Learning for Attributed Multiplex Heterogeneous Network GATNE筆記

OpenMP“for”語句中的初始化格式不正確

python融合list速度比較

TransG翻譯

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結