Algorithm 1: Cannon's algorithm
Input: two n×n matrices A and B, and p processors.
Output: if p is a perfect square and √p divides n, the product C = A×B is computed and printed.
Algorithm idea: partition the n×n matrices into √p×√p blocks, i.e. √p block rows and √p block columns, so every block has n/√p rows and n/√p columns. Assign the blocks to the p processors: the processor at grid position (i,j) owns blocks A(i,j), B(i,j) and C(i,j) and computes that block of the result. Initially, block A(i,j) is cyclically shifted left by i steps and block B(i,j) is cyclically shifted up by j steps. In each round, every processor multiplies the A and B blocks it currently holds and accumulates the product into its C block; afterwards all A blocks are cyclically shifted left by one step and all B blocks up by one step. Repeating this √p times produces the final C, which the root processor then gathers.
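To make the schedule concrete before the MPI code, here is a minimal serial sketch (no MPI) of the Cannon block schedule described above; the grid size q, the block size bs and the test data are illustrative assumptions only, not part of the program below. At round t, grid position (i,j) holds A block (i,(i+j+t) mod q) and B block ((i+j+t) mod q, j), so over q rounds it accumulates every term of its C block; the sketch checks the result against a direct multiplication.

#include <cstdio>
#include <vector>
using namespace std;
int main()
{
    const int q = 2, bs = 2, n = q * bs;        // 2x2 process grid, 2x2 blocks, 4x4 matrices
    vector<vector<int>> A(n, vector<int>(n)), B(n, vector<int>(n)), C(n, vector<int>(n, 0));
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++) { A[i][j] = i + j + 1; B[i][j] = i * n + j; }
    for (int t = 0; t < q; t++)                 // q shift-and-multiply rounds
        for (int i = 0; i < q; i++)             // block-row of the grid
            for (int j = 0; j < q; j++)         // block-column of the grid
            {
                int kb = (i + j + t) % q;       // index of the block pair held at (i,j) in this round
                for (int x = 0; x < bs; x++)
                    for (int y = 0; y < bs; y++)
                        for (int k = 0; k < bs; k++)
                            C[i * bs + x][j * bs + y] += A[i * bs + x][kb * bs + k] * B[kb * bs + k][j * bs + y];
            }
    bool ok = true;                             // compare against a direct triple loop
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
        {
            int s = 0;
            for (int k = 0; k < n; k++) s += A[i][k] * B[k][j];
            if (s != C[i][j]) ok = false;
        }
    printf(ok ? "Cannon schedule matches the direct product\n" : "mismatch\n");
    return 0;
}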
Figure: (omitted)
Environment:
#include <iostream>
#include <cstdio>
#include <mpi.h>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <cmath>
#include <algorithm>
#include <vector>
using namespace std;
const int n = 5; //n is the matrix size
int MatrixA[n][n], MatrixB[n][n], MatrixC[n][n]; //the three matrices; A and B are given, compute C = A*B
int block, blocknum; //block: side length of each sub-block (elements per row); blocknum = block*block
int numprocs, sqrnumprocs; //number of processes and its square root
int move_size; //= blocknum*sizeof(int), used by memcpy/memset
int* blockA, * blockB, * blockC, * tmpa, * tmpb; //local sub-blocks plus send/receive buffers
int myid, row, col; //process rank; the matrices are split into sub-blocks, this process handles the block in block-row `row` and block-column `col`
inline void init_AB() //initialize matrices A and B with random values
{
srand(time(0));
for (int i = 0; i < n; i++)
{
for (int j = 0; j < n; j++)
{
MatrixA[i][j] = rand() % 10;
MatrixB[i][j] = rand() % 10;
}
}
}
inline void send_block_AB() //cut A and B into blocks and send each pair of blocks to its owner
{
int rowmin, rowmax, colmin, colmax; //row/column range of the current block in the full matrices
for (int i = 0; i < numprocs; i++)
{
rowmin = (i / sqrnumprocs) * block;
rowmax = rowmin + block;
colmin = (i % sqrnumprocs) * block;
colmax = colmin + block;
for (int j = rowmin; j < rowmax; j++)
{
for (int k = colmin; k < colmax; k++)
{
int idx = (j - rowmin) * block + k - colmin; //tmpa/tmpb are one-dimensional, so compute the flattened index of this element
tmpa[idx] = MatrixA[j][k];
tmpb[idx] = MatrixB[j][k];
}
}
if (!i) //process 0 keeps its own blocks
{
memcpy(blockA, tmpa, move_size);
memcpy(blockB, tmpb, move_size);
}
else
{ //send the A and B blocks to process i (tags 1 and 2)
MPI_Send(tmpa, blocknum, MPI_INT, i, 1, MPI_COMM_WORLD);
MPI_Send(tmpb, blocknum, MPI_INT, i, 2, MPI_COMM_WORLD);
}
}
}
inline int getidx(int row, int col) //map block coordinates (row, col) to the rank of the owning process
{
//row = id / sqrnumprocs, col = id % sqrnumprocs; adding sqrnumprocs before the modulo makes the wrap-around work for negative inputs
return ((row + sqrnumprocs) % sqrnumprocs) * sqrnumprocs + (col + sqrnumprocs) % sqrnumprocs;
}
inline void init_move() //initial alignment: block A(i,j) moves left i steps, block B(i,j) moves up j steps
{
MPI_Status s;
//send our block and receive the one that ends up here; MPI_Sendrecv keeps the exchange deadlock-free
MPI_Sendrecv(blockA, blocknum, MPI_INT, getidx(row, col - row), 1, tmpa, blocknum, MPI_INT, getidx(row, col + row), 1, MPI_COMM_WORLD, &s);
MPI_Sendrecv(blockB, blocknum, MPI_INT, getidx(row - col, col), 2, tmpb, blocknum, MPI_INT, getidx(row + col, col), 2, MPI_COMM_WORLD, &s);
//copy the received blocks into place
memcpy(blockA, tmpa, move_size);
memcpy(blockB, tmpb, move_size);
}
inline void cal() //main computation loop
{
MPI_Status s;
for (int times = 0; times < sqrnumprocs; times++) //sqrnumprocs rounds of multiply-accumulate
{
for (int i = 0; i < block; i++) //c[i][j]=a[i][k]*b[k][j]
{ //c[i][j]=blockC[i * block + j]
for (int j = 0; j < block; j++)
{
int sum = blockC[i * block + j];
for (int k = 0; k < block; k++)
sum += blockA[i * block + k] * blockB[k * block + j];
blockC[i * block + j] = sum;
}
} //after the local block product of this round
//shift every A block left by one step and every B block up by one step
MPI_Sendrecv(blockA, blocknum, MPI_INT, getidx(row, col - 1), 1, tmpa, blocknum, MPI_INT, getidx(row, col + 1), 1, MPI_COMM_WORLD, &s);
MPI_Sendrecv(blockB, blocknum, MPI_INT, getidx(row - 1, col), 2, tmpb, blocknum, MPI_INT, getidx(row + 1, col), 2, MPI_COMM_WORLD, &s);
//copy the received blocks into place
memcpy(blockA, tmpa, move_size);
memcpy(blockB, tmpb, move_size);
}
}
inline void getans() //process 0 gathers the result blocks from the other processes and assembles MatrixC
{
MPI_Status s;
int rowmin, rowmax, colmin, colmax;
//process 0's own block can be copied directly
for (int i = 0; i < block; i++)
for (int j = 0; j < block; j++)
MatrixC[i][j] = blockC[i * block + j];
//the remaining blocks have to be received from their owners
for (int i = 1; i < numprocs; i++)
{
MPI_Recv(blockC, blocknum, MPI_INT, i, 1, MPI_COMM_WORLD, &s);
rowmin = (i / sqrnumprocs) * block; //first row of this block in MatrixC
rowmax = rowmin + block; //one past the last row
colmin = (i % sqrnumprocs) * block; //first column
colmax = colmin + block; //one past the last column
for (int j = rowmin; j < rowmax; j++)
for (int k = colmin; k < colmax; k++)
MatrixC[j][k] = blockC[(j - rowmin) * block + k - colmin];
}
}
inline void print_matrix(int ans[][n]) //print a matrix
{
for (int i = 0; i < n; i++)
{
for (int j = 0; j < n; j++)
printf("%-5d", ans[i][j]);
printf("\n");
}
printf("\n");
}
int main(int argc, char* argv[])
{
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs); //number of processes
MPI_Comm_rank(MPI_COMM_WORLD, &myid); //rank of this process
clock_t start = clock(); //start time
sqrnumprocs = (int)(sqrt((double)numprocs) + 0.5); //integer square root, rounded to guard against floating-point truncation
if (sqrnumprocs * sqrnumprocs != numprocs || n % sqrnumprocs)
{
if (myid == 0)
{
if (n % sqrnumprocs == 0)
cout << "The number of processes must be a perfect square!\n";
else
cout << "sqrnumprocs must divide the matrix size n!\n";
}
MPI_Finalize();
return 0;
}
block = n / sqrnumprocs; //side length of each block
blocknum = block * block; //total number of elements in a block
move_size = blocknum * sizeof(int);
row = myid / sqrnumprocs; //block coordinates handled by this process
col = myid % sqrnumprocs;
blockA = new int[blocknum]; //allocate the local blocks and buffers
blockB = new int[blocknum];
blockC = new int[blocknum];
tmpa = new int[blocknum];
tmpb = new int[blocknum];
memset(blockC, 0, move_size); //initialize the local C block to zero
if (!myid) //process 0
{
init_AB(); //initialize matrices A and B
send_block_AB(); //cut them into blocks and send the blocks to the other processes
}
else
{ //receive the blocks sent by process 0
MPI_Status s;
MPI_Recv(blockA, blocknum, MPI_INT, 0, 1, MPI_COMM_WORLD, &s);
MPI_Recv(blockB, blocknum, MPI_INT, 0, 2, MPI_COMM_WORLD, &s);
}
init_move(); //initial alignment of the blocks
cal(); //computation
if (myid == 0)
{
getans();
cout << "矩陣A爲:\n";
print_matrix(MatrixA);
cout << "矩陣B爲:\n";
print_matrix(MatrixB);
cout << "矩陣C=A*B爲(cannon乘法):\n";
print_matrix(MatrixC);
clock_t end = clock(); //結束時間
cout << "Cannon乘法耗時: " << end - start << "\n";
}
else
{
MPI_Send(blockC, blocknum, MPI_INT, 0, 1, MPI_COMM_WORLD);
}
delete[] blockA;
delete[] blockB;
delete[] blockC;
delete[] tmpa;
delete[] tmpb;
MPI_Barrier(MPI_COMM_WORLD);
MPI_Finalize();
return 0;
}
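One way to build and run this program, assuming an MPI installation that provides the usual mpic++ and mpirun wrappers (the file name cannon.cpp is only an example):
mpic++ -O2 -o cannon cannon.cpp
mpirun -np 1 ./cannon
Because n = 5, the check in main() only passes when the process count is a perfect square whose square root divides 5, i.e. 1 or 25 processes.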
Algorithm 2: Fox's algorithm
Input, output and environment are the same as in Algorithm 1.
Algorithm idea: the block partitioning is handled exactly as in Algorithm 1. After partitioning, in the first round the processor holding the diagonal block A(i,i) broadcasts it to the other processors in block-row i (a one-to-many broadcast); each processor multiplies the received A block with its own B block and accumulates the result into its C block. After each round the A blocks stay where they are, while the B blocks are cyclically shifted up by one step; if A(i,j) was broadcast in block-row i in the previous round, then A(i,(j+1) mod √p) is broadcast in this round, followed again by the multiply-accumulate step, and so on. After √p multiply-accumulate rounds every block C(i,j) is complete, and the root processor gathers the results.
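As a quick check of why this schedule is correct, the following small sketch (no MPI; the grid size q = 3 is an illustrative assumption) prints, for every round, which A block is broadcast in each block-row and which B block each grid position holds after the upward shifts: the broadcast block's column index always equals the local B block's row index, so each round contributes exactly one term k of every C(i,j).

#include <cstdio>
int main()
{
    const int q = 3;                            // 3x3 grid of block coordinates
    for (int t = 0; t < q; t++)
    {
        printf("round %d:\n", t);
        for (int i = 0; i < q; i++)             // block-row
            for (int j = 0; j < q; j++)         // block-column
            {
                int bcastCol = (i + t) % q;     // column of the A block broadcast in block-row i
                int bRow = (i + t) % q;         // row of the B block held at (i,j) after t upward shifts
                printf("  P(%d,%d): A(%d,%d) * B(%d,%d) -> term k=%d of C(%d,%d)\n",
                       i, j, i, bcastCol, bRow, j, bcastCol, i, j);
            }
    }
    return 0;
}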
Figure: (omitted)
#include <iostream>
#include <cstdio>
#include <mpi.h>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <cmath>
#include <algorithm>
#include <vector>
using namespace std;
const int n = 1000; //n is the matrix size (written as an integer literal so it is a valid array bound on all compilers)
int MatrixA[n][n], MatrixB[n][n], MatrixC[n][n]; //the three matrices; A and B are given, compute C = A*B
int block, blocknum; //block: side length of each sub-block (elements per row); blocknum = block*block
int numprocs, sqrnumprocs; //number of processes and its square root
int move_size; //= blocknum*sizeof(int), used by memcpy/memset
int* blockA, * blockB, * blockC, * tmpa, * tmpb; //local sub-blocks plus send/receive buffers
int myid, row, col; //process rank; the matrices are split into sub-blocks, this process handles the block in block-row `row` and block-column `col`
inline void init_AB() //initialize matrices A and B with random values
{
srand(time(0));
for (int i = 0; i < n; i++)
{
for (int j = 0; j < n; j++)
{
MatrixA[i][j] = rand() % 10;
MatrixB[i][j] = rand() % 10;
}
}
}
inline void send_block_AB() //cut A and B into blocks and send each pair of blocks to its owner
{
int rowmin, rowmax, colmin, colmax; //row/column range of the current block in the full matrices
for (int i = 0; i < numprocs; i++)
{
rowmin = (i / sqrnumprocs) * block;
rowmax = rowmin + block;
colmin = (i % sqrnumprocs) * block;
colmax = colmin + block;
for (int j = rowmin; j < rowmax; j++)
{
for (int k = colmin; k < colmax; k++)
{
int idx = (j - rowmin) * block + k - colmin; //tmpa/tmpb are one-dimensional, so compute the flattened index of this element
tmpa[idx] = MatrixA[j][k];
tmpb[idx] = MatrixB[j][k];
}
}
if (!i) //process 0 keeps its own blocks
{
memcpy(blockA, tmpa, move_size);
memcpy(blockB, tmpb, move_size);
}
else
{ //send the A and B blocks to process i (tags 1 and 2)
MPI_Send(tmpa, blocknum, MPI_INT, i, 1, MPI_COMM_WORLD);
MPI_Send(tmpb, blocknum, MPI_INT, i, 2, MPI_COMM_WORLD);
}
}
}
inline int getidx(int row, int col) //map block coordinates (row, col) to the rank of the owning process
{
//row = id / sqrnumprocs, col = id % sqrnumprocs; adding sqrnumprocs before the modulo makes the wrap-around work for negative inputs
return ((row + sqrnumprocs) % sqrnumprocs) * sqrnumprocs + (col + sqrnumprocs) % sqrnumprocs;
}
inline void cal() //main computation loop (Fox)
{
MPI_Status s;
int send_col_idx = row; //block column broadcast within this block-row in the current round (starts at the diagonal block)
int idxmin, idxmax; //rank range of the processes in this block-row
for (int times = 0; times < sqrnumprocs; times++) //sqrnumprocs rounds of multiply-accumulate
{
//this process owns the block at (row, col)
//in every round it needs the A block held by process getidx(row, send_col_idx) for the multiply-accumulate
if (col == send_col_idx)
{
idxmin = getidx(row, 0);
idxmax = getidx(row, sqrnumprocs - 1);
for (int i = idxmin; i <= idxmax; i++)
{
if (i == myid) //no need to send to ourselves
continue;
MPI_Send(blockA, blocknum, MPI_INT, i, 1, MPI_COMM_WORLD); //one-to-many broadcast within the block-row
}
memcpy(tmpa, blockA, move_size); //our own block is simply copied into the working buffer
}
else //receive the broadcast block
{
MPI_Recv(tmpa, blocknum, MPI_INT, getidx(row, send_col_idx), 1, MPI_COMM_WORLD, &s);
}
send_col_idx = (send_col_idx + 1) % sqrnumprocs; //the next round broadcasts the next column
for (int i = 0; i < block; i++) //c[i][j]=a[i][k]*b[k][j]
{ //c[i][j]=blockC[i * block + j]
for (int j = 0; j < block; j++)
{
int sum = blockC[i * block + j];
for (int k = 0; k < block; k++)
sum += tmpa[i * block + k] * blockB[k * block + j];
blockC[i * block + j] = sum;
}
} //after the local block product of this round
//the A blocks stay in place; every B block moves up by one step
MPI_Sendrecv(blockB, blocknum, MPI_INT, getidx(row - 1, col), 2, tmpb, blocknum, MPI_INT, getidx(row + 1, col), 2, MPI_COMM_WORLD, &s);
//copy the received block into place
memcpy(blockB, tmpb, move_size);
}
}
inline void getans() //process 0 gathers the result blocks from the other processes and assembles MatrixC
{
MPI_Status s;
int rowmin, rowmax, colmin, colmax;
//process 0's own block can be copied directly
for (int i = 0; i < block; i++)
for (int j = 0; j < block; j++)
MatrixC[i][j] = blockC[i * block + j];
//the remaining blocks have to be received from their owners
for (int i = 1; i < numprocs; i++)
{
MPI_Recv(blockC, blocknum, MPI_INT, i, 1, MPI_COMM_WORLD, &s);
rowmin = (i / sqrnumprocs) * block; //first row of this block in MatrixC
rowmax = rowmin + block; //one past the last row
colmin = (i % sqrnumprocs) * block; //first column
colmax = colmin + block; //one past the last column
for (int j = rowmin; j < rowmax; j++)
for (int k = colmin; k < colmax; k++)
MatrixC[j][k] = blockC[(j - rowmin) * block + k - colmin];
}
}
inline void print_matrix(int ans[][n]) //print a matrix
{
for (int i = 0; i < n; i++)
{
for (int j = 0; j < n; j++)
printf("%-5d", ans[i][j]);
printf("\n");
}
printf("\n");
}
int main(int argc, char* argv[])
{
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs); //number of processes
MPI_Comm_rank(MPI_COMM_WORLD, &myid); //rank of this process
clock_t start = clock(); //start time
sqrnumprocs = (int)(sqrt((double)numprocs) + 0.5); //integer square root, rounded to guard against floating-point truncation
if (sqrnumprocs * sqrnumprocs != numprocs || n % sqrnumprocs)
{
if (myid == 0)
{
if (n % sqrnumprocs == 0)
cout << "The number of processes must be a perfect square!\n";
else
cout << "sqrnumprocs must divide the matrix size n!\n";
}
MPI_Finalize();
return 0;
}
block = n / sqrnumprocs; //side length of each block
blocknum = block * block; //total number of elements in a block
move_size = blocknum * sizeof(int);
row = myid / sqrnumprocs; //block coordinates handled by this process
col = myid % sqrnumprocs;
blockA = new int[blocknum]; //allocate the local blocks and buffers
blockB = new int[blocknum];
blockC = new int[blocknum];
tmpa = new int[blocknum];
tmpb = new int[blocknum];
memset(blockC, 0, move_size); //initialize the local C block to zero
if (!myid) //process 0
{
init_AB(); //initialize matrices A and B
send_block_AB(); //cut them into blocks and send the blocks to the other processes
}
else
{ //receive the blocks sent by process 0
MPI_Status s;
MPI_Recv(blockA, blocknum, MPI_INT, 0, 1, MPI_COMM_WORLD, &s);
MPI_Recv(blockB, blocknum, MPI_INT, 0, 2, MPI_COMM_WORLD, &s);
}
cal(); //computation (no initial alignment is needed in Fox's algorithm)
if (myid == 0)
{
getans();
//cout << "矩陣A爲:\n";
//print_matrix(MatrixA);
//cout << "矩陣B爲:\n";
//print_matrix(MatrixB);
//cout << "矩陣C=A*B爲:\n";
//print_matrix(MatrixC);
clock_t end = clock(); //結束時間
cout << "Fox乘法耗時: " << end - start << "\n";
}
else
{
MPI_Send(blockC, blocknum, MPI_INT, 0, 1, MPI_COMM_WORLD);
}
delete[] blockA;
delete[] blockB;
delete[] blockC;
delete[] tmpa;
delete[] tmpb;
MPI_Barrier(MPI_COMM_WORLD);
MPI_Finalize();
return 0;
}
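The Fox program can be built and run the same way, again assuming mpic++ and mpirun are available (the file name fox.cpp is only an example):
mpic++ -O2 -o fox fox.cpp
mpirun -np 4 ./fox
With n = 1000 the process count must be a perfect square whose square root divides 1000, e.g. 4, 16, 25 or 100; only the timing line is printed because the matrix output statements are commented out.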