矩陣乘法的Map-Reduce實現

方法一：

已知 $A_{m\times n} \cdot B_{n\times p} = C_{m\times p}$

$$C_{i,j} = \sum_{k=1}^{n} A_{i,k} \cdot B_{k,j}$$

Example:

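$$C = \begin{pmatrix} 1 & 2 & 3 \\ 4 & 5 & 0 \\ 7 & 8 & 9 \\ 10 & 11 & 12 \end{pmatrix} \times \begin{pmatrix} 10 & 15 \\ 0 & 2 \\ 11 & 9 \end{pmatrix} = \begin{pmatrix} 43 & 46 \\ 40 & 70 \\ 169 & 202 \\ 232 & 280 \end{pmatrix}$$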

如果直接模擬做的話可以爲如下的形式：

// Direct simulation of C = A * B, where A is m x n, B is n x p and C is m x p.
// Assumes every C[i][j] starts at 0, because the loop accumulates into it.
for(int i=0;i<m;i++){              // i walks the m rows of A (and of C)
    for(int j=0;j<p;j++){          // j walks the p columns of B (and of C)
        for(int k = 0;k<n;k++){    // k walks the shared dimension n
            C[i][j] = C[i][j] + A[i][k]*B[k][j];
        }
    }
}

通過模擬矩陣的乘法可以發現 $A_{i,j}$ 會和 $B_{j,k}$（$k\in[1,p]$）都相乘一次，所以我們 map 階段可以這樣做。

將 $A$ 的非零元素 $A_{i,j}$（$A_{i,j}\neq 0$）變成 $(j, ('A', i, A_{i,j}))$ 的形式
將 $B$ 的非零元素 $B_{i,j}$（$B_{i,j}\neq 0$）變成 $(i, ('B', j, B_{i,j}))$ 的形式
Shuffle階段將key相同的value放到一個列表當中
在reduce的時候將key相同的來自不同矩陣的value值，做笛卡爾積
再將其map成((i,j),value) 的形式
Shuffle階段將key相同的value放到一個列表當中
在reduce的時候將key相同的value求和

Spark實現

#!/bin/python
#coding: utf-8
# Sparse matrix multiplication C = A * B on Spark RDDs.
# Both matrices are given as (row, col, value) triples.
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext
import numpy as np
# Basic Spark environment configuration
conf=SparkConf().setMaster('local').setAppName('test')
sc=SparkContext(conf=conf)
hiveContext=HiveContext(sc)

# Sample input: two 3x3 all-ones matrices (indices in this sample start at 1).
matrix_1=sc.parallelize([(1,1,1),(1,2,1),(1,3,1),(2,1,1),(2,2,1),(2,3,1),(3,1,1),(3,2,1),(3,3,1)])
matrix_2=sc.parallelize([(1,1,1),(1,2,1),(1,3,1),(2,1,1),(2,2,1),(2,3,1),(3,1,1),(3,2,1),(3,3,1)])
# Key A by its column index and B by its row index: these are the indices
# that must be equal for two elements to be multiplied together.
temp_1=matrix_1.map(lambda x: (x[1],(x[0],x[2])))
temp_2=matrix_2.map(lambda x: (x[0],(x[1],x[2])))
# Join on the shared index instead of cartesian()+filter(): the join only
# pairs elements with equal keys, whereas the cartesian product materialises
# every |A| * |B| combination before discarding most of them.
# Each record of temp is (k, ((i, a_ik), (j, b_kj))).
temp=temp_1.join(temp_2)
# Re-key every partial product by its output cell (i, j) and sum per cell.
ret=temp.map(lambda x: ((x[1][0][0],x[1][1][0]),x[1][0][1]*x[1][1][1])).reduceByKey(lambda x,y: x+y)
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(temp.collect())
print(ret.collect())

pyspark 官方文檔：傳送門

C++ Code

map

#include <iostream>
#include <string>
#include <vector>
#include <cstdio>
using namespace std;

/**
 * Map step: turns one dense matrix read from standard input into key/value
 * records on standard output.
 *
 * Input:  the first non-whitespace character is the matrix tag ('A' selects
 *         the A format, anything else the B format).  Every following
 *         non-blank line is one matrix row whose cells are separated by
 *         spaces or tabs.  Cells that appear on the tag line itself are
 *         treated as the first row.
 * Output: for each cell whose text is not exactly "0", one line
 *             <col> A <row> <cell>     when the tag is 'A'
 *             <row> B <col> <cell>     otherwise
 *         so the first field is the index shared by A's columns and B's rows.
 *         Row and column numbers are 0-based.
 *
 * Blank lines, repeated separators and a trailing '\r' are ignored so they
 * can neither create empty cells nor shift row/column numbers.
 */
void map()
{
    char tag = '\0';
    if (!(std::cin >> tag))
        return;  // empty input: nothing to emit

    int row = 0;
    std::string line;
    // The first getline() returns whatever follows the tag on its line; when
    // that is blank it is skipped below like any other blank line.
    while (std::getline(std::cin, line)) {
        std::vector<std::string> cells;
        std::string cell;
        // Iterate one position past the end so the final cell is flushed by
        // the same code path as cells that are followed by a separator.
        for (std::string::size_type pos = 0; pos <= line.size(); ++pos) {
            const bool atSeparator = pos == line.size() || line[pos] == ' ' ||
                                     line[pos] == '\t' || line[pos] == '\r';
            if (!atSeparator) {
                cell += line[pos];
                continue;
            }
            if (!cell.empty()) {
                cells.push_back(cell);
                cell.clear();
            }
        }
        if (cells.empty())
            continue;  // a blank line is not a matrix row

        for (std::vector<std::string>::size_type col = 0; col < cells.size(); ++col) {
            if (cells[col] == "0")
                continue;  // zero cells contribute nothing to the product
            if (tag == 'A')
                std::cout << col << " A " << row << ' ' << cells[col] << '\n';
            else
                std::cout << row << " B " << col << ' ' << cells[col] << '\n';
        }
        ++row;
    }
}

int main()
{
    map();
    return 0;
}

reduce

#include <cstdio>
#include <algorithm>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>
using namespace std;

typedef std::pair<int, int> PII;

typedef std::map<PII, int> MPPII;

/**
 * Reduce step: joins the mapper records of matrices A and B and prints the
 * non-empty cells of the product.
 *
 * Input (standard input), one record per line:
 *     <k> A <i> <value>     element A[i][k]
 *     <k> B <j> <value>     element B[k][j]
 * Lines that do not consist of an integer, a tag, and two more integers, or
 * whose tag is neither "A" nor "B", are skipped.  When the same (k, index)
 * pair occurs more than once for one matrix, the last record wins.
 *
 * Output (standard output): one line "<i> <j> <sum>" for every cell (i, j)
 * that received at least one product, ordered by i and then j.
 *
 * Sums are accumulated in int; values large enough to overflow int are not
 * handled.
 */
void reduce()
{
    typedef std::map<int, int> Entries;      // row/column index -> value
    typedef std::map<int, Entries> Grouped;  // join key k -> its entries

    Grouped fromA, fromB;
    std::string record;
    while (std::getline(std::cin, record)) {
        // Stream extraction handles signs and repeated whitespace, and fails
        // cleanly on blank or truncated lines instead of indexing past the
        // end of a token vector.
        std::istringstream fields(record);
        int k = 0;
        int index = 0;
        int value = 0;
        std::string tag;
        if (!(fields >> k >> tag >> index >> value))
            continue;
        if (tag == "A")
            fromA[k][index] = value;
        else if (tag == "B")
            fromB[k][index] = value;
    }

    // Only entries sharing the same k are multiplied, so look the matching
    // group up directly rather than scanning every A x B combination.
    MPPII ans;
    for (Grouped::const_iterator a = fromA.begin(); a != fromA.end(); ++a) {
        const Grouped::const_iterator b = fromB.find(a->first);
        if (b == fromB.end())
            continue;
        for (Entries::const_iterator ai = a->second.begin(); ai != a->second.end(); ++ai) {
            for (Entries::const_iterator bj = b->second.begin(); bj != b->second.end(); ++bj)
                ans[PII(ai->first, bj->first)] += ai->second * bj->second;
        }
    }

    for (MPPII::const_iterator cell = ans.begin(); cell != ans.end(); ++cell)
        std::cout << cell->first.first << ' ' << cell->first.second << ' '
                  << cell->second << '\n';
}

int main()
{
    reduce();
    return 0;
}

run.sh

#!/bin/bash
#
# Builds the mapper and reducer from the current directory, runs the map step
# over both input matrices, then reduces the combined records into the output.
#
# Usage: run.sh <matrixA> <matrixB> <outputfile>
#
# Exit codes:
#   1  compiling matrix_map.cpp failed
#   2  compiling matrix_reduce.cpp failed
#   3  wrong number of arguments
#   4  the map step failed
#   5  the reduce step failed

if [ $# -ne 3 ];then
    echo "we need three args:matrixA,matrixB,outputfile"
    exit 3
fi

mapfile="$(pwd)/matrix_map.cpp"
mapExe="$(pwd)/map"
reducefile="$(pwd)/matrix_reduce.cpp"
reduceExe="$(pwd)/reduce"
res="$(pwd)/res"

# Remove the intermediate file and the built executables on every exit path,
# not only after a fully successful run.
cleanup() {
    rm -f "$res" "$mapExe" "$reduceExe"
}
trap cleanup EXIT

echo "getting map.exe"
if ! g++ -o "$mapExe" "$mapfile"; then
    exit 1
fi
echo "success!"

echo "getting reduce.exe"
if ! g++ -o "$reduceExe" "$reducefile"; then
    exit 2
fi
echo "success!"

# All paths are quoted so directories or file names containing spaces work.
echo "mapping..."
if ! "$mapExe" < "$1" > "$res" || ! "$mapExe" < "$2" >> "$res"; then
    echo "map step failed" >&2
    exit 4
fi
echo "success!"

echo "reducing..."
if ! "$reduceExe" < "$res" > "$3"; then
    echo "reduce step failed" >&2
    exit 5
fi
echo "success!"

data

inputA:

inputB:

output

方法一的優點是：再大的矩陣也可以處理。缺點是：網絡IO太大，速度慢。

方法二：

對於A×B ，如果B 不是很大，可以把B放到分佈式緩存上，把A按行切分發送給多個Mapper Task，各個Mapper Task把B完全放入內存中。

矩陣乘法的Map-Reduce實現

方法一：

Spark實現

C++ Code

map

reduce

run.sh

data

方法二：

一鍵自動化博客發佈工具,用過的人都說好(頭條篇)

美團一面：項目中有 10000 個 if else 如何優化？想了半天，被問懵了！

京東面試：如何進行JVM調優？

01 穩定性（一）如何應對事故並做好覆盤？

線程池那些坑爹的參數-核心線程數&最大線程數&工作隊列

Stream流常用方法總結

Python實現邏輯迴歸

TemplateMethod 模板方法模式

CSAPP:網絡編程(一)IP相關

【tensorflow學習】實現卷積神經網絡

【tensorflow學習】Ftrl學習

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結