[Graph Representation Learning] Implementing DeepWalk

  1. DeepWalk is a graph-embedding algorithm; for the underlying theory, see the companion post 【圖表示學習】word2vec與DeepWalk
  2. The original DeepWalk source code is on GitHub at https://github.com/phanein/deepwalk
  3. This post is a simplified reorganization of that source code, so readers can quickly understand how DeepWalk works
  4. The original code includes parallel processing and data serialization for handling large-scale data; those parts are removed here
  5. In the original repo, the main code lives in the deepwalk folder, with __main__.py as the entry point; the data is in the example_graphs folder
from six import iterkeys
from six.moves import range, zip, zip_longest
from collections import defaultdict
from collections.abc import Iterable  # importing Iterable from collections (not collections.abc) is deprecated since Python 3.3
from scipy.io import loadmat
from scipy.sparse import issparse
from gensim.models import Word2Vec, KeyedVectors
from sklearn.utils import shuffle as skshuffle
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

import random
import numpy

import warnings
warnings.filterwarnings('ignore')

I. Defining the graph data structure

The graph is stored as an adjacency list.

class Graph(defaultdict):
    """
    Stores the graph as a dict, i.e. an adjacency list: each key is a node id and
    the value is the list of ids of its adjacent nodes.
    """
    def __init__(self):
        super(Graph, self).__init__(list)

    def nodes(self):
        """Return all nodes in the graph."""
        return self.keys()

    def adjacency_iter(self):
        """Return the adjacency list."""
        return self.items()

    def subgraph(self, nodes={}):
        """Given a set of nodes, return the corresponding induced subgraph."""
        subgraph = Graph()
        for n in nodes:
            if n in self:
                subgraph[n] = [x for x in self[n] if x in nodes]
        return subgraph

    def check_self_loops(self):
        """Detect self-loops (a node whose neighbor list contains itself)."""
        for x in self:
            for y in self[x]:
                if x == y:
                    return True
        return False

    def remove_self_loops(self):
        """Remove self-loops."""
        for x in self:
            if x in self[x]:
                self[x].remove(x)
        return self

    def make_consistent(self):
        """Sort and deduplicate every neighbor list, then remove self-loops."""
        for k in iterkeys(self):
            self[k] = list(sorted(set(self[k])))
        self.remove_self_loops()
        return self

    def make_undirected(self):
        """Convert to an undirected graph by adding the reverse of every edge."""
        for v in list(self):
            for other in self[v]:
                if v != other:
                    self[other].append(v)
        self.make_consistent()
        return self

    def has_edge(self, v1, v2):
        """Check whether there is an edge between two nodes.
        Note: since Graph is a defaultdict, indexing a missing node silently creates it."""
        if v2 in self[v1] or v1 in self[v2]:
            return True
        return False

    def degree(self, nodes=None):
        """Return the degree of the given node, or a dict of degrees for an iterable of nodes."""
        if isinstance(nodes, Iterable):
            return {v: len(self[v]) for v in nodes}
        else:
            return len(self[nodes])

    def order(self):
        """Number of nodes in the graph."""
        return len(self)

    def number_of_edges(self):
        """Number of edges in the graph."""
        return sum([self.degree(x) for x in self.keys()]) / 2  # sum of all node degrees, divided by 2

    def number_of_nodes(self):
        """Number of nodes in the graph."""
        return self.order()

    def random_walk(self, path_length, alpha=0, rand=random.Random(), start=None):
        """
        Return a truncated random walk.
        path_length: length of the walk
        alpha: probability of restarting from the start node
        start: node the walk starts from
        """
        G = self
        if start:
            path = [start]
        else:
            path = [rand.choice(list(G.keys()))]

        while len(path) < path_length:
            cur = path[-1]
            # only continue from nodes of degree > 0, i.e. nodes that have neighbors
            if len(G[cur]) > 0:
                if rand.random() >= alpha:
                    path.append(rand.choice(G[cur]))
                else:
                    # with probability alpha, jump back to the start node
                    path.append(path[0])
            else:
                break
        return [str(node) for node in path]
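
As a quick usage check (not in the original post; the toy graph, node ids and seed below are arbitrary), here is a minimal sketch of the Graph API:

# hypothetical toy graph: a triangle (1-2-3) plus a dangling node 4
g = Graph()
g[1] = [2, 3]
g[2] = [1, 3]
g[3] = [1, 2]
g[4] = [3]
g.make_undirected()           # adds the missing 3 -> 4 edge and sorts neighbor lists

print(g.number_of_nodes())    # 4
print(g.number_of_edges())    # 4.0 (sum of degrees is 8, divided by 2)
print(g.degree(1))            # 2
# a truncated random walk of length 5 starting from node 1, with a fixed seed
print(g.random_walk(5, rand=random.Random(0), start=1))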

II. Loading data into a Graph object

1. Helper function

def grouper(n, iterable, padvalue=None):
    "grouper(3, 'abcdefg', 'x') --> ('a','b','c'), ('d','e','f'), ('g','x','x')"
    return zip_longest(*[iter(iterable)]*n, fillvalue=padvalue)
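
For instance, grouping a 7-character string into chunks of 3 pads the last chunk with the fill value:

print(list(grouper(3, 'abcdefg', 'x')))
# [('a', 'b', 'c'), ('d', 'e', 'f'), ('g', 'x', 'x')]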

2. Loading adjacency-list data (adjlist)

The data format is:

1 2 3 4 5 6 7 8 9 11 12 13 14 18 20 22 32

2 1 3 4 8 14 18 20 22 31

3 1 2 4 8 9 10 14 28 29 33

4 1 2 3 8 13 14

5 1 7 11

6 1 7 11 17

The first number on each line is a node of the graph, and the numbers that follow are the nodes adjacent to it.

def load_adjacencylist(file_, undirected=False, chunksize=10000):
    """
    Parse the file in chunks of `chunksize` lines (one node's neighbor list per line).
    """
    parse_func = parse_adjacencylist_unchecked
    convert_func = from_adjlist_unchecked

    adjlist = []
    with open(file_) as f:
        for adj_chunk in map(parse_func, grouper(int(chunksize), f)):
            adjlist.extend(adj_chunk)
    G = convert_func(adjlist)

    # convert to an undirected graph
    if undirected:
        G = G.make_undirected()

    return G

def parse_adjacencylist_unchecked(f):
    """
    Input:  ('1 2 3', '2 1', '3 1')
    Output: [[1, 2, 3], [2, 1], [3, 1]]
    """
    adjlist = []
    for l in f:
        if l and l[0] != "#":  # skip padding (None) and comment lines
            adjlist.append([int(x) for x in l.strip().split()])
    return adjlist

def from_adjlist_unchecked(adjlist):
    """
    Input:  [[1, 2, 3], [2, 1], [3, 1]]
    Output: a Graph instance, e.g. {1: [2, 3], 2: [1], 3: [1]}
    """
    G = Graph()
    for row in adjlist:
        node = row[0]
        neighbors = row[1:]
        G[node] = neighbors
    return G

G1 = load_adjacencylist('./data/karate.adjlist')
print(G1)
Graph(<class 'list'>, {1: [2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 18, 20, 22, 32], 2: [1, 3, 4, 8, 14, 18, 20, 22, 31], 3: [1, 2, 4, 8, 9, 10, 14, 28, 29, 33], 4: [1, 2, 3, 8, 13, 14], 5: [1, 7, 11], 6: [1, 7, 11, 17], 7: [1, 5, 6, 17], 8: [1, 2, 3, 4], 9: [1, 3, 31, 33, 34], 10: [3, 34], 11: [1, 5, 6], 12: [1], 13: [1, 4], 14: [1, 2, 3, 4, 34], 15: [33, 34], 16: [33, 34], 17: [6, 7], 18: [1, 2], 19: [33, 34], 20: [1, 2, 34], 21: [33, 34], 22: [1, 2], 23: [33, 34], 24: [26, 28, 30, 33, 34], 25: [26, 28, 32], 26: [24, 25, 32], 27: [30, 34], 28: [3, 24, 25, 34], 29: [3, 32, 34], 30: [24, 27, 33, 34], 31: [2, 9, 33, 34], 32: [1, 25, 26, 29, 33, 34], 33: [3, 9, 15, 16, 19, 21, 23, 24, 30, 31, 32, 34], 34: [9, 10, 14, 15, 16, 19, 20, 21, 23, 24, 27, 28, 29, 30, 31, 32, 33]})
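
The loaded graph can be probed with the Graph helpers; a small check (the values follow from the adjacency list above, and 78 is the known edge count of Zachary's karate club):

print(G1.number_of_nodes())  # 34
print(G1.number_of_edges())  # 78.0
print(G1.has_edge(1, 2))     # True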

3. Loading edge-list data (edgelist)

The data format is:

0 1

0 2

3 7

3 8

Each line represents an edge connecting two nodes.

def load_edgelist(file_, undirected=True):
    G = Graph()
    with open(file_) as f:
        for l in f:  # read the file line by line
            x, y = l.strip().split()[:2]
            x = int(x)
            y = int(y)
            G[x].append(y)
            if undirected:  # for an undirected graph, also add the reverse edge
                G[y].append(x)
    G.make_consistent()
    return G
G2 = load_edgelist("./data/p2p-Gnutella08.edgelist")
print(G2.number_of_nodes())
6301

4. Loading adjacency-matrix data (mat)

def load_matfile(file_, variable_name="network", undirected=True):
    mat_variables = loadmat(file_)
    mat_matrix = mat_variables[variable_name]
    return from_numpy(mat_matrix, undirected)

def from_numpy(x, undirected=True):
    G = Graph()
    # sparse-matrix input
    if issparse(x):
        cx = x.tocoo()  # convert to COO format
        for i, j, v in zip(cx.row, cx.col, cx.data):
            G[i].append(j)
    else:
        raise Exception("Dense matrices not yet supported.")
    if undirected:
        G.make_undirected()

    G.make_consistent()
    return G

G3 = load_matfile("./data/blogcatalog.mat")
print(G3.number_of_nodes())
10312

III. Generating the corpus

1. Parameters

seed = 0
number_walks = 80  # number of random-walk passes over the whole graph (each pass walks from every node)
walk_length = 40   # length of each random walk
num_walks = len(G3.nodes()) * number_walks  # total number of walk sequences generated
data_size = num_walks * walk_length
representation_size = 128  # dimensionality of the node embeddings
window_size = 10   # window size for word2vec training
workers = 4        # number of parallel workers
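
With the BlogCatalog graph loaded above (10,312 nodes), these settings give num_walks = 10,312 × 80 = 824,960 walk sequences and data_size = 824,960 × 40 = 32,998,400 tokens; the 824,960 matches the corpus size printed below.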

2. Generating the corpus

def build_deepwalk_corpus(G, num_paths, path_length, alpha=0, rand=random.Random(0)):
    walks = []
    nodes = list(G.nodes())
    # num_paths passes over the graph; each pass starts one walk from every node,
    # visiting the start nodes in a fresh random order
    for cnt in range(num_paths):
        rand.shuffle(nodes)
        for node in nodes:
            walks.append(G.random_walk(path_length, rand=rand, alpha=alpha, start=node))
    return walks

walks = build_deepwalk_corpus(G3, 
                              num_paths=number_walks,
                              path_length=walk_length,
                              alpha=0,
                              rand=random.Random(seed))
print(len(walks))
print(len(walks[0]))
print(walks[0])
824960
40
['597', '4373', '1360', '7894', '4162', '4445', '1452', '1635', '3764', '8343', '8762', '8323', '5255', '4175', '445', '1230', '1704', '327', '3197', '3280', '3695', '3857', '855', '1555', '4414', '862', '2357', '686', '1969', '2009', '3338', '4560', '5090', '6622', '6072', '4637', '5050', '4804', '7072', '6916']

3. Training with word2vec

model = Word2Vec(walks,
                 size=representation_size,  # renamed to vector_size in gensim >= 4.0
                 window=window_size,
                 min_count=0, sg=1, hs=1,  # keep every node; skip-gram with hierarchical softmax
                 workers=workers)
model.wv.save_word2vec_format("blogcatalog.embeddings")
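
Once trained, the embeddings can be queried like any gensim KeyedVectors object. A minimal sketch (illustrative, not in the original post; node '597' is simply one of the nodes in the sample walk above):

vec = model.wv['597']                        # the 128-dimensional embedding of node 597
print(vec.shape)                             # (128,)
print(model.wv.most_similar('597', topn=5))  # five nearest nodes by cosine similarity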

IV. Evaluation

1. Parameters

embeddings_file = "blogcatalog.embeddings"
matfile = "./data/blogcatalog.mat"
adj_matrix_name = "network"
label_matrix_name = "group"
num_shuffles = 10

2. Helper functions

def sparse2graph(x):
    """Convert a sparse adjacency matrix into an adjacency list represented as a dict."""
    G = defaultdict(lambda: set())
    cx = x.tocoo()
    for i, j, v in zip(cx.row, cx.col, cx.data):
        G[i].add(j)
    return {str(k): [str(x) for x in v] for k, v in G.items()}
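
A tiny illustrative check of sparse2graph on a hand-built sparse matrix (not part of the original post):

from scipy.sparse import csr_matrix

demo = csr_matrix([[0, 1, 1],
                   [1, 0, 0],
                   [1, 0, 0]])
print(sparse2graph(demo))  # {'0': ['1', '2'], '1': ['0'], '2': ['0']}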

class TopKRanker(OneVsRestClassifier):
    """One-vs-rest classifier that, for each sample, returns its k highest-probability
    labels, where k is supplied per sample (here: the sample's true number of labels)."""
    def predict(self, X, top_k_list):
        assert X.shape[0] == len(top_k_list)
        probs = numpy.asarray(super(TopKRanker, self).predict_proba(X))
        all_labels = []
        for i, k in enumerate(top_k_list):
            probs_ = probs[i, :]
            labels = self.classes_[probs_.argsort()[-k:]].tolist()  # the k classes with the largest probabilities
            all_labels.append(labels)
        return all_labels
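
A minimal sketch of the top-k semantics on synthetic multi-label data (make_multilabel_classification and all values here are illustrative assumptions, not from the original post):

from sklearn.datasets import make_multilabel_classification

X_demo, y_demo = make_multilabel_classification(n_samples=50, n_classes=4, random_state=0)
ranker = TopKRanker(LogisticRegression(solver='lbfgs'))
ranker.fit(X_demo, y_demo)
# ask for the top-2 labels of the first sample and the top-1 label of the second
print(ranker.predict(X_demo[:2], top_k_list=[2, 1]))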

3. Loading the data

# load the node embeddings
model = KeyedVectors.load_word2vec_format(embeddings_file, binary=False)

# load the labels
mat = loadmat(matfile)
A = mat[adj_matrix_name]
graph = sparse2graph(A)
labels_matrix = mat[label_matrix_name]  # shape: (number of nodes, number of label classes)
labels_count = labels_matrix.shape[1]

# multi-label binarizer; fixing the classes up front keeps every transform on the same columns
mlb = MultiLabelBinarizer(classes=range(labels_count))
# embedding matrix: row i is the embedding of node i
features_matrix = numpy.asarray([model[str(node)] for node in range(len(graph))])

# shuffle the data
shuffles = []  # stores the result of each shuffle
for x in range(num_shuffles):
    shuffles.append(skshuffle(features_matrix, labels_matrix))

# stores the evaluation results
all_results = defaultdict(list)
# fraction of the data used for training: 0.1, 0.5 and 0.9
training_percents = [0.1, 0.5, 0.9]

4. Training the node classifier and evaluating it

# loop over the different train/test split ratios
for train_percent in training_percents:
    # repeat the experiment once per shuffle (num_shuffles runs in total)
    for shuf in shuffles:
        X, y = shuf
        # training split
        training_size = int(train_percent * X.shape[0])
        X_train = X[:training_size, :]
        y_train_ = y[:training_size]
        y_train = [[] for _ in range(y_train_.shape[0])]
        cy = y_train_.tocoo()
        for i, j in zip(cy.row, cy.col):
            y_train[i].append(j)

        assert sum(len(l) for l in y_train) == y_train_.nnz
        # test split
        X_test = X[training_size:, :]
        y_test_ = y[training_size:]
        y_test = [[] for _ in range(y_test_.shape[0])]
        cy = y_test_.tocoo()
        for i, j in zip(cy.row, cy.col):
            y_test[i].append(j)

        # train the model
        clf = TopKRanker(LogisticRegression(solver='lbfgs'))
        clf.fit(X_train, y_train_)

        # predict: for each test node, request as many labels as it truly has
        top_k_list = [len(l) for l in y_test]
        preds = clf.predict(X_test, top_k_list)

        # evaluate
        results = {}
        averages = ["micro", "macro"]
        for average in averages:
            results[average] = f1_score(mlb.fit_transform(y_test), mlb.fit_transform(preds), average=average)
        all_results[train_percent].append(results)

print('Results, using embeddings of dimensionality', X.shape[1])
print('-------------------')
for train_percent in sorted(all_results.keys()):
    print('Train percent:', train_percent)
    for index, result in enumerate(all_results[train_percent]):
        print('Shuffle #%d:   ' % (index + 1), result)
    avg_score = defaultdict(float)
    for score_dict in all_results[train_percent]:
        for metric, score in score_dict.items():
            avg_score[metric] += score
    for metric in avg_score:
        avg_score[metric] /= len(all_results[train_percent])
    print('Average score:', dict(avg_score))
    print('-------------------')
Results, using embeddings of dimensionality 128
-------------------
Train percent: 0.1
Shuffle #1:    {'micro': 0.3581986673814812, 'macro': 0.2033224239333088}
Shuffle #2:    {'micro': 0.3652487714987715, 'macro': 0.21577524088832908}
Shuffle #3:    {'micro': 0.3623166141792765, 'macro': 0.21066689705245478}
Shuffle #4:    {'micro': 0.3620649919336253, 'macro': 0.21171869067147162}
Shuffle #5:    {'micro': 0.35988947731982507, 'macro': 0.20222950493659103}
Shuffle #6:    {'micro': 0.35737906636929934, 'macro': 0.21090059048705442}
Shuffle #7:    {'micro': 0.3616287094547964, 'macro': 0.20471820148032435}
Shuffle #8:    {'micro': 0.3655955211289209, 'macro': 0.21922779915266496}
Shuffle #9:    {'micro': 0.35839791299010204, 'macro': 0.21189205464993427}
Shuffle #10:    {'micro': 0.35914952410193424, 'macro': 0.2078152517515965}
Average score: {'micro': 0.36098692563580326, 'macro': 0.20982666550037302}
-------------------
Train percent: 0.5
Shuffle #1:    {'micro': 0.4175778546712803, 'macro': 0.27454889378894143}
Shuffle #2:    {'micro': 0.4148700939745716, 'macro': 0.27046150803807273}
Shuffle #3:    {'micro': 0.41626129256428074, 'macro': 0.2696294691077568}
Shuffle #4:    {'micro': 0.41131664853101196, 'macro': 0.2687751830941183}
Shuffle #5:    {'micro': 0.4086511885019347, 'macro': 0.2628792857558395}
Shuffle #6:    {'micro': 0.42060622914349277, 'macro': 0.2676333163255493}
Shuffle #7:    {'micro': 0.4159658072521715, 'macro': 0.2685014990215809}
Shuffle #8:    {'micro': 0.4115448504983389, 'macro': 0.271229381068436}
Shuffle #9:    {'micro': 0.41506565307532833, 'macro': 0.27456128536082786}
Shuffle #10:    {'micro': 0.41693180246230455, 'macro': 0.2744381873233862}
Average score: {'micro': 0.41487914206747156, 'macro': 0.27026580088845087}
-------------------
Train percent: 0.9
Shuffle #1:    {'micro': 0.42203742203742206, 'macro': 0.25880516173753676}
Shuffle #2:    {'micro': 0.4207232267037552, 'macro': 0.28166124425360894}
Shuffle #3:    {'micro': 0.43083275980729524, 'macro': 0.28905126447658364}
Shuffle #4:    {'micro': 0.4314789687924016, 'macro': 0.2977340602872717}
Shuffle #5:    {'micro': 0.44467640918580376, 'macro': 0.289030378588737}
Shuffle #6:    {'micro': 0.4166666666666667, 'macro': 0.2857354498771346}
Shuffle #7:    {'micro': 0.44308111792774363, 'macro': 0.3100004806293008}
Shuffle #8:    {'micro': 0.42270194986072424, 'macro': 0.28487379305227356}
Shuffle #9:    {'micro': 0.451985559566787, 'macro': 0.2897217493931982}
Shuffle #10:    {'micro': 0.428067700987306, 'macro': 0.2626375242068254}
Average score: {'micro': 0.4312251781535906, 'macro': 0.28492511065024706}
-------------------