RBM Source Code Analysis

RBM Source Code

The source code below is the companion code of the book 《深入淺出深度學習原理剖析與Python實踐》 (see the book's GitHub project).

Data Preprocessing

# coding: utf-8
#
# data_preprocess.py
#
# Author: Huang Anbu
# Date: 2017.4
#
# Description: data preprocessing; after executing this script you will get a new file
#   "data.pkl", which contains the following data structure:
#
#   - min_user_id
#   - max_user_id
#   - min_movie_id
#   - max_movie_id
#   - train_set
#
# Copyright©2017. All Rights Reserved. 
# ===============================================================================================


from basiclib import *

if __name__ == "__main__":
    # get the current working directory
    cur_dir = os.getcwd()
    # build the path to the dataset directory
    path = os.path.join(cur_dir, "dataset")
    # change into the dataset directory
    os.chdir(path)
    data = {}
    max_user_id, min_user_id = 0, 1000000
    max_movie_id, min_movie_id = 0, 1000000
    with open("ratings.dat", "rb") as fin:
        for line in fin:
            # split each line on "::" and convert each field to int
            # (ratings.dat format: UserID::MovieID::Rating::Timestamp)
            ls = map(int, line.split("::"))
            user, movie, rate = ls[0], ls[1], ls[2]
            max_user_id = max(max_user_id, user)
            min_user_id = min(min_user_id, user)

            max_movie_id = max(max_movie_id, movie)
            min_movie_id = min(min_movie_id, movie)

            if user not in data:
                data[user] = [(movie, rate)]
            else:
                data[user].append((movie, rate))

    # shift 1-based IDs down so that both user and movie IDs become 0-based
    if min_user_id == 1:
        max_user_id = max_user_id - 1
        min_user_id = min_user_id - 1

    if min_movie_id == 1:
        max_movie_id = max_movie_id - 1
        min_movie_id = min_movie_id - 1

    train_set = numpy.zeros((max_user_id+1, max_movie_id+1))
    # change back to the original working directory
    os.chdir(cur_dir)
    # fill the (user x movie) rating matrix
    for k, v in data.iteritems():
        for m, r in v:
            train_set[k-1][m-1]=r 

    with open("data.pkl", "wb") as fout:
        cPickle.dump((min_user_id, max_user_id, min_movie_id, max_movie_id, train_set), fout)   
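
To sanity-check the output, here is a minimal loader sketch (an addition for illustration; it assumes data.pkl was just written to the current directory by the script above, and simply mirrors the dump order used there):

import cPickle
import numpy

# reload the tuple in the same order it was dumped above
with open("data.pkl", "rb") as fin:
    min_user_id, max_user_id, min_movie_id, max_movie_id, train_set = cPickle.load(fin)

print(train_set.shape)                 # (max_user_id+1, max_movie_id+1)
print(numpy.count_nonzero(train_set))  # number of observed ratings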

Model Parameter Settings

# coding: utf-8
#
# basiclib.py
#
# Author: Huang Anbu
# Date: 2017.3
#
# Description: Basic Configuration and Interface
# 
# options: hyper-parameter setting
#
# optimizer: optimization algorithm
#
# Copyright©2017. All Rights Reserved. 
# ===============================================================================================

import os 
import sys
import numpy
import theano
import time
import cPickle
import itertools
import glob, gzip
import theano.tensor as T
from theano.tensor.signal import pool
from theano.tensor.nnet import conv2d
import matplotlib.pyplot as plt
from collections import *
import optimization
from theano.tensor.shared_randomstreams import RandomStreams


options = {
    "batch_size" : 1,
    "lr" : 0.05,
    "cd_k" : 15,
    "n_hidden" : 100,
    "print_freq" : 50,
    "valid_freq" : 50,
    "n_epoch" : 100,
    "optimizer" : "adadelta"
}

optimizer = {"sgd" : optimization.sgd, 
            "momentum" : optimization.momentum, 
            "nesterov_momentum" : optimization.nesterov_momentum, 
            "adagrad" : optimization.adagrad,
            "adadelta" : optimization.adadelta,
            "rmsprop" : optimization.rmsprop}

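Note that rbm.py below writes plain gradient-descent updates directly inside get_cost_updates and never consults this table; the sketch below only illustrates how the table resolves a name from options to an update rule (the actual call signature is defined in optimization.py, which is not reproduced here):

# illustrative lookup only: resolve the update rule named in the hyper-parameters
update_fn = optimizer[options["optimizer"]]  # here resolves to optimization.adadelta
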

Training the Model

# coding: utf-8
#
# rbm.py
#
# Author: Huang Anbu
# Date: 2017.4
#
# Description: rbm-based collaborative filtering algorithm
#
# contrastive divergence, proposed by Geoffrey Hinton, is the most commonly used algorithm
#   for training RBM models
#
# Dataset citation:
#   F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History
#       and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4,
#       Article 19 (December 2015), 19 pages. DOI=http://dx.doi.org/10.1145/2827872 
#
# Copyright©2017. All Rights Reserved. 
# ===============================================================================================

from __future__ import print_function
from basiclib import *

# rbm = RBM(input=x, n_visible=WS*5, n_hidden=n_hidden, numpy_rng=rng, theano_rng=theano_rng)   
class RBM(object):
    def __init__(
        self,
        input=None,
        n_visible=784,
        n_hidden=500,
        W=None,
        hbias=None,
        vbias=None,
        numpy_rng=None,
        theano_rng=None
    ):
        self.input = input
        self.n_visible = n_visible
        self.n_hidden = n_hidden

        if numpy_rng is None:
            # create a random number generator
            numpy_rng = numpy.random.RandomState(1234)

        if theano_rng is None:
            # symbolic counterpart of numpy.random.RandomState; it can instantiate
            # random variables drawn from various distributions
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        if W is None:
            initial_W = numpy.asarray(
                numpy_rng.uniform(
                    low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)),
                    high=4 * numpy.sqrt(6. / (n_hidden + n_visible)),
                    size=(n_visible, n_hidden)
                ),
                dtype=theano.config.floatX
            )
            # theano shared variables for weights and biases
            W = theano.shared(value=initial_W, name='W', borrow=True)

        if hbias is None:
            hbias = theano.shared(
                value=numpy.zeros(
                    n_hidden,
                    dtype=theano.config.floatX
                ),
                name='hbias',
                borrow=True
            )

        if vbias is None:
            vbias = theano.shared(
                value=numpy.zeros(
                    n_visible,
                    dtype=theano.config.floatX
                ),
                name='vbias',
                borrow=True
            )

        self.W = W
        self.hbias = hbias
        self.vbias = vbias
        self.theano_rng = theano_rng
        self.params = [self.W, self.hbias, self.vbias]
    # free energy F(v)
    def free_energy(self, v_sample):
        # v.W + hbias
        wx_b = T.dot(v_sample, self.W) + self.hbias
        # v.vbias
        vbias_term = T.dot(v_sample, self.vbias)
        # sum over all hidden units: log(1 + exp(v.W + hbias))
        hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1)
        return -hidden_term - vbias_term
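    # For reference (a standard RBM identity, stated here for clarity): with visible
    # bias a = vbias and hidden bias b = hbias,
    #   F(v) = -a.v - sum_j log(1 + exp((v.W + b)_j))
    # so that P(v) is proportional to exp(-F(v)); the code above computes exactly
    # this quantity, one row per sample.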

    # probability that each hidden unit is 1: P(h=1|v)
    def propup(self, vis):
        # pre-sigmoid activation v.W + hbias, where v is one user's one-hot rating vector
        pre_sigmoid_activation = T.dot(vis, self.W) + self.hbias
        return [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)]
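    # i.e. P(h_j = 1 | v) = sigmoid((v.W + hbias)_j). The pre-sigmoid value is returned
    # alongside the probability so that downstream graph optimizations can operate on it
    # directly, a convention inherited from the Theano RBM tutorial this code follows.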

    # from the visible layer compute: 1. the pre-sigmoid activation v.W + hbias;
    # 2. P(h=1|v), the conditional distribution of the hidden units;
    # 3. a sample of h drawn from P(h=1|v)
    def sample_h_given_v(self, v0_sample):
        # 1. pre-sigmoid activation v.W + hbias; 2. P(h=1|v)
        pre_sigmoid_h1, h1_mean = self.propup(v0_sample)
        # sample the hidden units: one Bernoulli trial (n=1) per unit with success
        # probability p=h1_mean; the result has shape h1_mean.shape
        h1_sample = self.theano_rng.binomial(size=h1_mean.shape, n=1, p=h1_mean, dtype=theano.config.floatX)
        return pre_sigmoid_h1, h1_mean, h1_sample

    def propdown(self, hid):
        pre_sigmoid_activation = T.dot(hid, self.W.T) + self.vbias
        return [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)]

    # reconstruct the visible layer from the hidden layer
    def sample_v_given_h(self, mask, h0_sample):
        '''For recommendation, use softmax instead of sigmoid'''
        # pre-activation of the visible units
        pre_activation = T.dot(h0_sample, self.W.T) + self.vbias  # (n_visible, )
        # reshape to (number of movies, 5): one row per movie, one column per rating
        # value (this assumes batch_size == 1, as set in options)
        pre_activation = pre_activation.reshape((self.n_visible/5, 5))
        # argmax over each row gives a value in 0-4: the index of the most likely rating
        state = T.argmax(pre_activation, axis=1)
        # initialize the output to all zeros
        output = T.zeros_like(pre_activation).astype(theano.config.floatX)
        # put a 1 at the argmax position of each row: set_subtensor(x, y) fills x with y
        # and returns the result
        ret = T.set_subtensor(output[T.arange(state.shape[0]), state], 1.0).reshape(mask.shape)
        # multiply by the mask to zero out predictions for movies the user has not rated
        return ret * mask
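    # Worked example (illustrative): if one movie's five pre-activations are
    # [0.2, 1.3, 0.7, -0.1, 0.4], argmax yields state=1, so the reconstructed one-hot
    # block is [0, 1, 0, 0, 0], i.e. a predicted rating of 2. Note that this takes the
    # mode of the distribution rather than sampling from a softmax, despite the docstring.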
    # Gibbs sampling h->v->h; returns: 1. the visible sample; 2. the hidden pre-sigmoid
    # activation; 3. P(h=1|v); 4. the hidden sample
    def gibbs_hvh(self, h0_sample, mask):
        v1_sample = self.sample_v_given_h(mask, h0_sample)
        pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v1_sample)
        return [v1_sample, pre_sigmoid_h1, h1_mean, h1_sample]

    # Gibbs sampling v->h->v; returns: 1. the hidden sample; 2. the visible sample
    def gibbs_vhv(self, v0_sample, mask):
        pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v0_sample)
        v1_sample = self.sample_v_given_h(mask, h1_sample)
        return [h1_sample, v1_sample]
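    # gibbs_hvh is the building block of CD-k training below (the chain starts from a
    # hidden sample), while gibbs_vhv is used at evaluation time to measure how well a
    # single reconstruction round recovers the input ratings.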

    # call site: cost, updates = rbm.get_cost_updates(mask, lr=lr, persistent=persistent_chain, k=cd_k)
    def get_cost_updates(self, mask, lr=0.1, persistent=None, k=1):
        # from the input compute: 1. the pre-sigmoid activation; 2. P(h=1|v), the
        # conditional distribution of the hidden units; 3. a hidden sample from P(h=1|v)
        pre_h_mean, h_mean, ph_sample = self.sample_h_given_v(self.input)

        if persistent is None:
            chain_start = ph_sample
        else:
            chain_start = persistent

        # run k Gibbs steps: from h0 sample v1, from v1 sample h1, and so on
        (
            [
                nv_samples,
                pre_nh_mean, 
                nh_mean,
                nh_samples
            ],
            updates
        ) = theano.scan(
            self.gibbs_hvh,
            outputs_info=[None, None, None, chain_start],
            n_steps=k,
            non_sequences = [mask],
            name="gibbs_hvh"
        )

        # visible-layer sample from the final Gibbs step
        chain_end = nv_samples[-1]
        # cost: difference between the mean free energy of the data and of the chain end
        cost = T.mean(self.free_energy(self.input)) - T.mean(self.free_energy(chain_end))

        # compute gradients, treating chain_end as a constant (do not backpropagate
        # through the Gibbs chain)
        gparams = T.grad(cost, self.params, consider_constant=[chain_end])
        #gw = T.dot(self.input.reshape((self.n_visible, 1)), h_mean.reshape((1, self.n_hidden))) - T.dot(chain_end.reshape((self.n_visible, 1)), nh_mean[-1].reshape((1, self.n_hidden)))
        #ghbias = h_mean.reshape(self.hbias.shape) - nh_mean[-1].reshape(self.hbias.shape)
        #gvbias = self.input.reshape(self.vbias.shape) - chain_end.reshape(self.vbias.shape)

        #gparams = [gw, ghbias, gvbias]
        # parameter updates: plain gradient descent with the given learning rate
        for gparam, param in zip(gparams, self.params):
            # make sure that the learning rate is of the right dtype
            updates[param] = param - gparam * T.cast(
                lr,
                dtype=theano.config.floatX
            )
        if persistent is not None:
            # for persistent CD, carry the Gibbs chain state over to the next mini-batch
            updates[persistent] = nh_samples[-1]

        return cost, updates
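    # CD-k in brief: the exact log-likelihood gradient is
    #   d(-log P(v))/d(theta) = dF(v)/d(theta) - E_model[dF(v')/d(theta)]
    # and CD-k replaces the intractable model expectation with the k-step Gibbs sample
    # chain_end above, which is why chain_end is held constant in T.grad.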

    # stochastic proxy for the log-likelihood: flip one visible bit per call and compare
    # free energies (a tutorial-style helper; not used by train_rbm below)
    def get_pseudo_likelihood_cost(self, updates):
        bit_i_idx = theano.shared(value=0, name='bit_i_idx')
        xi = T.round(self.input)
        fe_xi = self.free_energy(xi)
        xi_flip = T.set_subtensor(xi[:, bit_i_idx], 1 - xi[:, bit_i_idx])
        fe_xi_flip = self.free_energy(xi_flip)

        cost = T.mean(self.n_visible * T.log(T.nnet.sigmoid(fe_xi_flip -
                                                            fe_xi)))

        updates[bit_i_idx] = (bit_i_idx + 1) % self.n_visible

        return cost

    # cross-entropy between the input and its sigmoid reconstruction (also a
    # tutorial-style helper, unused by train_rbm below)
    def get_reconstruction_cost(self, updates, pre_sigmoid_nv):
        cross_entropy = T.mean(
            T.sum(
                self.input * T.log(T.nnet.sigmoid(pre_sigmoid_nv)) +
                (1 - self.input) * T.log(1 - T.nnet.sigmoid(pre_sigmoid_nv)),
                axis=1
            )
        )

        return cross_entropy
    # per-user error rate of the visible layer before vs. after one round of Gibbs sampling
    def get_reconstruction(self, x, mask):
        [h1_sample, v1_sample] = self.gibbs_vhv(x, mask)
        x = x.astype('int8')
        v1_sample = v1_sample.astype('int8')
        # a movie counts as correctly reconstructed only if its whole 5-way one-hot
        # block matches (the reshape again assumes batch_size == 1)
        return 1.0 - T.mean(T.all(T.eq(x, v1_sample).reshape((self.n_visible/5, 5)), axis=1))
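    # Example (illustrative): with input block [0,0,1,0,0] and reconstruction
    # [0,1,0,0,0], T.all(T.eq(...)) is False for that movie, so it counts as an error;
    # unrated movies compare as all-zero blocks on both sides and thus count as correct,
    # which dilutes the reported error rate.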


def train_rbm():

    lr = options["lr"]
    batch_size = options["batch_size"]  # batch size is 1: one user's ratings per training step
    n_hidden = options["n_hidden"]

    with open("data.pkl", "rb") as fin:
        min_user_id, max_user_id, min_movie_id, max_movie_id, train_set = cPickle.load(fin)

    print(min_user_id, max_user_id, min_movie_id, max_movie_id)

    HS, WS = train_set.shape
    # training input: values of the visible units
    new_train_set = numpy.zeros((HS, WS*5))
    # training mask: marks which movies each user has rated; only those entries take part
    # in training, and the unmarked entries are the missing values to be predicted
    new_train_mask = numpy.zeros((HS, WS*5))
    # one-hot encode the original ratings, e.g. 3 --> 0,0,1,0,0
    for row in range(HS):
        for col in range(WS):
            r = int(train_set[row][col]) # (user, movie) = r 
            if r==0:
                continue
            new_train_set[row][col*5+r-1] = 1
            new_train_mask[row][col*5:col*5+5] = 1
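    # Worked example (illustrative): a rating of 4 for the movie in column 10 sets
    # new_train_set[row][10*5 + 3] = 1 and marks mask columns 50..54 as observed.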

    print(numpy.mean(new_train_mask))

    new_train_set = new_train_set.astype(theano.config.floatX)
    new_train_mask = new_train_mask.astype(theano.config.floatX)
    # number of mini-batches (one user per batch, since batch_size is 1)
    n_train_batches = new_train_set.shape[0] // batch_size

    x = T.matrix('x')  # each row is one user's one-hot encoded rating vector
    mask = T.matrix('mask')
    cd_k = T.iscalar('cd_k')
    lr = T.scalar('lr', dtype=theano.config.floatX)
    # fix the random seed so that experiments are reproducible
    rng = numpy.random.RandomState(123)
    # Theano random number stream
    theano_rng = RandomStreams(rng.randint(2 ** 30))
    # borrow=True means the shared variable aliases the numpy buffer, so changes to it
    # are visible through persistent_chain.get_value()
    persistent_chain = theano.shared(numpy.zeros((batch_size, n_hidden), dtype=theano.config.floatX), borrow=True)

    # construct the RBM class
    rbm = RBM(input=x, n_visible=WS*5, n_hidden=n_hidden, numpy_rng=rng, theano_rng=theano_rng)
    # build the cost expression and the parameter updates
    cost, updates = rbm.get_cost_updates(mask, lr=lr, persistent=persistent_chain, k=cd_k)

    # compiled training function
    train_model = theano.function([x, mask, cd_k, lr], outputs=cost, updates=updates, name='train_rbm')
    # reconstruction error of the visible layer after one Gibbs round
    check_model = theano.function([x, mask], outputs=rbm.get_reconstruction(x, mask), name='check_model')
    numpy.set_printoptions(threshold=numpy.inf)

    output = open("output_persistent_k3_lr0.1.txt", "wb")
    # 20 training epochs (note: options["n_epoch"] is 100, but this loop runs 20)
    for epoch in range(20):
        mean_cost = []
        error = []
        # loop over all users (one user per mini-batch)
        for batch_index in range(n_train_batches):
            if epoch < 3:
                # use CD-1 for the first three epochs
                cd_k = 1
            else:
                # afterwards the number of Gibbs steps grows with the epoch
                cd_k = 2 + (epoch - 3)/2

            if epoch < 3:
                lr = 0.05
            else:
                # afterwards the learning rate grows with the epoch
                lr = 0.05 + ((epoch - 3)/2)*0.01
            # one mini-batch of training data
            batch_data = new_train_set[batch_index*batch_size:(batch_index+1)*batch_size]
            # the corresponding mask
            batch_data_mask = new_train_mask[batch_index*batch_size:(batch_index+1)*batch_size]

            mean_cost += [train_model(batch_data, batch_data_mask, cd_k, lr)]

            error += [check_model(batch_data, batch_data_mask)]

        # average cost over this epoch
        print("epoch %d end, cost: %lf"%(epoch, numpy.mean(mean_cost)))
        # actual reconstruction error rate
        print("epoch %d end, error: %lf"%(epoch, numpy.mean(error)))
        # append the error to the output file
        print("%lf"%(numpy.mean(error)), file=output)


if __name__ == '__main__':
    train_rbm()
    #make_recom()

Similar RBM Code Analyses

RBM source code analysis 1
RBM source code analysis 2
