SVM —— 利用完整Platt SMO算法加速優化

Platt SMO算法是通過一個外循環來選擇第一個alpha值的,並且其選擇過程會在兩種方式之間進行交替:

  • 一種是在所有數據集上進行單遍掃描
  • 另一種方式則是在非邊界alpha中實現單遍掃描。(所謂非邊界alpha指的是那些不等於邊界0或者C的alpha值。)

在選擇第一個alpha值之後,算法會通過一個內循環來選擇第二個alpha值。在優化過程中,通過最大化步長的方式來獲得第二個alpha值。

 

import numpy as np


class Optimization:
    """Shared state for the full Platt SMO optimizer.

    Bundles the training data, labels, hyper-parameters, the alpha
    multipliers, the bias term, and the error cache that lets the inner
    loop pick the second alpha by maximal step size.
    """

    def __init__(self, data_mat_in, class_labels, c, toler):
        self.x = data_mat_in            # feature matrix, one sample per row
        self.label_mat = class_labels   # column vector of class labels
        self.c = c                      # regularization constant C
        self.toler = toler              # KKT-violation tolerance
        self.b = 0                      # bias term of the decision function
        self.m = data_mat_in.shape[0]   # number of training samples
        # One Lagrange multiplier per sample, all starting at zero.
        self.alphas = np.asmatrix(np.zeros((self.m, 1)))
        # Error cache: column 0 is a validity flag, column 1 the cached E_k.
        self.e_cache = np.asmatrix(np.zeros((self.m, 2)))


def load_data_set(file_name):
    """Parse a tab-delimited training file.

    Each line is expected to hold two feature values followed by a class
    label, separated by tabs.

    Args:
        file_name: path to the tab-separated text file.

    Returns:
        (data_mat, label_mat): list of [x1, x2] feature pairs and the
        matching list of float labels.
    """
    data_mat = []
    label_mat = []
    # 'with' guarantees the handle is closed; the original leaked it.
    with open(file_name) as file:
        for line in file:
            fields = line.strip().split('\t')
            data_mat.append([float(fields[0]), float(fields[1])])
            label_mat.append(float(fields[2]))
    return data_mat, label_mat


def select_rand_j(i, m):
    """Return a random index in [0, m) guaranteed to differ from i."""
    candidate = int(np.random.uniform(0, m))
    # Rejection sampling: redraw until the index differs from i.
    while candidate == i:
        candidate = int(np.random.uniform(0, m))
    return candidate


def calc_ek(opt, k):
    """Return the prediction error E_k = f(x_k) - y_k for sample k.

    f(x_k) is the current decision value, computed with the linear
    kernel (plain dot products) against every training sample.
    """
    weights = np.multiply(opt.alphas, opt.label_mat).T
    kernel_col = opt.x * opt.x[k, :].T  # linear kernel of x_k vs. all samples
    prediction = float(weights * kernel_col) + opt.b
    return prediction - float(opt.label_mat[k])


def clip_alpha(aj, high, low):
    """Clamp aj into [low, high]; low wins if the bounds cross."""
    # Equivalent to the sequential high-then-low checks of the original.
    return max(low, min(aj, high))


def select_j(i, opt, e_i):
    """Choose the second alpha index j maximizing the step |E_i - E_j|.

    Marks E_i as valid in the error cache first. While the cache holds no
    other valid entry, falls back to a random index.

    Returns:
        (j, e_j): the chosen index and its prediction error.
    """
    opt.e_cache[i] = [1, e_i]  # record E_i so other samples can pair with i
    valid_indices = np.nonzero(opt.e_cache[:, 0].A)[0]
    if len(valid_indices) <= 1:
        # Cache not warmed up yet: pick j at random.
        j = select_rand_j(i, opt.m)
        return j, calc_ek(opt, j)
    best_j, best_delta, best_e = -1, 0, 0
    for k in valid_indices:
        if k == i:
            continue
        e_k = calc_ek(opt, k)
        delta = abs(e_i - e_k)
        if delta > best_delta:
            best_j, best_delta, best_e = k, delta, e_k
    return best_j, best_e


def update_e_k(opt, k):
    """Recompute E_k and store it in the error cache as a valid entry."""
    opt.e_cache[k] = [1, calc_ek(opt, k)]


def inner_loop(i, opt):
    """Attempt one joint optimization of alphas[i] and a second alpha.

    If alphas[i] violates the KKT conditions by more than opt.toler, pick
    j to maximize |E_i - E_j|, solve the two-variable subproblem
    analytically, clip alphas[j] into its feasible box, move alphas[i]
    accordingly, refresh the error cache, and recompute the bias b.

    Returns:
        1 if an alpha pair was changed, 0 otherwise.
    """
    e_i = calc_ek(opt, i)
    # Proceed only when alpha_i violates KKT beyond the tolerance and the
    # violating alpha is not pinned at its bound (0 or C).
    if (opt.label_mat[i] * e_i < -opt.toler and opt.alphas[i] < opt.c) \
            or (opt.label_mat[i] * e_i > opt.toler and opt.alphas[i] > 0):
        j, e_j = select_j(i, opt, e_i)
        alpha_i_old = opt.alphas[i].copy()
        alpha_j_old = opt.alphas[j].copy()
        # Feasible interval [low, high] for alpha_j, derived from
        # alpha_i*y_i + alpha_j*y_j = const and 0 <= alpha <= C.
        if opt.label_mat[i] != opt.label_mat[j]:
            low = max(0, opt.alphas[j] - opt.alphas[i])
            high = min(opt.c, opt.c + opt.alphas[j] - opt.alphas[i])
        else:
            low = max(0, opt.alphas[j] + opt.alphas[i] - opt.c)
            high = min(opt.c, opt.alphas[j] + opt.alphas[i])
        if low == high:
            print("low == high")
            return 0
        # eta = 2*K_ij - K_ii - K_jj with the linear kernel; it must be
        # negative for the unconstrained optimum below to be a maximum.
        eta = 2.0 * opt.x[i, :] * opt.x[j, :].T - opt.x[i, :] * opt.x[i, :].T \
            - opt.x[j, :] * opt.x[j, :].T
        if eta >= 0:
            # Degenerate curvature: skip this pair rather than handle it.
            print("eta >= 0")
            return 0
        # Unconstrained optimum for alpha_j, then clip into [low, high].
        opt.alphas[j] -= opt.label_mat[j] * (e_i - e_j) / eta
        opt.alphas[j] = clip_alpha(opt.alphas[j], high, low)
        update_e_k(opt, j)
        if abs(opt.alphas[j] - alpha_j_old) < 0.00001:
            print("j not moving enough")
            return 0
        # Move alpha_i by the same amount in the opposite label direction.
        opt.alphas[i] += opt.label_mat[j] * opt.label_mat[i] * (alpha_j_old - opt.alphas[j])
        update_e_k(opt, i)
        # Bias candidates: b1 from sample i's updated error, b2 from sample j's.
        b1 = opt.b - e_i - opt.label_mat[i] * (opt.alphas[i] - alpha_i_old) \
            * opt.x[i, :] * opt.x[i, :].T - opt.label_mat[j] \
            * (opt.alphas[j] - alpha_j_old) * opt.x[i, :] * opt.x[j, :].T
        b2 = opt.b - e_j - opt.label_mat[i] * (opt.alphas[i] - alpha_i_old) \
            * opt.x[i, :] * opt.x[j, :].T - opt.label_mat[j] \
            * (opt.alphas[j] - alpha_j_old) * opt.x[j, :] * opt.x[j, :].T
        if 0 < opt.alphas[i] < opt.c:
            opt.b = b1
        elif 0 < opt.alphas[j] < opt.c:
            opt.b = b2
        else:
            # Both alphas at a bound: average of the two candidates.
            opt.b = (b1 + b2)/2.0
        return 1
    else:
        return 0


def smo_algorithm(data_mat_in, class_labels, c, toler, max_iter, ktup=('lin', 0)):
    """Full Platt SMO: optimize the alphas and bias for a linear-kernel SVM.

    The outer loop alternates between a single pass over the whole data
    set and passes over the non-bound alphas (those strictly between 0
    and c), stopping when max_iter passes are reached or a full-set pass
    changes no alpha pair.

    Args:
        data_mat_in: sequence of feature rows.
        class_labels: sequence of class labels (+1/-1).
        c: regularization constant.
        toler: KKT-violation tolerance.
        max_iter: maximum number of outer-loop passes.
        ktup: kernel description tuple (unused here; linear kernel only).

    Returns:
        (b, alphas): the bias term and the column matrix of multipliers.
    """
    opt = Optimization(np.mat(data_mat_in), np.mat(class_labels).transpose(), c, toler)
    iter_num = 0  # was 'iter', which shadowed the builtin
    entire_set = True
    alpha_pairs_changed = 0
    # Exit when the pass count exceeds max_iter, or when a full sweep of
    # the entire set changes no alpha pair.
    while iter_num < max_iter and (alpha_pairs_changed > 0 or entire_set):
        alpha_pairs_changed = 0
        if entire_set:
            # Consider every sample as a candidate for the first alpha.
            for i in range(opt.m):
                alpha_pairs_changed += inner_loop(i, opt)
            # NOTE: no longer prints the loop index, which was undefined
            # after the loop whenever the data set was empty.
            print("fullSet, iter: %d, pairs changed %d" % (iter_num, alpha_pairs_changed))
        else:
            # Sweep only the non-bound alphas (strictly between 0 and c).
            non_bound_is = np.nonzero((opt.alphas.A > 0) * (opt.alphas.A < c))[0]
            for i in non_bound_is:
                alpha_pairs_changed += inner_loop(i, opt)
                print("non-bound, iter: %d, i:%d, pairs changed: %d" % (iter_num, i, alpha_pairs_changed))
        iter_num += 1
        # Alternate: after a full-set pass switch to non-bound passes; go
        # back to a full-set pass once a non-bound pass changes nothing.
        if entire_set:
            entire_set = False
        elif alpha_pairs_changed == 0:
            entire_set = True
        print("iteration number: %d" % iter_num)
    return opt.b, opt.alphas


if __name__ == '__main__':
    # Script entry point: load the sample data and train the SVM with the
    # full Platt SMO optimizer (C=0.6, tolerance=0.001, 40 passes max).
    features, targets = load_data_set('testSet.txt')
    b, alphas = smo_algorithm(features, targets, 0.6, 0.001, 40)

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章