Common Machine Learning Algorithms (8): The SVM (Support Vector Machine) Algorithm

Calling the sklearn API directly:

from sklearn import svm                                # support vector machine
module = svm.LinearSVC()
module.fit(x, y)
module.score(x, y)
module.predict(test)
module.decision_function(test)  # LinearSVC has no predict_proba; it returns decision scores
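
Note that LinearSVC exposes decision_function (signed confidence scores) but not predict_proba. A minimal sketch, in case calibrated probabilities are needed: the classifier can be wrapped in sklearn's CalibratedClassifierCV, which fits a calibration model on top of the decision scores.

from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

calibrated = CalibratedClassifierCV(LinearSVC(), cv=5)  # calibrate the decision scores
calibrated.fit(x, y)
calibrated.predict_proba(test)                          # calibrated class probabilities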

Complete code:

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, svm, model_selection

# Dataset: the iris dataset
'''
Number of samples: 150
Number of classes: 3 (setosa, versicolor, virginica)
Each sample has 4 features: sepal length, sepal width, petal length, petal width
'''
def load_data_classification():
    iris = datasets.load_iris()
    x_train = iris.data
    y_train = iris.target
    """
    test_size:如果是浮點數,在0-1之間,表示樣本佔比;如果是整數的話就是樣本的數量;
    random_state:隨機數種子,種子不同,每次採的樣本不一樣;種子相同,採的樣本不變
    (random_state不取,採樣數據不同,但random_state等於某個值,採樣數據相同,取0的時候也相同;
    stratify: 時候按照一定的比例抽取樣本,這個參數很神奇。默認的情況是None, 給值得時候是也很神奇,給的是一個標籤序列。
    比如說,你將原數據集的y標籤給入超參。那麼隨機抽取的樣本是按照y標籤內樣本分佈抽取的
    """
    x_train, x_test, y_train, y_test = model_selection.train_test_split(x_train, y_train,
                                     test_size=0.25, random_state=0, stratify=y_train)
    return x_train, x_test, y_train, y_test
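
# Added sketch (not in the original post): a quick check that stratify keeps the
# class proportions of y in both splits; np.bincount counts the samples per class.
def check_stratify_demo():
    x_train, x_test, y_train, y_test = load_data_classification()
    print('train class counts:', np.bincount(y_train))  # roughly equal, e.g. [37 37 38]
    print('test  class counts:', np.bincount(y_test))   # roughly equal, e.g. [13 13 12]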

'''
Calling the linear classifier with its default parameters, which are defined as:
penalty = 'l2'           regularization term
loss = 'squared_hinge'   the square of the hinge loss
dual = True              solve the dual problem
tol = 0.0001             tolerance for terminating the iterations
C = 1.0                  penalty (regularization) parameter
multi_class = 'ovr'      multi-class strategy: one-vs-rest
fit_intercept = True     fit an intercept, i.e. the constant term of the decision function
intercept_scaling = 1    each instance X becomes the vector [X, intercept_scaling]; this is
                         equivalent to appending an artificial feature that is constant for all instances
class_weight = None      all classes are given weight 1
verbose = 0              verbose output disabled
random_state = None      use the default random number generator
max_iter = 1000          maximum number of iterations
'''
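
# Added for intuition (not part of the original code): with penalty='l2' and
# loss='squared_hinge', LinearSVC minimizes, in the binary case with labels
# y_i in {-1, +1} and ignoring how the intercept is regularized,
#     0.5 * ||w||^2 + C * sum_i max(0, 1 - y_i * (w . x_i + b))**2
# The helper below evaluates this objective with numpy.
def linear_svc_objective(w, b, x, y, C=1.0):
    margins = y * (x.dot(w) + b)                 # signed margins y_i * f(x_i)
    violations = np.maximum(0.0, 1.0 - margins)  # hinge terms
    return 0.5 * np.dot(w, w) + C * np.sum(violations ** 2)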
# Fit the classifier to obtain the parameters w and b, and report the prediction accuracy
def test_LinearSVC(x_train, x_test, y_train, y_test):
    lsvc = svm.LinearSVC()  # linear SVM
    lsvc.fit(x_train, y_train)  # fit the model
    """
    用於繪製支撐向量所在的直線:
    svc.coef_:算法模型的係數,有兩個值,因爲樣本有兩種特徵,每個特徵對應一個係數;
    係數:特徵與樣本分類結果的關係係數;
    svc.intercept_:模型的截距,一維向量,只有一個數,因爲只有一條直線;
    係數:w = svc.coef_
    截距:b = svc.intercept_
    決策邊界直線方程:w[0] * x0 + w[1] * x1 + b = 0
    支撐向量直線方程:w[0] * x0 + w[1] * x1 + b = ±1
    變形:
    決策邊界:x1 = -w[0]/w[1] * x0 - b/w[1]
    支撐向量:x1 = -w[0]/w[1] * x0 - b/w[1] ± 1/w[1]
    """
    print('Coefficients, intercept: ', lsvc.coef_, lsvc.intercept_)
    print('Score: ', lsvc.score(x_test, y_test))
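
# Added sketch: under the one-vs-rest strategy the prediction is the argmax of the
# per-class decision values, so predict() can be reproduced from coef_ and intercept_.
def check_decision_values_demo(lsvc, x_test):
    scores = x_test.dot(lsvc.coef_.T) + lsvc.intercept_   # shape (n_samples, 3)
    assert (scores.argmax(axis=1) == lsvc.predict(x_test)).all()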

# Examine the effect of the two loss functions on prediction
def test_LinearSVC_loss(x_train, x_test, y_train, y_test):
    """
    "hinge":用於最大間隔(maximum-margin)分類,其中最有代表性的就是支持向量機SVM;
    "squared_hinge":帶有二次懲罰的線性SVM的損失
    """
    losses = ['hinge', 'squared_hinge']
    for loss in losses:
        lsvc = svm.LinearSVC(loss=loss)
        lsvc.fit(x_train, y_train)
        print('Loss: ', loss)
        print('Coefficients, intercept: ', lsvc.coef_, lsvc.intercept_)
        print('Score: ', lsvc.score(x_test, y_test))
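
# Added sketch: the two losses evaluated on a few margins m = y * f(x);
# squared_hinge penalizes large violations more strongly and is smooth at m = 1.
def compare_losses_demo():
    m = np.array([-1.0, 0.0, 0.5, 1.0, 2.0])
    print('hinge:        ', np.maximum(0, 1 - m))       # [2.  1.  0.5 0.  0. ]
    print('squared_hinge:', np.maximum(0, 1 - m) ** 2)  # [4.   1.   0.25 0.   0.  ]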

# Examine the effect of the L1 and L2 penalties on prediction
def test_LinearSVC_L12(x_train, x_test, y_train, y_test):
    L12 = ['l1', 'l2']
    for p in L12:
        lsvc = svm.LinearSVC(penalty=p, dual=False)  # penalty='l1' requires dual=False
        lsvc.fit(x_train, y_train)
        print('Penalty: ', p)
        print('Coefficients, intercept: ', lsvc.coef_, lsvc.intercept_)
        print('Score: ', lsvc.score(x_test, y_test))
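
# Added sketch: the l1 penalty tends to drive some coefficients exactly to zero
# (sparse solutions), which the count below makes visible.
def check_l1_sparsity_demo(x_train, y_train):
    for p in ['l1', 'l2']:
        lsvc = svm.LinearSVC(penalty=p, dual=False)
        lsvc.fit(x_train, y_train)
        print(p, 'zero coefficients:', int(np.sum(lsvc.coef_ == 0)))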

# Examine the effect of the penalty parameter C on prediction
# C is the regularization parameter; it can be read as the weight placed on classification
# errors (the larger C is, the less misclassification is tolerated). A small C allows a few
# samples to be misclassified.
def test_LinearSVC_C(x_train, x_test, y_train, y_test):
    """
    def logspace(start, stop, num=50, endpoint=True, base=10.0, dtype=None,axis=0);
    np.logspace(-2, 1)即從10^-2到10^1,等間隔輸出50個數
    """
    C = np.logspace(-2, 1)
    train_scores = []
    test_scores = []
    for c in C:
        lsvc = svm.LinearSVC(C=c)
        lsvc.fit(x_train, y_train)
        train_scores.append(lsvc.score(x_train, y_train))
        test_scores.append(lsvc.score(x_test, y_test))

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(C, train_scores, label='Training score')
    ax.plot(C, test_scores, label='Testing score')
    ax.set_title('test_LinearSVC_C')
    ax.set_xlabel(r'C')
    ax.set_ylabel(r'Score')
    ax.set_xscale('log')
    ax.legend(loc='best')
    plt.savefig('test_LinearSVC_C.png')  # save before show(); show() closes the figure
    plt.show()
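
# Added sketch (not in the original post): the same search over C can also be done
# with model_selection.GridSearchCV instead of a hand-written loop and plot.
def tune_C_demo(x_train, y_train):
    grid = model_selection.GridSearchCV(svm.LinearSVC(),
                                        {'C': np.logspace(-2, 1, 10)}, cv=5)
    grid.fit(x_train, y_train)
    print('best C:', grid.best_params_['C'], ' cv score:', grid.best_score_)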

if __name__ == '__main__':
    x_train, x_test, y_train, y_test = load_data_classification()

    print('Result #1:')
    test_LinearSVC(x_train, x_test, y_train, y_test)
    print()
    print('-------------------------------------')
    print('Result #2:')
    test_LinearSVC_loss(x_train, x_test, y_train, y_test)
    print()
    print('-------------------------------------')
    print('Result #3:')
    test_LinearSVC_L12(x_train, x_test, y_train, y_test)
    print()
    print('-------------------------------------')
    print('Result #4:')
    test_LinearSVC_C(x_train, x_test, y_train, y_test)
    print('finished!')
