Regression (Simple Linear Regression, Ridge Regression, Multiple Linear Regression, Polynomial Regression)

IV. Simple Linear Regression

1. Prediction Function

Input   Output
0       1
1       3
2       5
3       7
4       9
  • The prediction function is $y = 1 + 2x$.

  • Prediction: for input 10, the output is 21.

  • With $y = w_0 + w_1x$, the task is to find the model parameters $w_0$ and $w_1$ of the prediction function so that it captures the relationship between inputs and outputs.
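A minimal check of this prediction function against the table, in plain NumPy (file and variable names here are illustrative):

# check_fit.py
import numpy as np

x = np.array([0, 1, 2, 3, 4])
y = np.array([1, 3, 5, 7, 9])
w0, w1 = 1, 2                          # parameters read off the table

print(np.array_equal(w0 + w1 * x, y))  # True: fits every sample exactly
print(w0 + w1 * 10)                    # 21: prediction for the new input 10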

2. Single-Sample Error

  • Substitute an input $x$ into the prediction function to get $y' = w_0 + w_1x$; with the true output $y$, the single-sample error is $e = \frac{(y - y')^2}{2}$.

3. Total Sample Error

  • The total sample error is $E = \sum \frac{(y - y')^2}{2}$.

4. Loss Function

  • The loss function is $Loss(w_0, w_1) = \sum \frac{(y - (w_0 + w_1x))^2}{2}$.

  • The task is to find the model parameters $w_0$ and $w_1$ that minimize this loss function.
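Written as code, the loss is just a function of the two parameters; a small sketch over the toy table above (the function name loss is an illustrative choice):

# loss_demo.py
import numpy as np

x = np.array([0, 1, 2, 3, 4])
y = np.array([1, 3, 5, 7, 9])

def loss(w0, w1):
    # half the sum of squared errors over all samples
    return (((y - (w0 + w1 * x)) ** 2) / 2).sum()

print(loss(1, 2))  # 0.0  -- the true parameters give zero loss
print(loss(0, 0))  # 82.5 -- a poor guess gives a large loss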

5. Optimization by Gradient Descent

  1. Randomly choose an initial set of model parameters $w_0$, $w_1$ and compute the gradient of the loss function at those parameters: $[\partial Loss/\partial w_0, \partial Loss/\partial w_1]$.

  2. Compute the correction step opposite to that gradient: $[-\eta\,\partial Loss/\partial w_0, -\eta\,\partial Loss/\partial w_1]$, where $\eta$ is the learning rate.

  3. Compute the next set of model parameters:
    $w_0 = w_0 - \eta\,\partial Loss/\partial w_0$
    $w_1 = w_1 - \eta\,\partial Loss/\partial w_1$

  4. Repeat until a termination condition is met (see the early-stopping sketch after this list):

    • enough iterations have been run;
    • the loss value is already small enough;
    • the loss value no longer decreases noticeably.
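The script below simply runs a fixed number of epochs, which covers the first criterion; the other two could be added with an early-exit helper along these lines (a sketch; the thresholds min_loss and min_delta are assumptions, not values from the script):

# early_stop.py (illustrative)
min_loss, min_delta = 1e-4, 1e-8  # assumed thresholds

def should_stop(losses, epoch, n_epoches):
    if epoch >= n_epoches:                # enough iterations have run
        return True
    if losses and losses[-1] < min_loss:  # loss already small enough
        return True
    if len(losses) > 1 and losses[-2] - losses[-1] < min_delta:
        return True                       # loss no longer decreasing noticeably
    return False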

$Loss(w_0, w_1) = \sum \frac{(y - (w_0 + w_1x))^2}{2}$

$\partial Loss/\partial w_0 = \sum \partial\left(\frac{(y - y')^2}{2}\right)/\partial w_0 = -\sum (y - y')$

$\partial Loss/\partial w_1 = \sum \partial\left(\frac{(y - y')^2}{2}\right)/\partial w_1 = -\sum (y - y')x$
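Before coding the update loop, these derivatives can be sanity-checked numerically against finite differences; a sketch using the same samples as the script below (the step h is an illustrative choice):

# grad_check.py
import numpy as np

x = np.array([0.5, 0.6, 0.8, 1.1, 1.4])
y = np.array([5.0, 5.5, 6.0, 6.8, 7.0])

def loss(w0, w1):
    return (((y - (w0 + w1 * x)) ** 2) / 2).sum()

w0, w1, h = 1.0, 1.0, 1e-6
d0 = -(y - (w0 + w1 * x)).sum()        # analytic dLoss/dw0
d1 = -((y - (w0 + w1 * x)) * x).sum()  # analytic dLoss/dw1
print(d0, (loss(w0 + h, w1) - loss(w0, w1)) / h)  # each pair should nearly agree
print(d1, (loss(w0, w1 + h) - loss(w0, w1)) / h)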

# gd.py
import numpy as np
import matplotlib.pyplot as mp
from mpl_toolkits.mplot3d import axes3d

# Sample inputs
train_x = np.array([0.5, 0.6, 0.8, 1.1, 1.4])
# Sample outputs
train_y = np.array([5.0, 5.5, 6.0, 6.8, 7.0])
# Number of training epochs
n_epoches = 1000
# Learning rate
lrate = 0.01
# Records of epoch numbers and loss values
epoches, losses = [], []
# Initial parameter values: w0 = 1, w1 = 1
w0, w1 = [1], [1]

# Iterate
for epoch in range(1, n_epoches + 1):
    epoches.append(epoch)  # record the epoch number
    # Compute and record this epoch's loss (see the loss formula above)
    losses.append(((train_y - (w0[-1] + w1[-1] * train_x)) ** 2 / 2).sum())
    # Optionally print the current state
#     print('{:4}> w0={:.8f}, w1={:.8f}, loss={:.8f}'.format(epoches[-1], w0[-1], w1[-1], losses[-1]))
    # Partial derivative of the loss with respect to w0
    d0 = -(train_y - (w0[-1] + w1[-1] * train_x)).sum()
    # Partial derivative of the loss with respect to w1
    d1 = -((train_y - (w0[-1] + w1[-1] * train_x)) * train_x).sum()
    # Append the updated w0 and w1
    w0.append(w0[-1] - lrate * d0)
    w1.append(w1[-1] - lrate * d1)

# Drop the extra values appended after the final epoch
w0 = np.array(w0[:-1])
w1 = np.array(w1[:-1])

# Indices that sort the inputs
sorted_indices = train_x.argsort()
# Inputs and outputs in sorted order, used as a test set
test_x = train_x[sorted_indices]
test_y = train_y[sorted_indices]
# Predicted outputs
pred_test_y = w0[-1] + w1[-1] * test_x

# Parameter grid for visualizing the loss surface
grid_w0, grid_w1 = np.meshgrid(np.linspace(0, 9, 500), np.linspace(0, 3.5, 500))
flat_w0, flat_w1 = grid_w0.ravel(), grid_w1.ravel()
# Loss at every flattened grid point
flat_loss = (((flat_w0 + np.outer(train_x, flat_w1)) - train_y.reshape(-1, 1)) ** 2).sum(axis=0) / 2
# Reshape the losses back onto the grid
grid_loss = flat_loss.reshape(grid_w0.shape)

# ————————————————————————————————————————————————————————————
# Plot the linear regression fit
mp.figure('Linear Regression', dpi=120)
mp.title('Linear Regression', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
# Training data points
mp.scatter(train_x, train_y, marker='s',
           c='dodgerblue', alpha=0.5, s=80,
           label='Training')
# Test data points
mp.scatter(test_x, test_y, marker='D',
           c='orangered', alpha=0.5, s=60,
           label='Testing')
# Predicted data points
mp.scatter(test_x, pred_test_y, c='orangered',
           alpha=0.5, s=60, label='Predicted')

# Draw error segments connecting each test point to its prediction
for x, y, pred_y in zip(
        test_x, test_y, pred_test_y):
    mp.plot([x, x], [y, pred_y], c='orangered',
            alpha=0.5, linewidth=1)
# Draw the regression line given by the fitted equation
mp.plot(test_x, pred_test_y, '--', c='limegreen',
        label='Regression', linewidth=1)
mp.legend()


# ————————————————————————————————————————————————————————
mp.figure('Training Progress', dpi=120)
# Plot w0 over the course of training
mp.subplot(311)
mp.title('Training Progress', fontsize=20)
mp.ylabel('w0', fontsize=14)
mp.gca().xaxis.set_major_locator(mp.MultipleLocator(100))
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')

mp.plot(epoches, w0, c='dodgerblue', label='w0')
mp.legend()

# Plot w1 over the course of training
mp.subplot(312)
mp.ylabel('w1', fontsize=14)
mp.gca().xaxis.set_major_locator(mp.MultipleLocator(100))
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')

mp.plot(epoches, w1, c='limegreen', label='w1')
mp.legend()

# Plot the loss value against the epoch number
mp.subplot(313)
mp.xlabel('epoch', fontsize=14)
mp.ylabel('loss', fontsize=14)
mp.gca().xaxis.set_major_locator(mp.MultipleLocator(100))
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')

mp.plot(epoches, losses, c='orangered', label='loss')
mp.legend()
mp.tight_layout()

# Plot the loss surface
mp.figure('Loss Function', dpi=120)
ax = mp.axes(projection='3d')  # mp.gca(projection='3d') no longer accepts this keyword in newer matplotlib
mp.title('Loss Function', fontsize=20)
ax.set_xlabel('w0', fontsize=14)
ax.set_ylabel('w1', fontsize=14)
ax.set_zlabel('loss', fontsize=14)
mp.tick_params(labelsize=10)

ax.plot_surface(grid_w0, grid_w1, grid_loss, rstride=10, cstride=10, cmap='jet')
ax.plot(w0, w1, losses, 'o-', c='orangered', label='BGD')
mp.legend()

# Contour view of the loss surface and the descent path
mp.figure('Batch Gradient Descent', dpi=120)
mp.title('Batch Gradient Descent', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')

mp.contourf(grid_w0, grid_w1, grid_loss, 1000, cmap='jet')
cntr = mp.contour(grid_w0, grid_w1, grid_loss, 10, colors='black', linewidths=0.5)
mp.clabel(cntr, inline_spacing=0.1, fmt='%.2f', fontsize=8)
mp.plot(w0, w1, 'o-', c='orangered', label='BGD')



mp.legend()
mp.show()

[Figure: Linear Regression]

[Figure: Training Progress]
[Figure: Loss Function]
[Figure: Batch Gradient Descent]
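As a sanity check on the gradient-descent result, the same line also has a closed-form least-squares solution, for example via np.polyfit; a sketch reusing the training data from gd.py:

# closed_form.py
import numpy as np

train_x = np.array([0.5, 0.6, 0.8, 1.1, 1.4])
train_y = np.array([5.0, 5.5, 6.0, 6.8, 7.0])

# Degree-1 least-squares fit; np.polyfit returns coefficients highest power first
w1, w0 = np.polyfit(train_x, train_y, 1)
print(w0, w1)  # should closely match the final w0, w1 from gradient descent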

6. Toolkit

import sklearn.linear_model as lm
model = lm.LinearRegression()            # linear regressor
model.fit(known_inputs, known_outputs)   # compute the model parameters
model.predict(new_inputs)                # -> new outputs

import numpy as np
import sklearn.linear_model as lm
import sklearn.metrics as sm
import matplotlib.pyplot as mp
import pickle  # for saving/loading the model

# Read the data
x, y = [], []
with open('../data/single.txt', 'r') as f:
    for line in f.readlines():
        data = [float(substr) for substr in line.split(',')]
        x.append(data[:-1])
        y.append(data[-1])

# Convert to arrays
x = np.array(x)
y = np.array(y)

# Load a previously saved model (optional)
# with open('../data/linear.pkl', 'rb') as f:
#     model = pickle.load(f)
    
# Create a linear regression model
model = lm.LinearRegression()
# Compute the model parameters
model.fit(x, y)
# Predict outputs for the training inputs
pred_y = model.predict(x)
# R² score: 1 for a perfect fit; it drops toward (and below) 0 as the error grows
print(sm.r2_score(y, pred_y))

# Save the model (optional)
# with open('../data/linear.pkl', 'wb') as f:
#     pickle.dump(model, f)
    
# Plot the regression
mp.figure('Linear Regression', dpi=120)
mp.title('Linear Regression', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.scatter(x, y, c='dodgerblue', alpha=0.75, s=60, label='Sample')
# Flatten x and plot the fitted line in sorted order
sorted_indices = x.ravel().argsort()
mp.plot(x[sorted_indices], pred_y[sorted_indices], c='orangered', label='Regression')

mp.legend()
mp.show()
Output: 0.7362638998481811

[Figure: Linear Regression]
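For reference, sm.r2_score follows the standard definition R² = 1 − SS_res/SS_tot: a perfect fit scores 1, always predicting the mean scores 0, and worse fits can go negative. A minimal sketch reproducing the score by hand:

# r2_demo.py
import numpy as np

def r2(y, pred_y):
    ss_res = ((y - pred_y) ** 2).sum()    # residual sum of squares
    ss_tot = ((y - y.mean()) ** 2).sum()  # total sum of squares
    return 1 - ss_res / ss_tot

y = np.array([1.0, 2.0, 4.0])
print(r2(y, y))                           # 1.0 for a perfect fit
print(r2(y, np.full(3, y.mean())))        # 0.0 when predicting the mean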

V. Ridge Regression

  • Ridge regression applies regularization: a regularization term is added to the loss function to weaken how tightly the model parameters match the training data, so that a few abnormal samples that deviate clearly from the normal range do not distort the regression.

$Loss(w_0, w_1) = \sum \frac{(y - (w_0 + w_1x))^2}{2} + \lambda \times f(w_0, w_1)$, where $\lambda$ is the regularization strength.

  • lm.Ridge(300, fit_intercept=True) (regularization strength, whether to fit an intercept term)
# rdg.py
import numpy as np
import sklearn.linear_model as lm
import matplotlib.pyplot as mp

x, y = [], []
with open('../data/abnormal.txt', 'r') as f:
    for line in f.readlines():
        data = [float(substr) for substr
                in line.split(',')]
        x.append(data[:-1])
        y.append(data[-1])
x = np.array(x)
y = np.array(y)

# Ordinary linear regression
model1 = lm.LinearRegression()
model1.fit(x, y)
pred_y1 = model1.predict(x)

# Ridge regression; a regularization strength of 0 reduces to ordinary linear regression
model2 = lm.Ridge(300, fit_intercept=True)
model2.fit(x, y)
pred_y2 = model2.predict(x)

mp.figure('Linear & Ridge Regression', dpi=120)
mp.title('Linear & Ridge Regression', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')

mp.scatter(x, y, c='dodgerblue', alpha=0.75, s=60, label='Sample')
sorted_indices = x.ravel().argsort()
mp.plot(x[sorted_indices], pred_y1[sorted_indices], c='orangered', label='Linear')
mp.plot(x[sorted_indices], pred_y2[sorted_indices], c='limegreen', label='Ridge')

mp.legend()
mp.show()

[Figure: Linear & Ridge Regression]
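The regularization strength (300 above) is a hyperparameter. One common way to choose it is cross-validation, for which scikit-learn provides lm.RidgeCV; a sketch under the assumption that x and y are loaded as in rdg.py, with an illustrative alphas grid:

# ridgecv.py (assumes x, y loaded as in rdg.py)
import numpy as np
import sklearn.linear_model as lm

# Candidate regularization strengths (an assumed grid, 0.01 .. 1000)
model3 = lm.RidgeCV(alphas=np.logspace(-2, 3, 20), fit_intercept=True)
model3.fit(x, y)
print(model3.alpha_)  # strength selected by cross-validation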

VI. Polynomial Regression

  • Multiple linear regression: $y = w_0 + w_1x_1 + w_2x_2 + w_3x_3 + \dots + w_nx_n$

  • Treat $x^2, x^3, \dots, x^n$ as new features $x_1, x_2, \dots$, so a univariate polynomial becomes a multiple linear model.

  • Univariate polynomial: $y = w_0 + w_1x + w_2x^2 + w_3x^3 + \dots + w_nx^n$

                                      x -> polynomial feature expander -x1...xn-> linear regressor -> w0...wn
                                           \________________________________________________________________/
                                                                       pipeline
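Concretely, the expander appends higher powers of each input feature; a quick sketch for one feature at degree 3 (illustrative inputs):

# expand_demo.py
import numpy as np
import sklearn.preprocessing as sp

x = np.array([[2.0], [3.0]])
print(sp.PolynomialFeatures(3).fit_transform(x))
# [[ 1.  2.  4.  8.]    columns: 1, x, x^2, x^3
#  [ 1.  3.  9. 27.]]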
    
# poly.py
import numpy as np
import sklearn.pipeline as pl
import sklearn.preprocessing as sp
import sklearn.linear_model as lm
import sklearn.metrics as sm
import matplotlib.pyplot as mp

train_x, train_y = [], []
with open('../data/single.txt', 'r') as f:
    for line in f.readlines():
        data = [float(substr) for substr
                in line.split(',')]
        train_x.append(data[:-1])
        train_y.append(data[-1])
        
train_x = np.array(train_x)
train_y = np.array(train_y)
# Build a pipeline: first expand polynomial features (the argument is the highest degree), then fit a linear regressor
model = pl.make_pipeline(sp.PolynomialFeatures(10), lm.LinearRegression())
model.fit(train_x, train_y)
pred_train_y = model.predict(train_x)
print(sm.r2_score(train_y, pred_train_y))

test_x = np.linspace(train_x.min(), train_x.max(), 1000).reshape(-1, 1)
pred_test_y = model.predict(test_x)

mp.figure('Polynomial Regression', dpi=120)
mp.title('Polynomial Regression', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.scatter(train_x, train_y, c='dodgerblue', alpha=0.75, s=60, label='Sample')
mp.plot(test_x, pred_test_y, c='orangered', label='Regression')

mp.legend()
mp.show()
Output: 0.7868629092058498

[Figure: Polynomial Regression]
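A degree-10 polynomial scored on its own training data can look better than it generalizes. A hedged sketch of checking for overfitting with a held-out split (train_test_split from sklearn.model_selection; the 75/25 split and random_state are assumptions, and train_x, train_y are loaded as in poly.py):

# holdout.py (assumes train_x, train_y loaded as in poly.py)
import sklearn.pipeline as pl
import sklearn.preprocessing as sp
import sklearn.linear_model as lm
import sklearn.metrics as sm
import sklearn.model_selection as ms

tx, vx, ty, vy = ms.train_test_split(train_x, train_y, test_size=0.25, random_state=7)
model = pl.make_pipeline(sp.PolynomialFeatures(10), lm.LinearRegression())
model.fit(tx, ty)
print(sm.r2_score(ty, model.predict(tx)))  # score on the training split
print(sm.r2_score(vy, model.predict(vx)))  # score on the held-out split, noticeably lower if overfitting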
