四、一元線性迴歸
1. 預測函數
輸入 | 輸出 |
---|---|
0 | 1 |
1 | 3 |
2 | 5 |
3 | 7 |
4 | 9 |
… | … |
- 預測函數爲:$y = w_0 + w_1 x$(由上表可得 $w_0 = 1$,$w_1 = 2$,即 $y = 2x + 1$)
預測:輸入10;輸出21
-
根據已知的輸入和輸出,任務就是尋找預測函數中的模型參數 $w_0$ 和 $w_1$,以滿足輸入和輸出之間的聯繫。
2. 單樣本誤差
- 隨機取一個樣本 $(x, y)$,代入預測函數得到 $y' = w_0 + w_1 x$,則單樣本誤差爲:$e = \frac{1}{2}(y - y')^2$
3. 總樣本誤差
- 總樣本誤差爲:$E = \sum_{i} \frac{1}{2}\left(y_i - y_i'\right)^2$
4. 損失函數
- 損失函數爲:$\mathrm{loss}(w_0, w_1) = \frac{1}{2}\sum_{i}\left(y_i - (w_0 + w_1 x_i)\right)^2$
-
任務就是尋找可以使損失函數取得最小值的模型參數 $w_0$ 和 $w_1$。
5. 梯度下降法尋優
-
隨機選擇一組模型參數 $w_0$ 和 $w_1$,計算損失函數在該模型參數處的梯度 $\nabla \mathrm{loss} = \left(\frac{\partial \mathrm{loss}}{\partial w_0}, \frac{\partial \mathrm{loss}}{\partial w_1}\right)$
-
計算與該梯度反方向的修正步長 $-\eta \nabla \mathrm{loss}$(其中 $\eta$ 爲學習率)
-
計算下一組模型參數:$w_0 \leftarrow w_0 - \eta \frac{\partial \mathrm{loss}}{\partial w_0}$,$w_1 \leftarrow w_1 - \eta \frac{\partial \mathrm{loss}}{\partial w_1}$
-
直到滿足迭代終止條件:
- 迭代足夠多次;
- 損失值已經足夠小;
- 損失值已經不再明顯減少。
# gd.py
import numpy as np
import matplotlib.pyplot as mp
from mpl_toolkits.mplot3d import axes3d
# Batch gradient descent for one-variable linear regression y = w0 + w1*x.
# Sample inputs.
train_x = np.array([0.5, 0.6, 0.8, 1.1, 1.4])
# Sample outputs.
train_y = np.array([5.0, 5.5, 6.0, 6.8, 7.0])
# Number of training iterations.
n_epoches = 1000
# Learning rate (gradient step size).
lrate = 0.01
# Histories recorded over the run: epoch numbers and loss values.
epoches, losses = [], []
# Parameter histories, seeded with the initial guess w0 = w1 = 1.
w0, w1 = [1], [1]
for epoch in range(1, n_epoches + 1):
    # Per-sample residual of the current model (computed once per epoch).
    residual = train_y - (w0[-1] + w1[-1] * train_x)
    epoches.append(epoch)
    # Half sum-of-squared-errors loss at the current parameters.
    losses.append((residual ** 2 / 2).sum())
    # Partial derivatives of the loss with respect to w0 and w1.
    grad_w0 = -residual.sum()
    grad_w1 = -(residual * train_x).sum()
    # Step both parameters against the gradient.
    w0.append(w0[-1] - lrate * grad_w0)
    w1.append(w1[-1] - lrate * grad_w1)
# Drop the extra parameter value computed on the final pass so the
# parameter histories line up one-to-one with `epoches`/`losses`.
w0 = np.array(w0[:-1])
w1 = np.array(w1[:-1])
# Indices that sort the samples by input, so curves draw left to right.
order = train_x.argsort()
test_x = train_x[order]
test_y = train_y[order]
# Model predictions at the (sorted) test inputs.
pred_test_y = w0[-1] + w1[-1] * test_x
# Evaluate the loss over a dense (w0, w1) grid for visualisation.
grid_w0, grid_w1 = np.meshgrid(np.linspace(0, 9, 500), np.linspace(0, 3.5, 500))
w0_flat = grid_w0.ravel()
w1_flat = grid_w1.ravel()
# Predictions for every grid point: shape (n_samples, n_grid_points).
flat_pred = w0_flat + np.outer(train_x, w1_flat)
# Half-SSE per grid point, reshaped back onto the grid.
grid_loss = ((((flat_pred - train_y.reshape(-1, 1)) ** 2).sum(axis=0)) / 2).reshape(grid_w0.shape)
# ————————————————————————————————————————————————————————————
# Figure 1: fitted regression line against the sample points.
mp.figure('Linear Regression', dpi=120)
mp.title('Linear Regression', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
# Training samples.
mp.scatter(train_x, train_y, marker='s', c='dodgerblue',
           alpha=0.5, s=80, label='Training')
# Test samples (same points, sorted by input).
mp.scatter(test_x, test_y, marker='D', c='orangered',
           alpha=0.5, s=60, label='Testing')
# Model predictions at the test inputs.
mp.scatter(test_x, pred_test_y, c='orangered',
           alpha=0.5, s=60, label='Predicted')
# Vertical segments showing each prediction's error.
for xi, actual, predicted in zip(test_x, test_y, pred_test_y):
    mp.plot([xi, xi], [actual, predicted], c='orangered',
            alpha=0.5, linewidth=1)
# The regression line implied by the learned parameters.
mp.plot(test_x, pred_test_y, '--', c='limegreen',
        label='Regression', linewidth=1)
mp.legend()
# ————————————————————————————————————————————————————————
# Figure 2: evolution of w0, w1 and the loss over the iterations.
mp.figure('Training Progress', dpi=120)
# One panel per curve: (subplot position, data series, colour, label).
panels = [
    (311, w0, 'dodgerblue', 'w0'),
    (312, w1, 'limegreen', 'w1'),
    (313, losses, 'orangered', 'loss'),
]
for position, series, colour, label in panels:
    mp.subplot(position)
    if position == 311:
        mp.title('Training Progress', fontsize=20)
    if position == 313:
        mp.xlabel('epoch', fontsize=14)
    mp.ylabel(label, fontsize=14)
    # Tick every 100 epochs.
    mp.gca().xaxis.set_major_locator(mp.MultipleLocator(100))
    mp.tick_params(labelsize=10)
    mp.grid(linestyle=':')
    mp.plot(epoches, series, c=colour, label=label)
    mp.legend()
mp.tight_layout()
# Figure 3: 3-D surface of the loss over the (w0, w1) grid, with the
# gradient-descent trajectory drawn on top.
fig = mp.figure('Loss Function', dpi=120)
# FIX: mp.gca(projection='3d') was deprecated in Matplotlib 3.4 and
# removed in 3.6 — create the 3-D axes via add_subplot instead.
ax = fig.add_subplot(projection='3d')
mp.title('Loss Function', fontsize=20)
ax.set_xlabel('w0', fontsize=14)
ax.set_ylabel('w1', fontsize=14)
ax.set_zlabel('loss', fontsize=14)
mp.tick_params(labelsize=10)
ax.plot_surface(grid_w0, grid_w1, grid_loss, rstride=10, cstride=10, cmap='jet')
# Path of (w0, w1, loss) visited by batch gradient descent.
ax.plot(w0, w1, losses, 'o-', c='orangered', label='BGD')
mp.legend()
# Figure 4: contour view of the loss surface with the BGD trajectory.
mp.figure('Batch Gradient Descent', dpi=120)
mp.title('Batch Gradient Descent', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
# Filled background plus ten labelled black contour lines.
mp.contourf(grid_w0, grid_w1, grid_loss, 1000, cmap='jet')
contour_set = mp.contour(grid_w0, grid_w1, grid_loss, 10,
                         colors='black', linewidths=0.5)
mp.clabel(contour_set, inline_spacing=0.1, fmt='%.2f', fontsize=8)
# Descent path projected onto the (w0, w1) plane.
mp.plot(w0, w1, 'o-', c='orangered', label='BGD')
mp.legend()
mp.show()
6. 工具包
import sklearn.linear_model as lm
線性迴歸器 = lm.LinearRegression()
線性迴歸器.fit(已知輸入, 已知輸出) # 計算模型參數
線性迴歸器.predict(新的輸入) ->新的輸出
import pickle # 保存模型
import numpy as np
import sklearn.linear_model as lm
import sklearn.metrics as sm
import matplotlib.pyplot as mp
import pickle
# Read the samples: each line is "feature1,...,featureN,target".
x, y = [], []
with open('../data/single.txt', 'r') as f:
    for line in f:
        values = [float(token) for token in line.split(',')]
        x.append(values[:-1])
        y.append(values[-1])
# Convert to arrays for sklearn.
x = np.array(x)
y = np.array(y)
# To reuse a previously trained model instead of refitting, uncomment:
# with open('../data/linear.pkl', 'rb') as f:
#     model = pickle.load(f)
# Fit an ordinary least-squares linear model.
model = lm.LinearRegression()
model.fit(x, y)
# Predict on the training inputs.
pred_y = model.predict(x)
# R^2 score: 1 means a perfect fit, values near 0 mean large errors.
print(sm.r2_score(y, pred_y))
# To persist the fitted model, uncomment:
# with open('../../data/linear.pkl', 'wb') as f:
#     pickle.dump(model, f)
# Plot the samples and the fitted regression line.
mp.figure('Linear Regression', dpi=120)
mp.title('Linear Regression', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.scatter(x, y, c='dodgerblue', alpha=0.75, s=60, label='Sample')
# Sort by the (flattened) inputs so the line is drawn left to right.
order = x.ravel().argsort()
mp.plot(x[order], pred_y[order], c='orangered', label='Regression')
mp.legend()
mp.show()
0.7362638998481811
五、嶺迴歸
- 通過正則的方法,即在損失函數中加入正則項,以減弱模型參數對訓練數據的匹配度,藉以規避少數明顯偏移正常範圍的異常樣本影響模型的迴歸效果。
lm.Ridge(300, fit_intercept=True)
(正則強度,是否約束)
# rdg.py
import numpy as np
import sklearn.linear_model as lm
import matplotlib.pyplot as mp
# Read samples that contain a few abnormal (outlier) points.
x, y = [], []
with open('../data/abnormal.txt', 'r') as f:
    for line in f:
        values = [float(token) for token in line.split(',')]
        x.append(values[:-1])
        y.append(values[-1])
x = np.array(x)
y = np.array(y)
# Ordinary least-squares regression for comparison.
model1 = lm.LinearRegression()
model1.fit(x, y)
pred_y1 = model1.predict(x)
# Ridge regression; with regularization strength 0 it reduces to OLS.
model2 = lm.Ridge(300, fit_intercept=True)
model2.fit(x, y)
pred_y2 = model2.predict(x)
# Plot both fits over the samples.
mp.figure('Linear & Ridge Regression', dpi=120)
mp.title('Linear & Ridge Regression', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.scatter(x, y, c='dodgerblue', alpha=0.75, s=60, label='Sample')
# Draw both regression lines in input order.
order = x.ravel().argsort()
mp.plot(x[order], pred_y1[order], c='orangered', label='Linear')
mp.plot(x[order], pred_y2[order], c='limegreen', label='Ridge')
mp.legend()
mp.show()
六、多項式迴歸
- 多元線性迴歸:$y = w_0 + w_1 x_1 + w_2 x_2 + \cdots + w_n x_n$
- 將一元多項式中的 $x, x^2, \ldots, x^n$ 分別看作多元線性迴歸中的 $x_1, x_2, \ldots, x_n$,即可用多元線性迴歸求解一元多項式迴歸
- 一元多項式迴歸:$y = w_0 + w_1 x + w_2 x^2 + \cdots + w_n x^n$
x → 多項式特徵擴展器 → x¹…xⁿ → 線性迴歸器 → w0…wn;特徵擴展器與線性迴歸器串聯,整體構成管線(Pipeline)
# poly.py
import numpy as np
import sklearn.pipeline as pl
import sklearn.preprocessing as sp
import sklearn.linear_model as lm
import sklearn.metrics as sm
import matplotlib.pyplot as mp
# Read training samples: each line is "feature1,...,featureN,target".
train_x, train_y = [], []
with open('../data/single.txt', 'r') as f:
    for line in f:
        values = [float(token) for token in line.split(',')]
        train_x.append(values[:-1])
        train_y.append(values[-1])
train_x = np.array(train_x)
train_y = np.array(train_y)
# Pipeline: expand x into polynomial features up to degree 10,
# then fit a linear regressor on the expanded features.
model = pl.make_pipeline(sp.PolynomialFeatures(10), lm.LinearRegression())
model.fit(train_x, train_y)
pred_train_y = model.predict(train_x)
# Goodness of fit on the training data.
print(sm.r2_score(train_y, pred_train_y))
# Dense, evenly spaced inputs to draw a smooth regression curve.
test_x = np.linspace(train_x.min(), train_x.max(), 1000).reshape(-1, 1)
pred_test_y = model.predict(test_x)
# Plot the samples and the polynomial regression curve.
mp.figure('Polynomial Regression', dpi=120)
mp.title('Polynomial Regression', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.scatter(train_x, train_y, c='dodgerblue', alpha=0.75, s=60, label='Sample')
mp.plot(test_x, pred_test_y, c='orangered', label='Regression')
mp.legend()
mp.show()
0.7868629092058498