以下爲 Jupyter 筆記本轉 Markdown 的結果,請結合上下文解讀或運行。
import numpy as np
import matplotlib.pyplot as plt
meshgrid——全排列網格

對於兩個一維座標向量(長度分別爲 m 與 n),meshgrid 構成的則是一個 m×n 的矩形方格,作爲二維平面基底,在網格的點上進行擡高或者下陷,就形成了三維空間的立體圖像。
def f(_X):
    """Objective function: f(x, y) = x^2 + 5*y^2, minimised at the origin.

    _X is any 2-indexable (vector, pair of grid arrays); returns the
    element-wise objective value.
    """
    a, b = _X[0], _X[1]
    return a ** 2 + 5 * b ** 2
def g(_X):
    """Analytic gradient of f at _X: [2*x, 10*y]."""
    dx = 2 * _X[0]
    dy = 10 * _X[1]
    return np.array([dx, dy])
# Sample the plane on a 1000x1000 grid over [-200, 200]^2 and evaluate the
# objective element-wise; X, Y, Z feed the contour plots drawn below.
x = np.linspace(-200, 200, 1000)
y = np.linspace(-200, 200, 1000)
X, Y = np.meshgrid(x, y)
Z = X ** 2 + 5 * Y **2
def center():
    """Drop a red star at the origin — the minimum of the objective."""
    plt.scatter(0, 0, c='r', marker="*")
def background():
    """Draw the contour lines of Z over the X/Y grid plus the minimum marker."""
    plt.contour(X, Y, Z)
    # star marking the optimum is part of every background render
    center()


# Render the contour background once as a standalone demo.
background()
from IPython import display
def show_trail(get_trail_func, lr=0.018, iters=30):
    """Run an optimiser from a fixed start point and plot its path.

    get_trail_func(start, lr, iters) must return a list of 2-D positions.
    The path is drawn segment by segment on top of the contour background,
    so matplotlib cycles a fresh colour for each step.
    """
    start = np.array([150, 75], dtype='float64')
    points = get_trail_func(start, lr, iters)
    if not points:
        return
    points = np.array(points)
    background()
    # one plot call per consecutive pair -> per-segment colours
    for a, b in zip(points[:-1], points[1:]):
        plt.plot([a[0], b[0]], [a[1], b[1]])
def simple_trail(init, lr, iters):
    """Dummy "optimiser": walks a fixed (-50, -50) step each iteration.

    Bug fix: the original updated `init` in place (`init -= ...`), mutating
    the caller's array as a side effect; we now work on a private copy.

    Args:
        init: 2-D start position (array-like).
        lr: unused, accepted for interface compatibility with show_trail.
        iters: number of steps to take.
    Returns:
        List of iters+1 positions, starting with the initial point.
    """
    step = np.array([50.0, 50.0])
    pos = np.asarray(init, dtype='float64').copy()
    _trail = [pos.copy()]
    for _ in range(iters):
        pos = pos - step
        _trail.append(pos.copy())
    return _trail
# Demo: the dummy optimiser walks a straight diagonal line.
show_trail(simple_trail, iters=5)
GD
對於一次梯度更新,需要全部樣本參與計算,對算力有要求
def gd_trail(x, lr, iters):
    """Plain (full-batch) gradient descent on f.

    Bug fix: the original updated `x` in place (`x -= ...`), mutating the
    caller's array; we now descend on a private copy.

    Args:
        x: 2-D start position (array-like).
        lr: learning rate.
        iters: number of update steps.
    Returns:
        List of iters+1 positions, starting with the initial point.
    """
    x = np.asarray(x, dtype='float64').copy()
    _trail = [x.copy()]
    for _ in range(iters):
        x = x - lr * g(x)
        _trail.append(x.copy())
    return _trail
# Demo: plain gradient descent for 50 steps.
show_trail(gd_trail, iters=50)
BGD
每次只用一部分樣本(一個批次)計算梯度並更新參數
SGD
也就是每次只用單個樣本計算並更新,也就是每批次樣本數量爲 1 時候的批次梯度更新
SGDM
這種更新辦法,攜帶以往衝量,避免局部最小值
def sgdm_trail(x, lr, iters, r=0.7, m=0):
    """SGD with momentum: the update carries a decaying running velocity.

    Bug fix: the original hard-coded `iters = 30` inside the body, silently
    ignoring the caller's iteration count (the existing demo call relies on
    show_trail's default iters=30, so its behaviour is unchanged). Also
    descends on a copy instead of mutating the caller's array.

    Args:
        x: 2-D start position (array-like).
        lr: learning rate.
        iters: number of update steps.
        r: momentum decay factor.
        m: initial momentum (scalar 0 broadcasts on first use).
    Returns:
        List of iters+1 positions, starting with the initial point.
    """
    x = np.asarray(x, dtype='float64').copy()
    _trail = [x.copy()]
    for _ in range(iters):
        grad = g(x)
        m = r * m + lr * grad
        x = x - m
        _trail.append(x.copy())
    return _trail
# Demo: momentum SGD with show_trail's default lr and iteration count.
show_trail(sgdm_trail)
NAG
使用超前梯度進行更新,每次更新幅度大一些,收斂更快
def nag_trail(x, lr, iters, r=0.7, m=0):
    """Nesterov accelerated gradient: momentum with a look-ahead gradient.

    Bug fix: the original computed `lr * g(x) * (x - r*m)` — the gradient at
    x multiplied element-wise by the look-ahead *position*. NAG evaluates the
    gradient *at* the look-ahead point instead: m = r*m + lr * g(x - r*m).
    Also descends on a copy instead of mutating the caller's array.

    Args:
        x: 2-D start position (array-like).
        lr: learning rate.
        iters: number of update steps.
        r: momentum decay factor.
        m: initial momentum (scalar 0 broadcasts on first use).
    Returns:
        List of iters+1 positions, starting with the initial point.
    """
    x = np.asarray(x, dtype='float64').copy()
    _trail = [x.copy()]
    for _ in range(iters):
        # gradient evaluated at the look-ahead point x - r*m
        grad = g(x - r * m)
        m = r * m + lr * grad
        x = x - m
        _trail.append(x.copy())
    return _trail
# Demo: NAG with a much smaller learning rate.
show_trail(nag_trail, lr=0.0001)
Adagrad
自適應學習率
- 前期應該快速學習,快速進行收斂
- 後期應該慢慢移動,避免反覆橫跳
def adagrad_trail(x, lr, iters):
    """Adagrad: per-coordinate step sizes shrink with accumulated squared gradients.

    Bug fix: the original did `_trail.append(x)` without a copy while
    updating x in place, so every recorded point aliased the same array and
    the whole trail collapsed onto the final position (the sibling trail
    functions all append `x.copy()`). Also works on a copy of the input.

    Args:
        x: 2-D start position (array-like).
        lr: (base) learning rate.
        iters: number of update steps.
    Returns:
        List of iters+1 positions, starting with the initial point.
    """
    # NOTE(review): np.exp(-8) is ~3.4e-4, not the conventional 1e-8
    # stability epsilon — confirm intent before changing (siblings match).
    e = np.exp(-8)
    x = np.asarray(x, dtype='float64').copy()
    s = np.zeros_like(x)
    _trail = [x.copy()]
    for _ in range(iters):
        grad = g(x)
        s += grad * grad
        x -= lr * grad * np.power(np.sqrt(s + e), -1)
        _trail.append(x.copy())
    return _trail
# Demo: Adagrad; the large lr offsets the accumulated-squared-gradient decay.
show_trail(adagrad_trail, lr=100)
如果導數積累過大,學習率衰減厲害,可以設置巨大學習率。
RMSProp
學習率累加應該有權重,近期權重大,久遠的權重低
def rmsprop_trail(x, lr, iters, r=0.9):
    """RMSProp: exponentially weighted moving average of squared gradients.

    Bug fix: the original accumulated `s += r * grad * grad`, which neither
    decays past values nor weights the new term — so `s` grows without bound
    like a scaled Adagrad. The RMSProp recurrence is
    s = r*s + (1-r)*grad^2. Also works on a copy of the input.

    Args:
        x: 2-D start position (array-like).
        lr: learning rate.
        iters: number of update steps.
        r: decay factor for the moving average.
    Returns:
        List of iters+1 positions, starting with the initial point.
    """
    x = np.asarray(x, dtype='float64').copy()
    _trail = [x.copy()]
    s = np.zeros_like(x)
    # NOTE(review): ~3.4e-4, not the conventional 1e-8 epsilon — confirm.
    e = np.exp(-8)
    for _ in range(iters):
        grad = g(x)
        s = r * s + (1 - r) * grad * grad
        x = x - grad * lr * np.power(np.sqrt(s + e), -1)
        _trail.append(x.copy())
    return _trail
# Demo: RMSProp.
show_trail(rmsprop_trail, lr=20)
移動更加平緩。
Adam
自適應學習率和動量結合?
def adam_trail(x, lr, iters, b_1=0.7, b_2=0.8, m=0):
    """Adam: momentum plus RMSProp with bias-corrected moment estimates.

    Bug fix: the original wrote the bias-corrected estimates back into `m`
    and `s`, so each iteration re-corrected already-corrected moments. Adam
    keeps the raw running moments and applies the 1/(1 - b^t) correction
    only inside the update step. Also works on a copy of the input.

    Args:
        x: 2-D start position (array-like).
        lr: learning rate.
        iters: number of update steps.
        b_1: first-moment (momentum) decay.
        b_2: second-moment (squared-gradient) decay.
        m: initial first moment (scalar 0 broadcasts on first use).
    Returns:
        List of iters+1 positions, starting with the initial point.
    """
    # NOTE(review): ~3.4e-4, not the conventional 1e-8 epsilon — confirm.
    e = np.exp(-8)
    x = np.asarray(x, dtype='float64').copy()
    s = np.zeros_like(x)
    _trail = [x.copy()]
    for t in range(1, iters + 1):
        grad = g(x)
        m = b_1 * m + (1 - b_1) * grad
        s = b_2 * s + (1 - b_2) * grad * grad
        m_hat = m / (1 - np.power(b_1, t))  # first-moment bias correction
        s_hat = s / (1 - np.power(b_2, t))  # second-moment bias correction
        x = x - lr * m_hat * np.power(np.sqrt(s_hat + e), -1)
        _trail.append(x.copy())
    return _trail
# Demo: Adam.
show_trail(adam_trail, lr=10, iters=30)
# Sanity check: np.divide broadcasts a scalar divisor across the array.
np.divide(np.array([3,6,9]), 3)
array([1., 2., 3.])
Warmup
煉丹術:前期使用比較小的學習率學習幾輪,然後轉向正常梯度下降方法,效果更佳
使用階段
- 初期:
- 中期:
- 後期:
廣泛使用,特殊場景使用特殊梯度下降