Load Dataset
from sklearn.datasets import load_boston
data = load_boston()
X, y = data['data'], data['target']
X[1]
y[1]
X.shape
len(y)
%matplotlib inline
import matplotlib.pyplot as plt
plt.scatter(X[:, 5], y)
Goal: find the "best" straight line that fits the relationship between the number of rooms and the house price.
import random
k, b = random.randint(-100, 100), random.randint(-100, 100)
def func(x):
    return k*x + b
X_rm = X[:, 5]
y_hat = [func(x) for x in X_rm]
plt.scatter(X[:, 5], y)
plt.plot(X_rm, y_hat)
We drew a random line and found it is far off. 🙁
def draw_room_and_price():
    plt.scatter(X[:, 5], y)
def price(x, k, b):
    return k*x + b
k, b = random.randint(-100, 100), random.randint(-100, 100)
price_by_random_k_and_b = [price(r, k, b) for r in X_rm]
print('the random k : {}, b: {}'.format(k, b))
draw_room_and_price()
plt.scatter(X_rm, price_by_random_k_and_b)
The goal is to find the "best" k and b.
We need a criterion to measure how good a candidate is:
comparing y_true with y_hat gives such a criterion -> the loss function.
y_true = [1, 4, 1, 4, 1, 4, 1, 4]
y_hat = [2, 3, 1, 4, 1, 41, 31, 3]
L1-Loss
y_true = [3, 4, 4]
y_hat_1 = [1, 1, 4]
y_hat_2 = [3, 4, 0]
What is the L1-Loss of y_hat_1? |3-1| + |4-1| + |4-4| = 2 + 3 + 0 = 5
L1-Loss of y_hat_2: |3-3| + |4-4| + |4-0| = 0 + 0 + 4 = 4
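As a quick check of the arithmetic above, here is a minimal L1-loss helper (the name l1_loss is our own addition, not part of the original code):
def l1_loss(y, y_hat):
    # sum of absolute differences
    return sum(abs(y_i - y_hat_i) for y_i, y_hat_i in zip(y, y_hat))
print(l1_loss(y_true, y_hat_1))  # 5
print(l1_loss(y_true, y_hat_2))  # 4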
def loss(y, y_hat):  # mean squared error (L2 loss)
    sum_ = sum([(y_i - y_hat_i) ** 2 for y_i, y_hat_i in zip(y, y_hat)])
    return sum_ / len(y)
y_true = [3, 4, 4]
y_hat_1 = [1, 1, 4]
y_hat_2 = [3, 4, 0]
print(loss(y_true, y_hat_1))
print(loss(y_true, y_hat_2))
def price(x, k, b):
    return k*x + b
k, b = random.randint(-100, 100), random.randint(-100, 100)
price_by_random_k_and_b = [price(r, k, b) for r in X_rm]
print('the random k : {}, b: {}'.format(k, b))
draw_room_and_price()
plt.scatter(X_rm, price_by_random_k_and_b)
cost = loss(list(y), price_by_random_k_and_b)
print('The Loss of k: {}, b: {} is {}'.format(k, b, cost))
Once you know how to evaluate whether something is good or bad, you have basically done half of the work.
The simplest method: randomly generate many pairs of k and b, then keep the best pair.
def price(x, k, b):
    return k*x + b
trying_times = 5000
best_k, best_b = None, None
min_cost = float('inf')
losses = []
for i in range(trying_times):
    k = random.random() * 200 - 100  # uniform in [-100, 100)
    b = random.random() * 200 - 100
    price_by_random_k_and_b = [price(r, k, b) for r in X_rm]
    # draw_room_and_price()
    # plt.scatter(X_rm, price_by_random_k_and_b)
    cost = loss(list(y), price_by_random_k_and_b)
    if cost < min_cost:
        min_cost = cost
        best_k, best_b = k, b
        print('k and b updated at iteration {}'.format(i))
        losses.append(min_cost)
We could add a visualization:
min_cost
best_k, best_b
def plot_by_k_and_b(k, b):
    price_by_random_k_and_b = [price(r, k, b) for r in X_rm]
    draw_room_and_price()
    plt.scatter(X_rm, price_by_random_k_and_b)
plot_by_k_and_b(best_k, best_b)
2nd method: adjust the direction of the search.
k can change in two ways: increase or decrease.
b can also change in two ways: increase or decrease.
So for the pair (k, b) there are 4 possible combinations of changes:
when k and b move in some direction and the loss decreases, they keep moving in that direction; otherwise we switch to a different direction.
directions = [
(+1, -1),
(+1, +1),
(-1, -1),
(-1, +1)
]
def price(x, k, b):
    return k*x + b
trying_times = 10000
best_k = random.random() * 200 - 100  # start from a random point in [-100, 100)
best_b = random.random() * 200 - 100
next_direction = random.choice(directions)
min_cost = float('inf')
losses = []
scala = 0.3
for i in range(trying_times):
    current_direction = next_direction
    k_direction, b_direction = current_direction
    current_k = best_k + k_direction * scala
    current_b = best_b + b_direction * scala
    price_by_random_k_and_b = [price(r, current_k, current_b) for r in X_rm]
    cost = loss(list(y), price_by_random_k_and_b)
    if cost < min_cost:
        min_cost = cost
        best_k, best_b = current_k, current_b
        print('k and b updated at iteration {}'.format(i))
        losses.append((i, min_cost))
        next_direction = current_direction
    else:
        next_direction = random.choice(list(set(directions) - {current_direction}))
len(losses)
min_cost
3rd method: gradient descent.
Can we move, at every single step, in a direction that is guaranteed to decrease the loss?
Yes: the gradient always gives us such a direction.
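For the MSE loss defined earlier and the linear model y_hat = k*x + b, the gradient has a closed form; the two helpers below (partial_k and partial_b) implement exactly these formulas:
$$
L(k, b) = \frac{1}{n}\sum_{i=1}^{n}\bigl(y_i - (k x_i + b)\bigr)^2,\qquad
\frac{\partial L}{\partial k} = -\frac{2}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)\,x_i,\qquad
\frac{\partial L}{\partial b} = -\frac{2}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)
$$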
def partial_k(x, y, y_hat):
    gradient = 0
    for x_i, y_i, y_hat_i in zip(list(x), list(y), list(y_hat)):
        gradient += (y_i - y_hat_i) * x_i
    return -2 / len(y) * gradient
def partial_b(y, y_hat):
    gradient = 0
    for y_i, y_hat_i in zip(list(y), list(y_hat)):
        gradient += (y_i - y_hat_i)
    return -2 / len(y) * gradient
def price(x, k, b):
    # Operation: CNN, RNN, LSTM, Attention are more complex mappings than k*x + b
    return k*x + b
trying_times = 50000
min_cost = float('inf')
losses = []
scala = 0.3
k, b = random.random() * 200 - 100, random.random() * 200 - 100  # start from random values in [-100, 100)
# Parameter initialization matters! This is the weight initialization problem.
best_k, best_b = None, None
learning_rate = 1e-3 # Optimizer Rate
for i in range(trying_times):
    price_by_random_k_and_b = [price(r, k, b) for r in X_rm]
    cost = loss(list(y), price_by_random_k_and_b)
    if cost < min_cost:
        # print('k and b updated at iteration {}'.format(i))
        min_cost = cost
        best_k, best_b = k, b
        losses.append((i, min_cost))
    k_gradient = partial_k(X_rm, y, price_by_random_k_and_b)  # direction of change
    b_gradient = partial_b(y, price_by_random_k_and_b)
    k = k + (-1 * k_gradient) * learning_rate
    # gradient clip: in practice the update step can be bounded by a threshold
    ## Optimizer: e.g. Adam, momentum
    b = b + (-1 * b_gradient) * learning_rate
Batch normalization, weight initialization, activation functions ...
in a framework these are packaged into reusable building blocks, so that others do not have to rewrite them from scratch.
len(losses)
print(min_cost)
best_k, best_b
def square(x):
    return 10 * x**2 + 5 * x + 5
import numpy as np
_X = np.linspace(-100, 100)
_y = [square(x) for x in _X]
plt.plot(_X, _y)
plot_by_k_and_b(k=best_k, b=best_b)
With the gradient descent method, min_cost eventually drops to around 50.
What should we do if we want min_cost to decrease further?
draw_room_and_price()
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
test_x = np.linspace(-100, 100, 1000)
plt.plot(sigmoid(test_x))
A very important idea in deep learning: keep stacking linear and nonlinear functions on top of each other; repeating this produces very complicated functions.
def random_linear(x):
    k, b = np.random.normal(), np.random.normal()
    return k * x + b
for _ in range(5):
    plt.plot(sigmoid(random_linear(test_x)))
for _ in range(15):
    plt.plot(sigmoid(random_linear(sigmoid(random_linear(test_x)))))
This is a core point of deep learning.
def relu(x):
    return x * (x > 0)
for _ in range(15):
    plt.plot(sigmoid(random_linear(relu(random_linear(test_x)))))
The nonlinear transformation in between is called the activation function.
Hinton UCSD
def so_many_layers(x, layers):
    if len(layers) == 1: return layers[0](x)
    return so_many_layers(layers[0](x), layers[1:])
y1 = k1 * x + b1
y2 = k2 * y1 + b2
y3 = k3 * y2 + b3
y4 = k4 * y3 + b4
y5 = k5 * y4 + b5
y6 = k6 * y5 + b6
x => y6 looks like a very complicated function!
But stacking linear functions cannot fit a nonlinear function,
which is why a neural network needs activation functions.
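A one-line expansion makes this concrete: composing two linear functions just gives another linear function, so stacking alone adds no expressive power:
$$
y_2 = k_2 (k_1 x + b_1) + b_2 = (k_2 k_1)\,x + (k_2 b_1 + b_2)
$$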
layers = [random_linear, relu, random_linear, sigmoid, random_linear, relu, sigmoid] * 10
for _ in range(10):
    plt.plot(so_many_layers(test_x, layers))
def price(x, k, b):
    # Operation: CNN, RNN, LSTM, Attention are more complex mappings than k*x + b
    return k*x + b
We have to write these every time; can we put them into a package and simply import them whenever we need them?
def linear(x, k, b):
    return k * x + b
def sigmoid(x):  # activation cell
    return 1 / (1 + np.exp(-x))
def y(x, k1, k2, b1, b2):  # a more complex, nonlinear function for price
    output1 = linear(x, k1, b1)
    output2 = sigmoid(output1)
    output3 = linear(output2, k2, b2)
    return output3
trying_times = 50000
min_cost = float('inf')
losses = []
scala = 0.3
# Parameter initialization matters! This is the weight initialization problem.
k1, k2 = np.random.normal(), np.random.normal()
b1, b2 = np.random.normal(), np.random.normal()
best_k, best_b = None, None
learning_rate = 1e-3 # Optimizer Rate
for i in range(trying_times):
    price_by_random_k_and_b = [y(r, k1, k2, b1, b2) for r in X_rm]
    # note: the model function above is named y, so use data['target'] for the true labels
    cost = loss(list(data['target']), price_by_random_k_and_b)
    # the old partial_k / partial_b only cover the simple k*x + b model:
    # k_gradient = partial_k(X_rm, data['target'], price_by_random_k_and_b)  # direction of change
    # b_gradient = partial_b(data['target'], price_by_random_k_and_b)
    ## Deriving the new partial derivatives by hand is very tedious:
    # k1_gradient = partial_k1
    # k2_gradient = partial_k2
    # b1_gradient = partial_b1
    # b2_gradient = partial_b2
    # gradient descent step
    # k1 += -1 * k1_gradient * learning_rate
    # k2 += -1 * k2_gradient * learning_rate
    # b1 += -1 * b1_gradient * learning_rate
    # b2 += -1 * b2_gradient * learning_rate
## This is exactly what a neural network (deep learning) framework automates.
Review:
def linear(x, k, b):
    return k * x + b
def sigmoid(x):  # activation cell
    return 1 / (1 + np.exp(-x))
def y(x, k1, k2, b1, b2):  # a more complex, nonlinear function for price
    output1 = linear(x, k1, b1)
    output2 = sigmoid(output1)
    output3 = linear(output2, k2, b2)
    return output3
We know the functional form of the loss.
So when we define the loss, we also know how to compute its derivative.
On top of this Node abstraction,
we implement many building blocks: Linear, Sigmoid, L2Loss, ReLU, ...
and package them into a library.
- build the connection structure of the network
- feed in the values x, k1, b1, k2, b2
- choose a suitable loss
Then we optimize repeatedly to work out what k1, b1, k2, b2 should be.
def y(x, k1, k2, b1, b2):  # each intermediate output maps to a node in the graph below
    output1 = linear(x, k1, b1)        # 'linear'
    output2 = sigmoid(output1)         # 'sigmoid'
    output3 = linear(output2, k2, b2)  # 'linear_2'
value_graph = {
'x': ['linear'],
'k1': ['linear'],
'b1': ['linear'],
'linear': ['sigmoid'],
'sigmoid': ['linear_2'],
'k2': ['linear_2'],
'b2': ['linear_2'],
'linear_2': ['loss']
}
import networkx as nx
graph = nx.DiGraph(value_graph)
layout = nx.layout.spring_layout(graph)
nx.draw(graph, layout, with_labels=True)
def visited_procedure(graph, position, visited_order, step, sub_plot_index=None, colors=('red', 'green')):
    changed = visited_order[:step] if step is not None else visited_order
    before, after = colors
    color_map = [after if c in changed else before for c in graph]
    nx.draw(graph, position, node_color=color_map, with_labels=True, ax=sub_plot_index)
visitor_order = [
'x',
'k1',
'b1',
'k2',
'b2',
'linear',
'sigmoid',
'linear_2',
'loss'
]
What the feedforward (forward) pass does:
based on x and the current set of parameters (k1, k2, b1, b2, ...),
it computes the predicted values,
and also computes the loss at this moment.
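To make "feedforward" concrete, here is a single-sample forward pass using the plain linear and sigmoid functions from above; the parameter values 2, -10, 30, 5 are made up purely for illustration:
x_sample = X_rm[0]
hidden = sigmoid(linear(x_sample, 2, -10))   # first linear layer + activation
y_hat_sample = linear(hidden, 30, 5)         # second linear layer gives the predicted price
print(x_sample, y_hat_sample)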
visited_procedure(graph, layout, visitor_order, step=9)
dimension = int(len(visitor_order)**0.5)
fig, ax = plt.subplots(dimension, dimension+1, figsize=(15,15))
for i in range(len(visitor_order) + 1):
    ix = np.unravel_index(i, ax.shape)
    plt.sca(ax[ix])
    ax[ix].title.set_text('Feed Forward Step: {}'.format(i))
    visited_procedure(graph, layout, visitor_order, step=i, sub_plot_index=ax[ix])
Will the forward and backward passes keep running until the model parameters stop updating?
– Yes.
dimension = int(len(visitor_order)**0.5)
fig, ax = plt.subplots(dimension, dimension+1, figsize=(15,15))
for i in range(len(visitor_order) + 1):
    ix = np.unravel_index(i, ax.shape)
    plt.sca(ax[ix])
    ax[ix].title.set_text('Backward (backpropagation) Step: {}'.format(i))
    visited_procedure(graph, layout, visitor_order[::-1], step=i, sub_plot_index=ax[ix], colors=('green', 'blue'))
#################
def loss(y, y_hat):
    sum_ = sum([(y_i - y_hat_i) ** 2 for y_i, y_hat_i in zip(y, y_hat)])
    return sum_ / len(y)
def loss_partial_y(y, y_hat):
    return 2 / len(y) * (np.array(y) - np.array(y_hat))
def loss_partial_y_hat(y, y_hat):
    return -2 / len(y) * (np.array(y) - np.array(y_hat))
#############
def linear(x, k, b):
    return k * x + b
def linear_partial_x(x, k, b):
    return k
def linear_partial_k(x, k, b):
    return x
def linear_partial_b(x, k, b):
    return 1
########################
def sigmoid(x):  # activation cell
    return 1 / (1 + np.exp(-x))
def sigmoid_partial(x):
    return sigmoid(x) * (1 - sigmoid(x))
In chain-rule differentiation, if the derivative at any link is 0, then the whole product is 0, right?
Yes! This is gradient vanishing.
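A quick numeric illustration (our own example, using the sigmoid_partial defined above): the derivative of the sigmoid is at most 0.25, so chaining many sigmoid layers multiplies many small factors together and the gradient shrinks toward zero:
grad = 1.0
for layer in range(20):
    grad *= sigmoid_partial(0)  # 0.25 is the largest possible value of sigmoid'(x)
print(grad)  # about 9.1e-13: the gradient has effectively vanished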
class Node:
    def __init__(self, inputs=[]):
        self.inputs = inputs
        self.outputs = []
        for n in self.inputs:
            n.outputs.append(self)
        self.value = None
        self.gradients = {}
    def forward(self):
        pass
    def backward(self):
        pass
class Placeholder(Node):
    def __init__(self):
        Node.__init__(self)
    def forward(self, value=None):
        if value is not None: self.value = value
    def backward(self):
        self.gradients = {}
        for n in self.outputs:
            self.gradients[self] = n.gradients[self] * 1
class Linear(Node):
    def __init__(self, x, weight, bias):
        Node.__init__(self, [x, weight, bias])
    def forward(self):
        k, x, b = self.inputs[1], self.inputs[0], self.inputs[2]
        self.value = k.value * x.value + b.value
    def backward(self):
        k, x, b = self.inputs[1], self.inputs[0], self.inputs[2]
        for n in self.outputs:
            grad_cost = n.gradients[self]
            self.gradients[k] = grad_cost * x.value
            self.gradients[x] = grad_cost * k.value
            self.gradients[b] = grad_cost * 1
class Sigmoid(Node):
    def __init__(self, x):
        Node.__init__(self, [x])
        self.x = self.inputs[0]
    def _sigmoid(self, x):
        return 1. / (1 + np.exp(-1 * x))
    def forward(self):
        self.value = self._sigmoid(self.x.value)
    def partial(self):
        return self._sigmoid(self.x.value) * (1 - self._sigmoid(self.x.value))
    def backward(self):
        for n in self.outputs:
            grad_cost = n.gradients[self]
            self.gradients[self.x] = grad_cost * self.partial()
class L2_LOSS(Node):
    def __init__(self, y, y_hat):
        Node.__init__(self, [y, y_hat])
        self.y = y
        self.y_hat = y_hat
    def forward(self):
        # convert here, after the upstream nodes have been evaluated
        self.y_v, self.yhat_v = np.array(self.y.value), np.array(self.y_hat.value)
        self.value = np.mean((self.y_v - self.yhat_v) ** 2)
    def backward(self):
        # loss = 1/n * sum((y - y_hat) ** 2)
        self.gradients[self.y] = 2 / len(self.y_v) * (self.y_v - self.yhat_v)
        self.gradients[self.y_hat] = -2 / len(self.y_v) * (self.y_v - self.yhat_v)
y = [1, 2, 1, 4, 1]
yhat = [3, 2, 1, 4, 5]
y = np.array(y)
yhat = np.array(yhat)
np.mean((y - yhat) ** 2)
X_, y_ = data['data'], data['target']
X_rm = X_[:, 5]
def forward_and_backward(graph_order):
    # one full pass: the whole set of parameters gets its gradients updated once
    for node in graph_order:
        node.forward()
    for node in graph_order[::-1]:
        node.backward()
What remains to be done?
Topological sort.
visitor_order
def toplogic(graph):
    sorted_node = []
    while len(graph) > 0:
        all_inputs = []
        all_outputs = []
        for n in graph:
            all_inputs += graph[n]
            all_outputs.append(n)
        all_inputs = set(all_inputs)
        all_outputs = set(all_outputs)
        need_remove = all_outputs - all_inputs  # nodes that no edge points to, i.e. with no predecessors left
        if len(need_remove) > 0:
            node = random.choice(list(need_remove))
            visited_next = [node]
            if len(graph) == 1:  # last key: also emit its successors (e.g. the final 'loss' node)
                visited_next += graph[node]
            graph.pop(node)
            sorted_node += visited_next
            for _, links in graph.items():
                if node in links: links.remove(node)
        else:  # the graph has a cycle
            break
    return sorted_node
value_graph = {
'x': ['linear'],
'k1': ['linear'],
'b1': ['linear'],
'linear': ['sigmoid'],
'sigmoid': ['linear_2'],
'k2': ['linear_2'],
'b2': ['linear_2'],
'linear_2': ['loss']
}
top_order = toplogic(value_graph)
dimension = int(len(visitor_order)**0.5)
fig, ax = plt.subplots(dimension, dimension+1, figsize=(15,15))
for i in range(len(top_order) + 1):
    ix = np.unravel_index(i, ax.shape)
    plt.sca(ax[ix])
    ax[ix].title.set_text('Feed Forward Step: {}'.format(i))
    visited_procedure(graph, layout, top_order, step=i, sub_plot_index=ax[ix])
def node_computing_sort(feed_dict, graph_nodes):
    pass  # left unimplemented here; see the next article
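As a rough idea of what this function will have to do (only a sketch under our own assumptions; the real implementation is deferred to the next article), it should assign the values from feed_dict to the Placeholder nodes and return all Node objects in a topologically sorted order:
def node_computing_sort_sketch(feed_dict, graph_nodes):
    # assign input values to the Placeholder nodes
    for node, value in feed_dict.items():
        node.value = value
    # every node we need to evaluate
    nodes = set(feed_dict.keys()) | set(graph_nodes)
    # Kahn-style topological sort: a node is ready once all of its inputs are emitted
    emitted, order = set(), []
    while len(order) < len(nodes):
        for node in nodes:
            if node not in emitted and all(i in emitted for i in node.inputs):
                order.append(node)
                emitted.add(node)
    return order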
#from xxxx import Linear, Sigmoid, L2_LOSS, Placeholder
data = load_boston()
X_, y_ = data['data'], data['target']
X_rm = X_[:, 5]
w1_, b1_ = np.random.normal(), np.random.normal()
w2_, b2_ = np.random.normal(), np.random.normal()
X, y = Placeholder(), Placeholder()
w1, b1 = Placeholder(), Placeholder()
w2, b2 = Placeholder(), Placeholder()
build model
output1 = Linear(X, w1, b1)
output2 = Sigmoid(output1)
y_hat = Linear(output2, w2, b2)
cost = L2_LOSS(y, y_hat)
graph_nodes = [output1, output2, y_hat, cost]
feed_dict = {
X: X_rm,
y: y_,
w1: w1_,
w2: w2_,
b1: b1_,
b2: b2_
}
graph_sort = node_computing_sort(feed_dict, graph_nodes)
epoch = 100  # number of passes over the data; the value here is only for illustration
for e in range(epoch):
    forward_and_backward(graph_sort)
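Note that forward_and_backward only computes values and gradients; to actually train, each pass also needs a parameter-update step. A minimal sketch of such a step, assuming the trainable Placeholders (w1, b1, w2, b2) carry their own gradients after the backward pass, and using np.mean (our own choice) to reduce the per-sample gradients to a single scalar step:
def sgd_update(trainables, learning_rate=1e-3):
    # move each trainable parameter a small step against its gradient
    for t in trainables:
        t.value = t.value - learning_rate * np.mean(t.gradients[t])

for e in range(epoch):
    forward_and_backward(graph_sort)
    sgd_update([w1, b1, w2, b2])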
Next article:
- implement node_computing_sort
- add a CNN structure to do image classification
- publish this framework as a package on pip, so that your friends and classmates can ...