While reading other people's code, I noticed that good models usually initialize their weights explicitly. So I first searched for the ways weight initialization is done in PyTorch, and then ran some experiments of my own to see the effect.
Let's start with a simple BiLSTM and try fitting some randomly generated data.
import torch.nn as nn
import torch
import torch.nn.functional as F
import torch.optim as optim

# bidirectional LSTM
class BILSTM(nn.Module):
    def __init__(self):
        super(BILSTM, self).__init__()
        self.input_size = 100
        self.batch_first = True
        self.hidden = 50
        self.num_layers = 1
        # the LSTM itself
        self.lstm = nn.LSTM(self.input_size, self.hidden, self.num_layers,
                            batch_first=self.batch_first, bidirectional=True)
        # a small MLP as the classifier head
        self.predict_layer_1 = nn.Linear(1000, 100)
        self.predict_layer_2 = nn.Linear(100, 2)
        # self.init_weight()

    # weight initialization
    def init_weight(self):
        for name, param in self.lstm.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.xavier_uniform_(param, gain=1)

    # forward pass
    def forward(self, x):
        outputs, _ = self.lstm(x)
        # flatten the 10 timesteps of 2*50 bidirectional features into 1000
        outputs = outputs.reshape(1, -1)
        out_1 = self.predict_layer_1(outputs)
        out = self.predict_layer_2(F.relu(out_1))
        # note: CrossEntropyLoss applies log-softmax itself, so the extra
        # softmax here puts a floor under the achievable loss (see below)
        return F.softmax(out, dim=1)
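A quick shape check before training (a small sketch): with batch_first=True the bidirectional LSTM emits 2 × 50 = 100 features per timestep, so a 10-step sequence flattens to exactly the 1000 inputs that predict_layer_1 expects.

check = BILSTM()
x = torch.randn(1, 10, 100)   # (batch=1, seq_len=10, input_size=100)
print(check(x).shape)         # torch.Size([1, 2])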
# generate random data
tensor_x = torch.randn(10000).reshape(10, 10, 100)
tensor_y = torch.randint(low=0, high=2, size=(10, 1))
print(tensor_y)
# instantiate the model
bilstm = BILSTM()
lr = 0.001
epoch = 11
# optimizer
optimer = optim.Adam(bilstm.parameters(), lr=lr)
# loss function
loss_func = nn.CrossEntropyLoss()
# pair each sequence with its label
tensor = []
for i in range(len(tensor_y)):
    tensor.append((tensor_x[i], tensor_y[i]))
# start training
loss_plot = []
for i in range(epoch):
    total_loss = 0.0
    for (x, y) in tensor:
        x = x.unsqueeze(0)          # add the batch dimension: (1, 10, 100)
        outputs = bilstm(x)
        loss = loss_func(outputs, y)
        total_loss += loss
        optimer.zero_grad()
        loss.backward()
        optimer.step()
    loss_plot.append(total_loss)
    print(i, total_loss)
The output (epoch index followed by the summed loss) is:
0 tensor(7.0073, grad_fn=<AddBackward0>)
1 tensor(5.3326, grad_fn=<AddBackward0>)
2 tensor(4.1300, grad_fn=<AddBackward0>)
3 tensor(3.4265, grad_fn=<AddBackward0>)
4 tensor(3.2202, grad_fn=<AddBackward0>)
5 tensor(3.1664, grad_fn=<AddBackward0>)
6 tensor(3.1497, grad_fn=<AddBackward0>)
7 tensor(3.1435, grad_fn=<AddBackward0>)
8 tensor(3.1406, grad_fn=<AddBackward0>)
9 tensor(3.1390, grad_fn=<AddBackward0>)
10 tensor(3.1380, grad_fn=<AddBackward0>)
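One aside on the loop above: total_loss += loss accumulates autograd-tracked tensors, which is why every printed sum carries grad_fn=<AddBackward0>, and the running total drags a chain of AddBackward nodes along with it. If you only want the number, the usual idiom is to accumulate the Python float instead; a minimal sketch of the inner loop:

total_loss = 0.0
for (x, y) in tensor:
    outputs = bilstm(x.unsqueeze(0))
    loss = loss_func(outputs, y)
    optimer.zero_grad()
    loss.backward()
    optimer.step()
    total_loss += loss.item()   # .item() detaches the scalar to a plain float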
If we call init_weight() to initialize the LSTM's weights,
bilstm.init_weight()
we get:
0 tensor(6.9224, grad_fn=<AddBackward0>)
1 tensor(5.2937, grad_fn=<AddBackward0>)
2 tensor(4.1650, grad_fn=<AddBackward0>)
3 tensor(3.4500, grad_fn=<AddBackward0>)
4 tensor(3.2249, grad_fn=<AddBackward0>)
5 tensor(3.1677, grad_fn=<AddBackward0>)
6 tensor(3.1505, grad_fn=<AddBackward0>)
7 tensor(3.1440, grad_fn=<AddBackward0>)
8 tensor(3.1410, grad_fn=<AddBackward0>)
9 tensor(3.1393, grad_fn=<AddBackward0>)
10 tensor(3.1383, grad_fn=<AddBackward0>)
There seems to be hardly any difference!
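Incidentally, both runs plateau just above 3.13, and that floor comes from the model rather than the initialization: forward() applies softmax before CrossEntropyLoss, which applies log-softmax again internally, so even a perfect prediction costs log(1 + e^(-1)) ≈ 0.313 per sample, or about 3.13 over our 10 samples. A one-line check:

import math
print(10 * math.log(1 + math.exp(-1)))   # 3.1326..., the floor seen above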
If we shrink the initialization by passing gain=0.25 to xavier_uniform_, we get:
0 tensor(6.8440, grad_fn=<AddBackward0>)
1 tensor(5.9343, grad_fn=<AddBackward0>)
2 tensor(4.5523, grad_fn=<AddBackward0>)
3 tensor(3.5413, grad_fn=<AddBackward0>)
4 tensor(3.2001, grad_fn=<AddBackward0>)
5 tensor(3.1418, grad_fn=<AddBackward0>)
6 tensor(3.1353, grad_fn=<AddBackward0>)
7 tensor(3.1340, grad_fn=<AddBackward0>)
8 tensor(3.1337, grad_fn=<AddBackward0>)
9 tensor(3.1335, grad_fn=<AddBackward0>)
10 tensor(3.1334, grad_fn=<AddBackward0>)
This time the loss really does improve. So initialization is not just a matter of picking some arbitrary value yourself; it is worth studying how others set it up.
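For reference, gain=0.25 does not literally clamp the weights to (-0.25, 0.25): xavier_uniform_ samples from U(-a, a) with a = gain * sqrt(6 / (fan_in + fan_out)), so for the LSTM's input-to-hidden weight the range is much tighter:

import math
# weight_ih_l0 has shape (4*hidden, input) = (200, 100): fan_in=100, fan_out=200
a = 0.25 * math.sqrt(6.0 / (100 + 200))
print(a)   # ~0.035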
Next, modify init_weight so that the MLP layers are initialized as well:
def init_weight(self):
    for name, param in self.lstm.named_parameters():
        if 'bias' in name:
            nn.init.constant_(param, 0.0)
        elif 'weight' in name:
            nn.init.xavier_uniform_(param, gain=0.25)
    # initialize the classifier head too (pass the Parameter itself, not .data)
    nn.init.xavier_uniform_(self.predict_layer_1.weight, gain=0.25)
    nn.init.xavier_uniform_(self.predict_layer_2.weight, gain=0.25)
The output is:
0 tensor(6.8529, grad_fn=<AddBackward0>)
1 tensor(6.3459, grad_fn=<AddBackward0>)
2 tensor(5.1105, grad_fn=<AddBackward0>)
3 tensor(3.7576, grad_fn=<AddBackward0>)
4 tensor(3.3176, grad_fn=<AddBackward0>)
5 tensor(3.1597, grad_fn=<AddBackward0>)
6 tensor(3.1368, grad_fn=<AddBackward0>)
7 tensor(3.1340, grad_fn=<AddBackward0>)
8 tensor(3.1334, grad_fn=<AddBackward0>)
9 tensor(3.1332, grad_fn=<AddBackward0>)
10 tensor(3.1331, grad_fn=<AddBackward0>)
Judging from the results, the improvement is not large, but it is still an improvement.
So when you design a model, don't forget to initialize every weight matrix in it.
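If writing per-layer init code by hand feels error-prone, one common pattern (a sketch, not the code used above) is to walk every submodule with Module.apply and handle each layer type in one place:

import torch.nn as nn

def init_weights(m):
    # called once for every submodule by Module.apply
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight, gain=0.25)
        nn.init.constant_(m.bias, 0.0)
    elif isinstance(m, nn.LSTM):
        for name, param in m.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            else:
                nn.init.xavier_uniform_(param, gain=0.25)

model = BILSTM()
model.apply(init_weights)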
Now let's look at what PyTorch's various initialization functions actually produce.
First, the default (automatic) initialization.
bilstm = BILSTM()
# print the LSTM's weight parameters (run once with print(parm) for the
# values, and once with print(parm.size()) for the shapes)
for name, parm in bilstm.lstm.named_parameters():
    if 'weight' in name:
        print(parm)
w = torch.Tensor(3, 4)   # the "random initialization" from another blog post
print(w)
Below are the LSTM's parameters, followed by the value of the "randomly initialized" tensor that another blog post used (torch.Tensor(3, 4)):
Parameter containing:
tensor([[ 0.0672, -0.1371, -0.1015, ..., -0.0548, -0.0286, -0.1193],
[-0.0230, -0.0572, -0.0824, ..., 0.1388, -0.1184, -0.0941],
[-0.0675, 0.0214, 0.1078, ..., -0.0245, -0.0825, 0.1242],
...,
[-0.0446, -0.0175, -0.1203, ..., 0.0902, -0.0098, 0.1057],
[ 0.1046, 0.0263, 0.0200, ..., 0.0648, -0.0403, -0.0828],
[-0.0945, 0.0827, 0.0854, ..., -0.0612, 0.1071, -0.0015]],
requires_grad=True)
Parameter containing:
tensor([[-0.1040, 0.1334, -0.0364, ..., 0.0834, 0.1374, 0.0031],
[ 0.0660, -0.0556, -0.0761, ..., -0.0565, -0.0226, -0.0068],
[-0.0622, -0.0794, -0.1130, ..., 0.0435, -0.1034, 0.0127],
...,
[ 0.0165, -0.0373, -0.1080, ..., -0.0270, 0.0420, -0.0621],
[-0.1387, 0.0976, -0.0911, ..., 0.0946, -0.0685, 0.0816],
[ 0.0780, -0.0704, -0.0675, ..., 0.0620, -0.0110, 0.0602]],
requires_grad=True)
Parameter containing:
tensor([[ 2.2537e-03, 9.6619e-02, 7.7635e-05, ..., -9.8272e-02,
9.2248e-02, -3.9105e-02],
[ 5.5187e-03, -8.7172e-02, 1.2426e-01, ..., 9.6126e-02,
1.8508e-02, 4.2137e-02],
[-3.2823e-02, -3.1873e-02, 5.6963e-02, ..., 5.3042e-02,
1.1010e-01, -1.2473e-01],
...,
[-9.8956e-02, -4.5421e-02, 3.1055e-02, ..., -9.8684e-02,
-9.3469e-03, -1.2118e-01],
[-7.6992e-02, -8.7227e-02, -3.6597e-02, ..., 2.6205e-02,
-1.1190e-02, -6.1281e-02],
[-6.5006e-02, 8.0247e-02, 8.5139e-02, ..., -1.0417e-02,
8.8553e-02, -3.6364e-02]], requires_grad=True)
Parameter containing:
tensor([[-0.0591, 0.0769, -0.0983, ..., 0.0991, 0.0039, 0.0317],
[ 0.0646, -0.0065, -0.0473, ..., 0.0223, -0.0831, 0.1218],
[-0.0016, -0.0657, -0.1060, ..., -0.0328, -0.0774, -0.0739],
...,
[-0.1156, 0.0403, 0.1401, ..., 0.1112, 0.0567, 0.1378],
[ 0.1080, -0.0291, 0.1341, ..., -0.0532, -0.0484, 0.0910],
[-0.1250, 0.0434, -0.1003, ..., 0.0774, 0.0956, 0.0753]],
requires_grad=True)
tensor([[2.3694e-38, 2.3694e-38, 2.3694e-38, 2.3694e-38],
[2.3694e-38, 2.3694e-38, 2.3694e-38, 2.3694e-38],
[2.3694e-38, 0.0000e+00, 0.0000e+00, 0.0000e+00]])
And their size() (a second run that printed parm.size() instead; note that this time w comes out all zeros):
torch.Size([200, 100])
torch.Size([200, 50])
torch.Size([200, 100])
torch.Size([200, 50])
tensor([[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.]])
From the above, the values produced by the model's automatic initialization are really not outlandish, or at least not as extreme as the post 《Pytorch權重初始化》 makes them sound.
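This matches PyTorch's documented default for nn.LSTM: all weights and biases are drawn from U(-k, k) with k = 1/sqrt(hidden_size), about ±0.141 for hidden_size=50, which is exactly the range of the values printed above. A quick check:

import math
import torch.nn as nn

lstm = nn.LSTM(100, 50, 1, batch_first=True, bidirectional=True)
k = 1.0 / math.sqrt(50)   # ~0.141
for name, param in lstm.named_parameters():
    assert -k <= param.min().item() and param.max().item() <= k
print(f"all parameters lie within (-{k:.3f}, {k:.3f})")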
The torch.Tensor output deserves an explanation too. When the LSTM weights were printed first, w came out full of strange tiny values (around 2.37e-38), but when only the size() run or a plain Tensor call was made, the values were all zero. This is not a bug: torch.Tensor(3, 4), like torch.empty, only allocates memory and never writes to it, so what gets printed is whatever bytes already sit in that block. Depending on whether the allocator hands back reused memory from freed tensors or fresh zeroed pages, you will see either leftover junk or zeros.
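A minimal demonstration, putting torch.empty (which is what torch.Tensor(3, 4) amounts to) next to an explicitly zero-filled tensor:

import torch

a = torch.empty(3, 4)   # allocated but never written: contents are arbitrary
b = torch.zeros(3, 4)   # explicitly zero-filled: always prints zeros
print(a)
print(b)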
Next, the values obtained by simply calling the initialization functions in nn.init (using the in-place, underscore-suffixed variants, since the bare names are deprecated):
w = torch.Tensor(3, 4)
print('w:', w)
w_uniform = nn.init.xavier_uniform_(w)
print('w_uniform:', w_uniform)
w_uniform = nn.init.xavier_uniform_(w, gain=0.25)
print('w_uniform:', w_uniform)
w_normal = nn.init.xavier_normal_(w, gain=0.25)
print('w_normal:', w_normal)
w_norm = nn.init.normal_(w, mean=0, std=1)
print('w_norm:', w_norm)
bias = torch.Tensor(3, 4)
print('bias:', bias)
b_con = nn.init.constant_(bias, 0.2)
print('b_con:', b_con)
The output is:
w: tensor([[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.]])
w_uniform: tensor([[-0.1295, -0.8957, 0.8389, -0.3132],
[ 0.0608, 0.2215, -0.9177, -0.5083],
[ 0.2126, 0.2032, -0.1209, 0.8002]])
w_uniform tensor([[ 0.0659, 0.0820, -0.1882, -0.0439],
[-0.0190, -0.2264, 0.1149, 0.0834],
[ 0.2150, -0.0422, 0.1726, 0.0316]])
w_normal tensor([[-0.0201, 0.2945, 0.0272, -0.2300],
[ 0.0718, 0.0022, 0.0796, -0.0851],
[ 0.0554, 0.0835, -0.0653, -0.2231]])
w_norm tensor([[ 0.7104, -2.0925, 1.1471, -0.0806],
[ 0.6488, 1.7645, -0.9153, 0.5130],
[ 0.6747, -0.5394, 0.0418, 1.4629]])
bias: tensor([[ 0.7104, -2.0925, 1.1471, -0.0806],
[ 0.6488, 1.7645, -0.9153, 0.5130],
[ 0.6747, -0.5394, 0.0418, 1.4629]])
b_con tensor([[0.2000, 0.2000, 0.2000, 0.2000],
[0.2000, 0.2000, 0.2000, 0.2000],
[0.2000, 0.2000, 0.2000, 0.2000]])
Two things stand out here. First, xavier_uniform_'s range is not a fixed (-1, 1): it samples from U(-a, a) with a = gain * sqrt(6 / (fan_in + fan_out)), which for this 3×4 tensor is about ±0.93 at gain=1, while xavier_normal_ and normal_ draw from normal distributions as expected. Second, although w and bias are both created with torch.Tensor(3, 4), their initial values clearly differ; this is the same uninitialized-memory behavior as above, not a bug. In fact bias prints exactly the numbers w held after normal_, presumably because the caching allocator handed back a recently freed block containing a copy of those values.
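To see the actual xavier_uniform_ bound for the 3×4 tensor above:

import math
# for a (3, 4) tensor: fan_in = 4 (columns), fan_out = 3 (rows)
a = math.sqrt(6.0 / (4 + 3))
print(a)   # ~0.926 at gain=1, matching the w_uniform values printed above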