Weight initialization when building models in PyTorch

When reading other people's code, I noticed that good models usually initialize their weights, so I first searched for ways to implement weight initialization in PyTorch and then ran a few experiments of my own to see what effect it has.
To start, I implemented a simple BiLSTM and generated some random data to see how well it can fit.

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# a bidirectional LSTM followed by a small MLP classifier
class BILSTM(nn.Module):
    def __init__(self):
        super(BILSTM, self).__init__()
        self.input_size = 100
        self.batch_first = True
        self.hidden = 50
        self.num_layers = 1
        # the bidirectional LSTM
        self.lstm = nn.LSTM(self.input_size, self.hidden, self.num_layers,
                            batch_first=self.batch_first, bidirectional=True)
        # an MLP as the classifier: seq_len(10) * hidden(50) * 2 directions = 1000 features
        self.predict_layer_1 = nn.Linear(1000, 100)
        self.predict_layer_2 = nn.Linear(100, 2)

        # self.init_weight()

    # weight initialization
    def init_weight(self):
        print(self.input_size)
        for name, param in self.lstm.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.xavier_uniform_(param, gain=1)

    # forward pass
    def forward(self, x):
        outputs, _ = self.lstm(x)
        outputs = outputs.reshape(1, -1)
        out_1 = self.predict_layer_1(outputs)
        out = self.predict_layer_2(F.relu(out_1))
        # note: CrossEntropyLoss below already applies log-softmax internally,
        # so returning raw logits would normally be preferred; softmax is kept
        # here so the results below stay reproducible
        return F.softmax(out, dim=1)
# generate random data: 10 samples, each a 10 x 100 sequence
# (requires_grad on the inputs is not actually needed here)
tensor_x = torch.randn(10000, requires_grad=True).reshape(10, 10, 100)
tensor_y = torch.randint(low=0, high=2, size=(10, 1))
print(tensor_y)
# build the model
bilstm = BILSTM()
lr = 0.001
epoch = 10
# optimizer
optimer = optim.Adam(bilstm.parameters(), lr=lr)
# loss function
loss_func = torch.nn.CrossEntropyLoss()
tensor = []
# pair each sample with its label
for i in range(len(tensor_y)):
    tensor.append((tensor_x[i], tensor_y[i]))
loss_plot = []
# start training
for i in range(epoch):
    total_loss = 0.0
    for (x, y) in tensor:
        x = x.unsqueeze(0)  # add a batch dimension
        outputs = bilstm(x)
        loss = loss_func(outputs, y)
        total_loss += loss  # accumulating the tensor keeps the graph; loss.item() would be lighter
        optimer.zero_grad()
        loss.backward()
        optimer.step()
    loss_plot.append(total_loss)
    print(i)
    print(total_loss)

The output is as follows:

0
tensor(7.0073, grad_fn=<AddBackward0>)
1
tensor(5.3326, grad_fn=<AddBackward0>)
2
tensor(4.1300, grad_fn=<AddBackward0>)
3
tensor(3.4265, grad_fn=<AddBackward0>)
4
tensor(3.2202, grad_fn=<AddBackward0>)
5
tensor(3.1664, grad_fn=<AddBackward0>)
6
tensor(3.1497, grad_fn=<AddBackward0>)
7
tensor(3.1435, grad_fn=<AddBackward0>)
8
tensor(3.1406, grad_fn=<AddBackward0>)
9
tensor(3.1390, grad_fn=<AddBackward0>)
10
tensor(3.1380, grad_fn=<AddBackward0>)

If we call init_weight() to initialize the LSTM weights:

bilstm.init_weight()

we get:

0
tensor(6.9224, grad_fn=<AddBackward0>)
1
tensor(5.2937, grad_fn=<AddBackward0>)
2
tensor(4.1650, grad_fn=<AddBackward0>)
3
tensor(3.4500, grad_fn=<AddBackward0>)
4
tensor(3.2249, grad_fn=<AddBackward0>)
5
tensor(3.1677, grad_fn=<AddBackward0>)
6
tensor(3.1505, grad_fn=<AddBackward0>)
7
tensor(3.1440, grad_fn=<AddBackward0>)
8
tensor(3.1410, grad_fn=<AddBackward0>)
9
tensor(3.1393, grad_fn=<AddBackward0>)
10
tensor(3.1383, grad_fn=<AddBackward0>)

There hardly seems to be any difference!
However, if we restrict the initialization range to (-0.25, 0.25) instead (one possible way to do this is sketched below),
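The post does not show the code for this particular run. A minimal sketch of one literal reading, sampling every LSTM weight uniformly from (-0.25, 0.25) with a hypothetical init_weight_uniform helper (the later version of init_weight instead passes gain=0.25 to xavier_uniform_), could look like this:

def init_weight_uniform(model, low=-0.25, high=0.25):
    # hypothetical helper mirroring init_weight above: zero the biases and
    # sample the LSTM weights uniformly from (low, high)
    for name, param in model.lstm.named_parameters():
        if 'bias' in name:
            nn.init.constant_(param, 0.0)
        elif 'weight' in name:
            nn.init.uniform_(param, low, high)

bilstm = BILSTM()
init_weight_uniform(bilstm)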
we get:

0
tensor(6.8440, grad_fn=<AddBackward0>)
1
tensor(5.9343, grad_fn=<AddBackward0>)
2
tensor(4.5523, grad_fn=<AddBackward0>)
3
tensor(3.5413, grad_fn=<AddBackward0>)
4
tensor(3.2001, grad_fn=<AddBackward0>)
5
tensor(3.1418, grad_fn=<AddBackward0>)
6
tensor(3.1353, grad_fn=<AddBackward0>)
7
tensor(3.1340, grad_fn=<AddBackward0>)
8
tensor(3.1337, grad_fn=<AddBackward0>)
9
tensor(3.1335, grad_fn=<AddBackward0>)
10
tensor(3.1334, grad_fn=<AddBackward0>)

The results really did improve. So initialization is not just a matter of picking some value yourself; it pays to look at how others set theirs up.
Next, modify init_weight so that the MLP layers are initialized as well:

    def init_weight(self):
        print(self.input_size)
        for name, param in self.lstm.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.xavier_uniform_(param, gain=0.25)
        # also initialize the weights of the two Linear layers
        nn.init.xavier_uniform_(self.predict_layer_1.weight, gain=0.25)
        nn.init.xavier_uniform_(self.predict_layer_2.weight, gain=0.25)

The output is:

tensor(6.8529, grad_fn=<AddBackward0>)
1
tensor(6.3459, grad_fn=<AddBackward0>)
2
tensor(5.1105, grad_fn=<AddBackward0>)
3
tensor(3.7576, grad_fn=<AddBackward0>)
4
tensor(3.3176, grad_fn=<AddBackward0>)
5
tensor(3.1597, grad_fn=<AddBackward0>)
6
tensor(3.1368, grad_fn=<AddBackward0>)
7
tensor(3.1340, grad_fn=<AddBackward0>)
8
tensor(3.1334, grad_fn=<AddBackward0>)
9
tensor(3.1332, grad_fn=<AddBackward0>)
10
tensor(3.1331, grad_fn=<AddBackward0>)

Judging from the results, the improvement is not large, but it is still an improvement.
So if you design a model, do not forget to initialize every weight matrix in it; a generic way to do that is sketched below.
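As a hedged sketch of that advice (not from the original post), one common pattern is to put the initialization in a single function and push it through every submodule with Module.apply, so that no layer is forgotten:

def init_all_weights(module):
    # hypothetical helper: Xavier-uniform for Linear and LSTM weights,
    # zeros for all biases
    if isinstance(module, nn.Linear):
        nn.init.xavier_uniform_(module.weight, gain=0.25)
        nn.init.constant_(module.bias, 0.0)
    elif isinstance(module, nn.LSTM):
        for name, param in module.named_parameters():
            if 'weight' in name:
                nn.init.xavier_uniform_(param, gain=0.25)
            elif 'bias' in name:
                nn.init.constant_(param, 0.0)

bilstm = BILSTM()
bilstm.apply(init_all_weights)  # apply() visits every submodule recursively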

Now let's look at what PyTorch's various initialization functions actually produce.
First, the default (random) initialization.

bilstm = BILSTM()
# print the LSTM weight matrices that PyTorch initializes by default,
# together with their shapes
for name, parm in bilstm.lstm.named_parameters():
    if 'weight' in name:
        print(parm)
        print(parm.size())

# an uninitialized tensor, created the way another blog post does
w = torch.Tensor(3, 4)
print(w)

Below are the LSTM parameters, followed by the value of a tensor created with torch.Tensor, the "random initialization" seen in another blog post.

Parameter containing:
tensor([[ 0.0672, -0.1371, -0.1015,  ..., -0.0548, -0.0286, -0.1193],
        [-0.0230, -0.0572, -0.0824,  ...,  0.1388, -0.1184, -0.0941],
        [-0.0675,  0.0214,  0.1078,  ..., -0.0245, -0.0825,  0.1242],
        ...,
        [-0.0446, -0.0175, -0.1203,  ...,  0.0902, -0.0098,  0.1057],
        [ 0.1046,  0.0263,  0.0200,  ...,  0.0648, -0.0403, -0.0828],
        [-0.0945,  0.0827,  0.0854,  ..., -0.0612,  0.1071, -0.0015]],
       requires_grad=True)
Parameter containing:
tensor([[-0.1040,  0.1334, -0.0364,  ...,  0.0834,  0.1374,  0.0031],
        [ 0.0660, -0.0556, -0.0761,  ..., -0.0565, -0.0226, -0.0068],
        [-0.0622, -0.0794, -0.1130,  ...,  0.0435, -0.1034,  0.0127],
        ...,
        [ 0.0165, -0.0373, -0.1080,  ..., -0.0270,  0.0420, -0.0621],
        [-0.1387,  0.0976, -0.0911,  ...,  0.0946, -0.0685,  0.0816],
        [ 0.0780, -0.0704, -0.0675,  ...,  0.0620, -0.0110,  0.0602]],
       requires_grad=True)
Parameter containing:
tensor([[ 2.2537e-03,  9.6619e-02,  7.7635e-05,  ..., -9.8272e-02,
          9.2248e-02, -3.9105e-02],
        [ 5.5187e-03, -8.7172e-02,  1.2426e-01,  ...,  9.6126e-02,
          1.8508e-02,  4.2137e-02],
        [-3.2823e-02, -3.1873e-02,  5.6963e-02,  ...,  5.3042e-02,
          1.1010e-01, -1.2473e-01],
        ...,
        [-9.8956e-02, -4.5421e-02,  3.1055e-02,  ..., -9.8684e-02,
         -9.3469e-03, -1.2118e-01],
        [-7.6992e-02, -8.7227e-02, -3.6597e-02,  ...,  2.6205e-02,
         -1.1190e-02, -6.1281e-02],
        [-6.5006e-02,  8.0247e-02,  8.5139e-02,  ..., -1.0417e-02,
          8.8553e-02, -3.6364e-02]], requires_grad=True)
Parameter containing:
tensor([[-0.0591,  0.0769, -0.0983,  ...,  0.0991,  0.0039,  0.0317],
        [ 0.0646, -0.0065, -0.0473,  ...,  0.0223, -0.0831,  0.1218],
        [-0.0016, -0.0657, -0.1060,  ..., -0.0328, -0.0774, -0.0739],
        ...,
        [-0.1156,  0.0403,  0.1401,  ...,  0.1112,  0.0567,  0.1378],
        [ 0.1080, -0.0291,  0.1341,  ..., -0.0532, -0.0484,  0.0910],
        [-0.1250,  0.0434, -0.1003,  ...,  0.0774,  0.0956,  0.0753]],
       requires_grad=True)
tensor([[2.3694e-38, 2.3694e-38, 2.3694e-38, 2.3694e-38],
        [2.3694e-38, 2.3694e-38, 2.3694e-38, 2.3694e-38],
        [2.3694e-38, 0.0000e+00, 0.0000e+00, 0.0000e+00]])

And their size():

torch.Size([200, 100])
torch.Size([200, 50])
torch.Size([200, 100])
torch.Size([200, 50])
tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])

From the above, the values from the model's automatic (default) initialization are really not that extreme, at least not as extreme as the blog post 《Pytorch權重初始化》 makes them out to be.
Also, the output of torch.Tensor itself puzzles me: when it is printed right after the LSTM weights, its values turn into very strange numbers, but if we only print the size(), or simply call torch.Tensor on its own, everything comes out as zeros. Is this a bug, or something else? I'd be grateful if someone could explain.
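One plausible explanation (my own note, not from the original post) is that torch.Tensor(3, 4), like torch.empty, allocates memory without initializing it, so the printed numbers are whatever happened to be in that memory and can differ between runs; a small sketch for comparison:

# torch.Tensor(3, 4) / torch.empty(3, 4) allocate uninitialized memory,
# so their contents are arbitrary and may change from run to run
w_empty = torch.empty(3, 4)
print(w_empty)

# explicitly initialized constructors give well-defined contents
print(torch.zeros(3, 4))
print(torch.randn(3, 4))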

Next, the values produced by directly calling the initialization functions in nn.init.

w = torch.Tensor(3, 4)  # uninitialized tensor
print(w)
w_uniform = nn.init.xavier_uniform_(w)  # in-place, default gain=1
print(w_uniform)
w_uniform = nn.init.xavier_uniform_(w, gain=0.25)
print(w_uniform)
w_normal = nn.init.xavier_normal_(w, gain=0.25)
print(w_normal)
w_norm = nn.init.normal_(w, mean=0, std=1)  # standard normal
print(w_norm)
bias = torch.Tensor(3, 4)
print(bias)
b_con = nn.init.constant_(bias, 0.2)  # fill with a constant
print(b_con)

The output is as follows:

w: tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])
w_uniform: tensor([[-0.1295, -0.8957,  0.8389, -0.3132],
        [ 0.0608,  0.2215, -0.9177, -0.5083],
        [ 0.2126,  0.2032, -0.1209,  0.8002]])

w_uniform tensor([[ 0.0659,  0.0820, -0.1882, -0.0439],
        [-0.0190, -0.2264,  0.1149,  0.0834],
        [ 0.2150, -0.0422,  0.1726,  0.0316]])

w_normal tensor([[-0.0201,  0.2945,  0.0272, -0.2300],
        [ 0.0718,  0.0022,  0.0796, -0.0851],
        [ 0.0554,  0.0835, -0.0653, -0.2231]])
w_norm tensor([[ 0.7104, -2.0925,  1.1471, -0.0806],
        [ 0.6488,  1.7645, -0.9153,  0.5130],
        [ 0.6747, -0.5394,  0.0418,  1.4629]])
bias: tensor([[ 0.7104, -2.0925,  1.1471, -0.0806],
        [ 0.6488,  1.7645, -0.9153,  0.5130],
        [ 0.6747, -0.5394,  0.0418,  1.4629]])
b_con tensor([[0.2000, 0.2000, 0.2000, 0.2000],
        [0.2000, 0.2000, 0.2000, 0.2000],
        [0.2000, 0.2000, 0.2000, 0.2000]])

The range of xavier_uniform_() depends on the tensor shape rather than being a fixed -1 to 1 (for this 3x4 tensor it is roughly (-0.93, 0.93)), and normal_() samples from a normal distribution. Also, w and bias were both created with the same torch.Tensor(size) call, yet their printed values are clearly different. Is that a bug?
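For reference, the bound that xavier_uniform_ uses is gain * sqrt(6 / (fan_in + fan_out)), so the range is shape-dependent; a quick check of the numbers for a 3x4 matrix (fan_in=4, fan_out=3):

import math

def xavier_uniform_bound(fan_in, fan_out, gain=1.0):
    # half-width of the uniform distribution used by xavier_uniform_
    return gain * math.sqrt(6.0 / (fan_in + fan_out))

print(xavier_uniform_bound(4, 3))        # ~0.926, matching the default-gain output above
print(xavier_uniform_bound(4, 3, 0.25))  # ~0.231, matching the gain=0.25 output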
