PyTorch [60-Day Training Plan], Stage One (Getting Started): Implementing Softmax Regression

DAY 5


3.6 IMPLEMENTING SOFTMAX REGRESSION FROM SCRATCH

import torch
import torchvision
import numpy as np

3.6.1 OBTAINING AND READING THE DATA

We use the Fashion-MNIST dataset and set the batch size to 256.

import torchvision.transforms as transforms
batch_size = 256
mnist_train = torchvision.datasets.FashionMNIST(root='./data', 
                                                train=True, download=True, transform=transforms.ToTensor())
mnist_test = torchvision.datasets.FashionMNIST(root='./data', 
                                               train=False, download=True, transform=transforms.ToTensor())
train_iter = torch.utils.data.DataLoader(mnist_train, batch_size=batch_size, 
                                         shuffle=True, num_workers=4)
test_iter = torch.utils.data.DataLoader(mnist_test, batch_size=batch_size, 
                                        shuffle=False, num_workers=4)

3.6.2 INITIALIZING MODEL PARAMETERS

As in the linear regression example, we represent each sample with a vector.
Each input sample is an image whose height and width are both 28 pixels, so the model's input vector has length 28 × 28 = 784: each element of the vector corresponds to one pixel of the image. Since the images belong to 10 categories, the output layer of this single-layer network has 10 outputs, so the weight and bias parameters of softmax regression are matrices of shape 784 × 10 and 1 × 10, respectively.
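
In matrix form, the single-layer model implemented below computes, for a mini-batch X of n samples (shape n × 784),
O = XW + b, \quad \hat{Y} = \mathrm{softmax}(O)
so that O and \hat{Y} both have shape n × 10.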

num_inputs = 784
num_outputs = 10

W = torch.tensor(np.random.normal(0, 0.01, (num_inputs, num_outputs)), dtype = torch.float)
b = torch.zeros(num_outputs, dtype = torch.float)

# enable gradient tracking so the parameters can be updated during training
W.requires_grad_(requires_grad = True)
b.requires_grad_(requires_grad = True)
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)

3.6.3 IMPLEMENTING THE SOFTMAX OPERATION

The softmax formula is
y_k = \frac{\exp(x_k)}{\sum_{i=1}^{m} \exp(x_i)}, \quad k = 1, \dots, m
where \sum_{i=1}^{m} y_i = 1.

# softmax
def softmax(X):
    X_exp = X.exp()
    # dim=0 sums elements in the same column; dim=1 sums elements in the same row
    partition = X_exp.sum(dim = 1, keepdim = True)
    return X_exp / partition # broadcast
    
# with random inputs, softmax turns every element into a non-negative number and each row sums to 1
X = torch.rand((2, 5))
X_prob = softmax(X)
print(X_prob, X_prob.sum(dim= 1))
tensor([[0.2833, 0.1352, 0.2536, 0.1701, 0.1577],
        [0.2230, 0.1497, 0.2517, 0.1424, 0.2332]]) tensor([1.0000, 1.0000])

3.6.4 DEFINING THE MODEL

def net(X):
    return softmax(torch.mm(X.view(-1, num_inputs), W) + b)

3.6.5 DEFINING THE LOSS FUNCTION

Softmax regression uses the cross-entropy loss function:
H(y^{(i)}, \hat{y}^{(i)}) = -\sum_{j=1}^{q} y_j^{(i)} \log \hat{y}_j^{(i)}

If the training set has n samples, the cross-entropy loss function is defined as
\zeta(\Theta) = \frac{1}{n} \sum_{i=1}^{n} H(y^{(i)}, \hat{y}^{(i)})

To obtain the predicted probability of each sample's label, we can use the gather function.

def cross_entropy(y_hat, y):
    return - torch.log(y_hat.gather(1, y.view(-1, 1)))
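
To see how gather picks out the probability that y_hat assigns to each sample's true label, here is a toy example with made-up numbers (not from the training data):

# toy example: 2 samples, 3 classes; y holds the true class index of each sample
y_hat = torch.tensor([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
y = torch.LongTensor([0, 2])
print(y_hat.gather(1, y.view(-1, 1)))  # tensor([[0.1000], [0.5000]])
print(cross_entropy(y_hat, y))         # the negative log of those probabilities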

3.6.6 COMPUTING CLASSIFICATION ACCURACY

Given the predicted probability distribution y_hat over the classes, we take the class with the highest predicted probability as the output class. If it matches the true class y, the prediction is correct. Classification accuracy is the ratio of the number of correct predictions to the total number of predictions.

Below we define the accuracy function. Here y_hat.argmax(dim=1) returns the index of the largest element in each row of the matrix y_hat, and the result has the same shape as the variable y. The equality comparison (y_hat.argmax(dim=1) == y) is a Tensor of type ByteTensor; we use float() to convert it to a floating-point Tensor whose values are 0 (false) or 1 (true).

def accuracy(y_hat, y):
    return (y_hat.argmax(dim = 1) == y).float().mean().item()
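
As a quick sanity check, reuse the toy y_hat and y from above (made-up numbers): the first sample is predicted as class 2 but its label is 0, while the second is predicted correctly, so the accuracy should be 0.5.

y_hat = torch.tensor([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
y = torch.LongTensor([0, 2])
print(accuracy(y_hat, y))  # 0.5
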
# evaluate the accuracy of net on a whole dataset
def evaluate_accuracy(data_iter, net):
    acc_sum, n = 0.0, 0
    for X, y in data_iter:
        acc_sum += (net(X).argmax(dim = 1) == y).float().sum().item()
        n += y.shape[0]
    return acc_sum / n

3.6.7 TRAINING THE MODEL

We use mini-batch stochastic gradient descent to optimize the model's loss function. When training the model, the number of epochs num_epochs and the learning rate lr are both tunable hyperparameters.

num_epochs, lr = 5, 0.1

def train(net, train_iter, test_iter, loss, num_epochs, batch_size, 
          params = None, lr = None, optimizer = None):
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:
            y_hat = net(X)
            l = loss(y_hat, y).sum()
            
            # zero the gradients
            if optimizer is not None:
                optimizer.zero_grad()
            elif params is not None and params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
            
            l.backward()
            if optimizer is None:
                for param in params:
                    param.data -= lr * param.grad / batch_size # note: param is updated in place; divide by batch_size because l sums the per-sample losses
            else:
                optimizer.step()
            
            train_l_sum += l.item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().item()
            n += y.shape[0]
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f' 
              %(epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))
train(net, train_iter, test_iter, cross_entropy, num_epochs, batch_size, [W, b], lr)
epoch 1, loss 0.7861, train acc 0.750, test acc 0.793
epoch 2, loss 0.5715, train acc 0.812, test acc 0.810
epoch 3, loss 0.5263, train acc 0.826, test acc 0.818
epoch 4, loss 0.5008, train acc 0.833, test acc 0.824
epoch 5, loss 0.4842, train acc 0.837, test acc 0.827
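
With training finished, the learned parameters can be used to classify test images. A minimal sketch (the text_labels list below is my own shorthand, assuming the standard Fashion-MNIST class order, and is only for illustration):

# map class indices to text labels (assumed Fashion-MNIST order) and compare a few predictions
text_labels = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat',
               'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
X, y = next(iter(test_iter))
true_labels = [text_labels[int(i)] for i in y[:5]]
pred_labels = [text_labels[int(i)] for i in net(X).argmax(dim=1)[:5]]
print(list(zip(true_labels, pred_labels)))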

3.7 CONCISE IMPLEMENTATION OF SOFTMAX REGRESSION

import torch.nn as nn
class LinearNet(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super(LinearNet, self).__init__()
        self.linear = nn.Linear(num_inputs, num_outputs)
    
    def forward(self, x): # x shape: (batch_size, 1, 28, 28); view() reshapes x to (batch_size, 784)
        y = self.linear(x.view(x.shape[0], -1))
        return y
    
net = LinearNet(num_inputs, num_outputs)
# wrap this reshaping of x in a custom `FlattenLayer` class
class FlattenLayer(nn.Module):
    def __init__(self):
        super(FlattenLayer, self).__init__()
    def forward(self, x):
        return x.view(x.shape[0], -1)
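
A quick shape check of this custom layer (a small sanity test on a random fake batch):

# a batch of 2 fake 1 x 28 x 28 images is flattened to shape (2, 784)
print(FlattenLayer()(torch.rand(2, 1, 28, 28)).shape)  # torch.Size([2, 784])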

# define the model
from collections import OrderedDict
net = nn.Sequential(
    OrderedDict([
        ('flatten', FlattenLayer()),
        ('linear', nn.Linear(num_inputs, num_outputs))
    ])
)
# initialize the model's weight parameters randomly from a normal distribution with mean 0 and standard deviation 0.01
from torch.nn import init
init.normal_(net.linear.weight, mean = 0, std = 0.01)
init.constant_(net.linear.bias, val = 0)
Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)
# define the loss function and the optimizer; nn.CrossEntropyLoss applies (log-)softmax internally, so the network outputs raw scores
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr = 0.1)
# train the model
num_epochs = 5
train(net, train_iter, test_iter, loss, num_epochs, batch_size, None, None, optimizer)
epoch 1, loss 0.0031, train acc 0.748, test acc 0.788
epoch 2, loss 0.0022, train acc 0.813, test acc 0.813
epoch 3, loss 0.0021, train acc 0.826, test acc 0.813
epoch 4, loss 0.0020, train acc 0.832, test acc 0.821
epoch 5, loss 0.0019, train acc 0.836, test acc 0.826

3.8 MULTILAYER PERCEPTRON

So far we have only covered single-layer networks, but neural networks in deep learning usually have many layers, so next we implement a multilayer network.

H = \phi(XW_h + b_h),
O = HW_o + b_o

where \phi is the activation function. Commonly used activation functions include ReLU, sigmoid, and tanh.
For classification problems, we can apply the softmax operation to the output O and use the cross-entropy loss function from softmax regression. For regression problems, we set the number of outputs of the output layer to 1 and feed the output O directly into the squared loss used in linear regression.
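
To make the activation functions mentioned above concrete, here is a quick check of PyTorch's built-in versions on a small tensor (illustrative values only):

# ReLU clips negatives to 0; sigmoid squashes to (0, 1); tanh squashes to (-1, 1)
x = torch.tensor([-2.0, -0.5, 0.0, 0.5, 2.0])
print(torch.relu(x))
print(torch.sigmoid(x))
print(torch.tanh(x))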

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torchvision.transforms as transforms

batch_size = 256
mnist_train = torchvision.datasets.FashionMNIST(root='./data', 
                                                train=True, download=True, transform=transforms.ToTensor())
mnist_test = torchvision.datasets.FashionMNIST(root='./data', 
                                               train=False, download=True, transform=transforms.ToTensor())
train_iter = torch.utils.data.DataLoader(mnist_train, batch_size=batch_size, 
                                         shuffle=True, num_workers=4)
test_iter = torch.utils.data.DataLoader(mnist_test, batch_size=batch_size, 
                                        shuffle=False, num_workers=4)

3.8.1 DEFINING MODEL PARAMETERS

Set the number of hidden units, a hyperparameter, to 256.

num_inputs, num_outputs, num_hiddens = 784, 10, 256

# wrap this reshaping of x in a custom `FlattenLayer` class
class FlattenLayer(nn.Module):
    def __init__(self):
        super(FlattenLayer, self).__init__()
    def forward(self, x):
        return x.view(x.shape[0], -1)
    
net = nn.Sequential(
    FlattenLayer(),
    nn.Linear(num_inputs, num_hiddens),
    nn.ReLU(),
    nn.Linear(num_hiddens, num_outputs),
)

from torch.nn import init

for params in net.parameters():
    init.normal_(params, mean = 0, std = 0.01)

3.8.2 TRAINING FUNCTION

# evaluate the accuracy of net on a whole dataset
def evaluate_accuracy(data_iter, net):
    acc_sum, n = 0.0, 0
    for X, y in data_iter:
        acc_sum += (net(X).argmax(dim = 1) == y).float().sum().item()
        n += y.shape[0]
    return acc_sum / n

def train(net, train_iter, test_iter, loss, num_epochs, batch_size, 
          params = None, lr = None, optimizer = None):
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:
            y_hat = net(X)
            l = loss(y_hat, y).sum()
            
            # zero the gradients
            if optimizer is not None:
                optimizer.zero_grad()
            elif params is not None and params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
            
            l.backward()
            if optimizer is None:
                for param in params:
                    param.data -= lr * param.grad / batch_size # note: param is updated in place; divide by batch_size because l sums the per-sample losses
            else:
                optimizer.step()
            
            train_l_sum += l.item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().item()
            n += y.shape[0]
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f' 
              %(epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))

3.8.3 READING THE DATA AND TRAINING THE MODEL

batch_size = 256
loss = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr = 0.5)

num_epochs = 5
train(net, train_iter, test_iter, loss, num_epochs,batch_size, None, None, optimizer)
epoch 1, loss 0.0019, train acc 0.823, test acc 0.815
epoch 2, loss 0.0017, train acc 0.841, test acc 0.829
epoch 3, loss 0.0015, train acc 0.856, test acc 0.844
epoch 4, loss 0.0015, train acc 0.863, test acc 0.806
epoch 5, loss 0.0014, train acc 0.868, test acc 0.859
