1- Dropout implementation in layers.py
def dropout_forward(x, dropout_param):
"""
Performs the forward pass for (inverted) dropout.
Inputs:
- x: Input data, of any shape
- dropout_param: A dictionary with the following keys:
- p: Dropout parameter. We keep each neuron output with probability p.
- mode: 'test' or 'train'. If the mode is train, then perform dropout;
if the mode is test, then just return the input.
- seed: Seed for the random number generator. Passing seed makes this
function deterministic, which is needed for gradient checking but not
in real networks.
Outputs:
- out: Array of the same shape as x.
- cache: tuple (dropout_param, mask). In training mode, mask is the dropout
mask that was used to multiply the input; in test mode, mask is None.
NOTE: Please implement **inverted** dropout, not the vanilla version of dropout.
See http://cs231n.github.io/neural-networks-2/#reg for more details.
NOTE 2: Keep in mind that p is the probability of **keeping** a neuron
output; this might be contrary to some sources, where it is referred to
as the probability of dropping a neuron output.
"""
p, mode = dropout_param['p'], dropout_param['mode']
if 'seed' in dropout_param:
np.random.seed(dropout_param['seed'])
mask = None
out = None
if mode == 'train':
#######################################################################
# TODO: Implement training phase forward pass for inverted dropout. #
# Store the dropout mask in the mask variable. #
#######################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
mask = (np.random.rand(*x.shape) < p) / p
out = x * mask
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
#######################################################################
# END OF YOUR CODE #
#######################################################################
elif mode == 'test':
#######################################################################
# TODO: Implement the test phase forward pass for inverted dropout. #
#######################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
out = x
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
#######################################################################
# END OF YOUR CODE #
#######################################################################
cache = (dropout_param, mask)
out = out.astype(x.dtype, copy=False)
return out, cache
def dropout_backward(dout, cache):
"""
Perform the backward pass for (inverted) dropout.
Inputs:
- dout: Upstream derivatives, of any shape
- cache: (dropout_param, mask) from dropout_forward.
"""
dropout_param, mask = cache
mode = dropout_param['mode']
dx = None
if mode == 'train':
#######################################################################
# TODO: Implement training phase backward pass for inverted dropout #
#######################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
dx = dout * mask
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
#######################################################################
# END OF YOUR CODE #
#######################################################################
elif mode == 'test':
dx = dout
return dx
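A quick sanity check for the two functions above: in train mode the mean of the output should stay close to the mean of the input (that is the whole point of the inverted scaling by 1/p), while test mode should return the input untouched. A minimal sketch, assuming dropout_forward is importable from cs231n.layers:

import numpy as np
from cs231n.layers import dropout_forward

np.random.seed(231)
x = np.random.randn(500, 500) + 10.0

for p in [0.25, 0.4, 0.7]:
    out_train, _ = dropout_forward(x, {'mode': 'train', 'p': p})
    out_test, _ = dropout_forward(x, {'mode': 'test', 'p': p})
    # With inverted dropout the 1/p scaling happens at train time, so the
    # train-time mean stays close to mean(x) and test mode is the identity.
    print('p = %.2f  mean(x) = %.3f  train mean = %.3f  test mean = %.3f'
          % (p, x.mean(), out_train.mean(), out_test.mean()))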
2- Dropout layer helper functions added to layer_utils.py
def affine_bn_relu_dropout_forward(x, w, b, gamma, beta, bn_param, dropout_param):
"""
affine-bn-relu-dropout layer
"""
a, fc_cache = affine_forward(x, w, b)
bn, bn_cache = batchnorm_forward(a, gamma, beta, bn_param)
relu, relu_cache = relu_forward(bn)
out, dropout_cache = dropout_forward(relu, dropout_param)
cache = (fc_cache, bn_cache, relu_cache, dropout_cache)
return out, cache
def affine_bn_relu_dropout_backward(dout, cache):
"""
affine-bn-relu-dropout backward layer
"""
fc_cache, bn_cache, relu_cache, dropout_cache = cache
ddropout = dropout_backward(dout, dropout_cache)
drelu = relu_backward(ddropout, relu_cache)
dbn, dgamma, dbeta = batchnorm_backward(drelu, bn_cache)
dx, dw, db = affine_backward(dbn, fc_cache)
return dx, dw, db, dgamma, dbeta
def affine_ln_relu_dropout_forward(x, w, b, gamma, beta, ln_param, dropout_param):
"""
affine-ln-relu-dropout layer
"""
a, fc_cache = affine_forward(x, w, b)
ln, ln_cache = layernorm_forward(a, gamma, beta, ln_param)
relu, relu_cache = relu_forward(ln)
out, dropout_cache = dropout_forward(relu, dropout_param)
cache = (fc_cache, ln_cache, relu_cache, dropout_cache)
return out, cache
def affine_ln_relu_dropout_backward(dout, cache):
"""
affine-ln-relu-dropout backward layer
"""
fc_cache, ln_cache, relu_cache, dropout_cache = cache
ddropout = dropout_backward(dout, dropout_cache)
drelu = relu_backward(ddropout, relu_cache)
dln, dgamma, dbeta = layernorm_backward(drelu, ln_cache)
dx, dw, db = affine_backward(dln, fc_cache)
return dx, dw, db, dgamma, dbeta
def affine_relu_dropout_forward(x, w, b, dropout_param):
a, fc_cache = affine_forward(x, w, b)
relu_out, relu_cache = relu_forward(a)
out, dropout_cache = dropout_forward(relu_out, dropout_param)
cache = (fc_cache, relu_cache, dropout_cache)
return out, cache
def affine_relu_dropout_backward(dout, cache):
fc_cache, relu_cache, dropout_cache = cache
ddropout = dropout_backward(dout, dropout_cache)
da = relu_backward(ddropout, relu_cache)
dx, dw, db = affine_backward(da, fc_cache)
return dx, dw, db
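These composite helpers can be gradient-checked just like the individual layers. A minimal sketch, assuming the new functions are importable from cs231n.layer_utils and that eval_numerical_gradient_array from the assignment's cs231n.gradient_check module is available; the fixed dropout seed keeps the mask identical between the analytic and numeric passes:

import numpy as np
from cs231n.layer_utils import affine_relu_dropout_forward, affine_relu_dropout_backward
from cs231n.gradient_check import eval_numerical_gradient_array

np.random.seed(231)
x = np.random.randn(10, 8)
w = np.random.randn(8, 5)
b = np.random.randn(5)
dropout_param = {'mode': 'train', 'p': 0.8, 'seed': 123}

out, cache = affine_relu_dropout_forward(x, w, b, dropout_param)
dout = np.random.randn(*out.shape)
dx, dw, db = affine_relu_dropout_backward(dout, cache)

# The seed makes the dropout mask reproducible, so the numeric gradient is
# taken through exactly the same mask as the analytic one.
fx = lambda x: affine_relu_dropout_forward(x, w, b, dropout_param)[0]
dx_num = eval_numerical_gradient_array(fx, x, dout)
print('dx max abs error:', np.max(np.abs(dx - dx_num)))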
3- Changes to fc_net.py: adding the dropout branches
This part could actually be written more concisely: instead of checking on every forward and backward pass whether dropout is enabled, the check could be folded into the layer helper functions themselves, which would remove the redundant branching. I simply did not want to touch the other code, so here every forward and backward pass branches on whether dropout is used; a sketch of one possible simplification is shown below.
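A minimal sketch of that idea, assuming the usual layer functions from cs231n.layers are in scope; the names affine_relu_maybe_dropout_* are hypothetical and not part of the assignment:

def affine_relu_maybe_dropout_forward(x, w, b, dropout_param=None):
    # Hypothetical variant: dropout is applied only when a dropout_param dict is given.
    a, fc_cache = affine_forward(x, w, b)
    out, relu_cache = relu_forward(a)
    dropout_cache = None
    if dropout_param is not None:
        out, dropout_cache = dropout_forward(out, dropout_param)
    return out, (fc_cache, relu_cache, dropout_cache)

def affine_relu_maybe_dropout_backward(dout, cache):
    fc_cache, relu_cache, dropout_cache = cache
    if dropout_cache is not None:
        dout = dropout_backward(dout, dropout_cache)
    da = relu_backward(dout, relu_cache)
    return affine_backward(da, fc_cache)

The actual fc_net.py below keeps the explicit dropout branches instead: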
from builtins import range
from builtins import object
import numpy as np
from cs231n.layers import *
from cs231n.layer_utils import *
class TwoLayerNet(object):
"""
A two-layer fully-connected neural network with ReLU nonlinearity and
softmax loss that uses a modular layer design. We assume an input dimension
of D, a hidden dimension of H, and perform classification over C classes.
The architecture should be affine - relu - affine - softmax.
Note that this class does not implement gradient descent; instead, it
will interact with a separate Solver object that is responsible for running
optimization.
The learnable parameters of the model are stored in the dictionary
self.params that maps parameter names to numpy arrays.
"""
def __init__(self, input_dim=3*32*32, hidden_dim=100, num_classes=10,
weight_scale=1e-3, reg=0.0):
"""
Initialize a new network.
Inputs:
- input_dim: An integer giving the size of the input
- hidden_dim: An integer giving the size of the hidden layer
- num_classes: An integer giving the number of classes to classify
- weight_scale: Scalar giving the standard deviation for random
initialization of the weights.
- reg: Scalar giving L2 regularization strength.
"""
self.params = {}
self.reg = reg
############################################################################
# TODO: Initialize the weights and biases of the two-layer net. Weights #
# should be initialized from a Gaussian centered at 0.0 with #
# standard deviation equal to weight_scale, and biases should be #
# initialized to zero. All weights and biases should be stored in the #
# dictionary self.params, with first layer weights #
# and biases using the keys 'W1' and 'b1' and second layer #
# weights and biases using the keys 'W2' and 'b2'. #
############################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
self.params['W1'] = np.random.randn(input_dim, hidden_dim) * weight_scale
self.params['b1'] = np.zeros(hidden_dim)
self.params['W2'] = np.random.randn(hidden_dim, num_classes) * weight_scale
self.params['b2'] = np.zeros(num_classes)
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
############################################################################
# END OF YOUR CODE #
############################################################################
def loss(self, X, y=None):
"""
Compute loss and gradient for a minibatch of data.
Inputs:
- X: Array of input data of shape (N, d_1, ..., d_k)
- y: Array of labels, of shape (N,). y[i] gives the label for X[i].
Returns:
If y is None, then run a test-time forward pass of the model and return:
- scores: Array of shape (N, C) giving classification scores, where
scores[i, c] is the classification score for X[i] and class c.
If y is not None, then run a training-time forward and backward pass and
return a tuple of:
- loss: Scalar value giving the loss
- grads: Dictionary with the same keys as self.params, mapping parameter
names to gradients of the loss with respect to those parameters.
"""
scores = None
############################################################################
# TODO: Implement the forward pass for the two-layer net, computing the #
# class scores for X and storing them in the scores variable. #
############################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
#z1 = (X.reshape(X.shape[0], -1)).dot(self.params['W1']) + self.params['b1']
z1, cache_fc1 = affine_forward(X, self.params['W1'], self.params['b1'])
a1, cache_relu = relu_forward(z1)
scores, cache_fc2 = affine_forward(a1, self.params['W2'], self.params['b2'])
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
############################################################################
# END OF YOUR CODE #
############################################################################
# If y is None then we are in test mode so just return scores
if y is None:
return scores
loss, grads = 0, {}
############################################################################
# TODO: Implement the backward pass for the two-layer net. Store the loss #
# in the loss variable and gradients in the grads dictionary. Compute data #
# loss using softmax, and make sure that grads[k] holds the gradients for #
# self.params[k]. Don't forget to add L2 regularization! #
# #
# NOTE: To ensure that your implementation matches ours and you pass the #
# automated tests, make sure that your L2 regularization includes a factor #
# of 0.5 to simplify the expression for the gradient. #
############################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
# the provided softmax_loss returns both the loss and its gradient with respect to scores (dsoftmax)
loss, dsoftmax = softmax_loss(scores, y)
loss += 0.5 * self.reg * (np.sum(self.params['W2'] * self.params['W2']) + np.sum(self.params['W1'] * self.params['W1']))
dx2, grads['W2'], grads['b2'] = affine_backward(dsoftmax, cache_fc2)
# softmax_loss already divides the gradient by the batch size N, so no extra division by N is needed for W1/W2
grads['W2'] = grads['W2'] + self.reg * self.params['W2']
relu_back = relu_backward(dx2, cache_relu)
dx1, grads['W1'], grads['b1'] = affine_backward(relu_back, cache_fc1)
grads['W1'] = grads['W1'] + self.reg * self.params['W1']
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
############################################################################
# END OF YOUR CODE #
############################################################################
return loss, grads
class FullyConnectedNet(object):
"""
A fully-connected neural network with an arbitrary number of hidden layers,
ReLU nonlinearities, and a softmax loss function. This will also implement
dropout and batch/layer normalization as options. For a network with L layers,
the architecture will be
{affine - [batch/layer norm] - relu - [dropout]} x (L - 1) - affine - softmax
where batch/layer normalization and dropout are optional, and the {...} block is
repeated L - 1 times.
Similar to the TwoLayerNet above, learnable parameters are stored in the
self.params dictionary and will be learned using the Solver class.
"""
def __init__(self, hidden_dims, input_dim=3*32*32, num_classes=10,
dropout=1, normalization=None, reg=0.0,
weight_scale=1e-2, dtype=np.float32, seed=None):
"""
Initialize a new FullyConnectedNet.
Inputs:
- hidden_dims: A list of integers giving the size of each hidden layer.
- input_dim: An integer giving the size of the input.
- num_classes: An integer giving the number of classes to classify.
- dropout: Scalar between 0 and 1 giving dropout strength. If dropout=1 then
the network should not use dropout at all.
- normalization: What type of normalization the network should use. Valid values
are "batchnorm", "layernorm", or None for no normalization (the default).
- reg: Scalar giving L2 regularization strength.
- weight_scale: Scalar giving the standard deviation for random
initialization of the weights.
- dtype: A numpy datatype object; all computations will be performed using
this datatype. float32 is faster but less accurate, so you should use
float64 for numeric gradient checking.
- seed: If not None, then pass this random seed to the dropout layers. This
will make the dropout layers deterministic so we can gradient check the
model.
"""
self.normalization = normalization
self.use_dropout = dropout != 1
self.reg = reg
self.num_layers = 1 + len(hidden_dims)
self.dtype = dtype
self.params = {}
############################################################################
# TODO: Initialize the parameters of the network, storing all values in #
# the self.params dictionary. Store weights and biases for the first layer #
# in W1 and b1; for the second layer use W2 and b2, etc. Weights should be #
# initialized from a normal distribution centered at 0 with standard #
# deviation equal to weight_scale. Biases should be initialized to zero. #
# #
# When using batch normalization, store scale and shift parameters for the #
# first layer in gamma1 and beta1; for the second layer use gamma2 and #
# beta2, etc. Scale parameters should be initialized to ones and shift #
# parameters should be initialized to zeros. #
############################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
L = len(hidden_dims) # total number of hidden layers
np.random.seed(seed)
shape_one = input_dim
for i in range(L):
self.params['W'+str(i+1)] = np.random.randn(shape_one, hidden_dims[i]) * weight_scale
self.params['b'+str(i+1)] = np.zeros(hidden_dims[i])
shape_one = hidden_dims[i] # the output width of this layer becomes the input width of the next
if self.normalization is not None:
self.params['gamma'+str(i+1)] = np.ones(hidden_dims[i])
self.params['beta'+str(i+1)] = np.zeros(hidden_dims[i])
self.params['W'+str(L+1)] = np.random.randn(shape_one, num_classes) * weight_scale # final affine layer that produces the class scores
self.params['b'+str(L+1)] = np.zeros(num_classes)
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
############################################################################
# END OF YOUR CODE #
############################################################################
# When using dropout we need to pass a dropout_param dictionary to each
# dropout layer so that the layer knows the dropout probability and the mode
# (train / test). You can pass the same dropout_param to each dropout layer.
self.dropout_param = {}
if self.use_dropout:
self.dropout_param = {'mode': 'train', 'p': dropout}
if seed is not None:
self.dropout_param['seed'] = seed
# With batch normalization we need to keep track of running means and
# variances, so we need to pass a special bn_param object to each batch
# normalization layer. You should pass self.bn_params[0] to the forward pass
# of the first batch normalization layer, self.bn_params[1] to the forward
# pass of the second batch normalization layer, etc.
self.bn_params = []
if self.normalization=='batchnorm':
self.bn_params = [{'mode': 'train'} for i in range(self.num_layers - 1)] # one dict per batchnorm layer, each carrying its own mode
if self.normalization=='layernorm':
self.bn_params = [{} for i in range(self.num_layers - 1)]
# Cast all parameters to the correct datatype
for k, v in self.params.items():
self.params[k] = v.astype(dtype)
def loss(self, X, y=None):
"""
Compute loss and gradient for the fully-connected net.
Input / output: Same as TwoLayerNet above.
"""
X = X.astype(self.dtype) # cast the input to the network's dtype
mode = 'test' if y is None else 'train'
# Set train/test mode for batchnorm params and dropout param since they
# behave differently during training and testing.
if self.use_dropout:
self.dropout_param['mode'] = mode
if self.normalization=='batchnorm':
for bn_param in self.bn_params:
bn_param['mode'] = mode
scores = None
############################################################################
# TODO: Implement the forward pass for the fully-connected net, computing #
# the class scores for X and storing them in the scores variable. #
# #
# When using dropout, you'll need to pass self.dropout_param to each #
# dropout forward pass. #
# #
# When using batch normalization, you'll need to pass self.bn_params[0] to #
# the forward pass for the first batch normalization layer, pass #
# self.bn_params[1] to the forward pass for the second batch normalization #
# layer, etc. #
############################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
if self.normalization == 'batchnorm':
bn_relu_cache = {}
input_layer = X
if self.use_dropout:
for i in range(self.num_layers-1):
bn_relu_out, bn_relu_cache[i] = affine_bn_relu_dropout_forward(input_layer, self.params['W'+str(i+1)], self.params['b'+str(i+1)], self.params['gamma'+str(i+1)], self.params['beta'+str(i+1)], self.bn_params[i], self.dropout_param)
input_layer = bn_relu_out
else:
for i in range(self.num_layers-1):
bn_relu_out, bn_relu_cache[i] = affine_bn_relu_forward(input_layer, self.params['W'+str(i+1)], self.params['b'+str(i+1)], self.params['gamma'+str(i+1)], self.params['beta'+str(i+1)], self.bn_params[i])
input_layer = bn_relu_out
# The final layer is affine only (no ReLU); its cache is stored in the same dict purely for convenience, even though it is not really a relu cache
final_layer_out, bn_relu_cache[self.num_layers-1] = affine_forward(bn_relu_out, self.params['W'+str(self.num_layers)], self.params['b'+str(self.num_layers)])
scores = final_layer_out
elif self.normalization == 'layernorm':
ln_relu_cache = {}
input_layer = X
if self.use_dropout:
for i in range(self.num_layers-1):
ln_relu_out, ln_relu_cache[i] = affine_ln_relu_dropout_forward(input_layer, self.params['W'+str(i+1)], self.params['b'+str(i+1)], self.params['gamma'+str(i+1)], self.params['beta'+str(i+1)], self.bn_params[i], self.dropout_param)
input_layer = ln_relu_out
else:
for i in range(self.num_layers-1):
ln_relu_out, ln_relu_cache[i] = affine_ln_relu_forward(input_layer, self.params['W'+str(i+1)], self.params['b'+str(i+1)], self.params['gamma'+str(i+1)], self.params['beta'+str(i+1)], self.bn_params[i])
input_layer = ln_relu_out
# The final layer is affine only (no ReLU); its cache is stored in the same dict purely for convenience, even though it is not really a relu cache
final_layer_out, ln_relu_cache[self.num_layers-1] = affine_forward(ln_relu_out, self.params['W'+str(self.num_layers)], self.params['b'+str(self.num_layers)])
scores = final_layer_out
else:
relu_cache = {}
input_layer = X
if self.use_dropout:
for i in range(self.num_layers-1):
relu_out, relu_cache[i] = affine_relu_dropout_forward(input_layer, self.params['W'+str(i+1)], self.params['b'+str(i+1)], self.dropout_param)
input_layer = relu_out
else:
for i in range(self.num_layers-1):
relu_out, relu_cache[i] = affine_relu_forward(input_layer, self.params['W'+str(i+1)], self.params['b'+str(i+1)])
input_layer = relu_out
# The final layer is affine only (no ReLU); its cache is stored in the same dict purely for convenience, even though it is not really a relu cache
final_layer_out, relu_cache[self.num_layers-1] = affine_forward(relu_out, self.params['W'+str(self.num_layers)], self.params['b'+str(self.num_layers)])
scores = final_layer_out
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
############################################################################
# END OF YOUR CODE #
############################################################################
# If test mode return early
if mode == 'test':
return scores
loss, grads = 0.0, {}
############################################################################
# TODO: Implement the backward pass for the fully-connected net. Store the #
# loss in the loss variable and gradients in the grads dictionary. Compute #
# data loss using softmax, and make sure that grads[k] holds the gradients #
# for self.params[k]. Don't forget to add L2 regularization! #
# #
# When using batch/layer normalization, you don't need to regularize the scale #
# and shift parameters. #
# #
# NOTE: To ensure that your implementation matches ours and you pass the #
# automated tests, make sure that your L2 regularization includes a factor #
# of 0.5 to simplify the expression for the gradient. #
############################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
loss, dout = softmax_loss(scores, y)
for i in range(self.num_layers): # add the L2 regularization term to the loss
loss += 0.5 * self.reg * np.sum(self.params['W'+str(i+1)] * self.params['W'+str(i+1)])
if self.normalization == 'batchnorm':
# Backprop through the final affine layer first; it is handled separately because it has no norm/ReLU/dropout
dx, final_dw, final_db = affine_backward(dout, bn_relu_cache[self.num_layers-1])
grads['W'+str(self.num_layers)] = final_dw + self.reg * self.params['W'+str(self.num_layers)]
grads['b'+str(self.num_layers)] = final_db
if self.use_dropout:
for i in range(self.num_layers-1, 0, -1):
dx, dw, db, dgamma, dbeta = affine_bn_relu_dropout_backward(dx, bn_relu_cache[i-1])
grads['W'+str(i)] = dw + self.reg * self.params['W'+str(i)]
grads['b'+str(i)] = db
grads['gamma'+str(i)] = dgamma
grads['beta'+str(i)] = dbeta
# the remaining (earlier) layers are handled in a loop
else:
for i in range(self.num_layers-1, 0, -1):
dx, dw, db, dgamma, dbeta = affine_bn_relu_backward(dx, bn_relu_cache[i-1])
grads['W'+str(i)] = dw + self.reg * self.params['W'+str(i)]
grads['b'+str(i)] = db
grads['gamma'+str(i)] = dgamma
grads['beta'+str(i)] = dbeta
elif self.normalization == 'layernorm':
# Backprop through the final affine layer first; it is handled separately because it has no norm/ReLU/dropout
dx, final_dw, final_db = affine_backward(dout, ln_relu_cache[self.num_layers-1])
grads['W'+str(self.num_layers)] = final_dw + self.reg * self.params['W'+str(self.num_layers)]
grads['b'+str(self.num_layers)] = final_db
if self.use_dropout:
for i in range(self.num_layers-1, 0, -1):
dx, dw, db, dgamma, dbeta = affine_ln_relu_dropout_backward(dx, ln_relu_cache[i-1])
grads['W'+str(i)] = dw + self.reg * self.params['W'+str(i)]
grads['b'+str(i)] = db
grads['gamma'+str(i)] = dgamma
grads['beta'+str(i)] = dbeta
# the remaining (earlier) layers are handled in a loop
else:
for i in range(self.num_layers-1, 0, -1):
dx, dw, db, dgamma, dbeta = affine_ln_relu_backward(dx, ln_relu_cache[i-1])
grads['W'+str(i)] = dw + self.reg * self.params['W'+str(i)]
grads['b'+str(i)] = db
grads['gamma'+str(i)] = dgamma
grads['beta'+str(i)] = dbeta
else:
# Backprop through the final affine layer first; it is handled separately because it has no ReLU/dropout
dx, final_dw, final_db = affine_backward(dout, relu_cache[self.num_layers-1])
grads['W'+str(self.num_layers)] = final_dw + self.reg * self.params['W'+str(self.num_layers)]
grads['b'+str(self.num_layers)] = final_db
if self.use_dropout:
for i in range(self.num_layers-1, 0, -1):
dx, dw, db = affine_relu_dropout_backward(dx, relu_cache[i-1])
grads['W'+str(i)] = dw + self.reg * self.params['W'+str(i)]
grads['b'+str(i)] = db
# the remaining (earlier) layers are handled in a loop
else:
for i in range(self.num_layers-1, 0, -1):
dx, dw, db = affine_relu_backward(dx, relu_cache[i-1])
grads['W'+str(i)] = dw + self.reg * self.params['W'+str(i)]
grads['b'+str(i)] = db
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
############################################################################
# END OF YOUR CODE #
############################################################################
return loss, grads
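Finally, the dropout path of FullyConnectedNet can be exercised end to end with a numerical gradient check, much as in the assignment notebook. A minimal sketch with arbitrarily chosen sizes, assuming the class is importable from cs231n.classifiers.fc_net and eval_numerical_gradient from cs231n.gradient_check:

import numpy as np
from cs231n.classifiers.fc_net import FullyConnectedNet
from cs231n.gradient_check import eval_numerical_gradient

np.random.seed(231)
N, D, H1, H2, C = 2, 15, 20, 30, 10
X = np.random.randn(N, D)
y = np.random.randint(C, size=(N,))

# dropout=0.75 keeps each activation with probability 0.75; the fixed seed makes
# the dropout masks reproducible so numeric and analytic gradients agree.
model = FullyConnectedNet([H1, H2], input_dim=D, num_classes=C,
                          dropout=0.75, weight_scale=5e-2, dtype=np.float64, seed=123)

loss, grads = model.loss(X, y)
for name in sorted(grads):
    f = lambda _: model.loss(X, y)[0]
    grad_num = eval_numerical_gradient(f, model.params[name], verbose=False, h=1e-5)
    rel_err = np.max(np.abs(grad_num - grads[name]) /
                     np.maximum(1e-8, np.abs(grad_num) + np.abs(grads[name])))
    print('%s relative error: %.2e' % (name, rel_err))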