簡介
該論文提出了一種新的模型縮放方法,它使用一個簡單而高效的複合係數,以更結構化的方式放大 CNN。不像傳統方法那樣任意縮放網絡的寬度、深度和分辨率等維度,該論文的方法用一組固定的縮放係數統一縮放各個網絡維度。通過結合這種新穎的縮放方法和 AutoML[5] 技術,作者得到了一系列模型,稱爲 EfficientNets,其效率最高可提升 10 倍(更小、更快)。
區別
第一處區別是在最開始降採樣沒有採用 maxpooling,而是換成了 stride 爲 2 的 conv。猜測是爲了減少信息丟失,尤其是對小模型來說,前期的底層特徵提取更重要。
第二處區別是第一次降採樣後的 channel 反而減少了,這一點我還沒搞懂。
第三處區別是有很多 stage 都採用了 5x5 的 conv。這是因爲對於 depthwise separable conv 來說,5x5 的計算量要比兩個 3x3 的計算量小。(坊間傳聞 large kernel is all you need,233333)
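對 depthwise separable conv 而言,一個 5x5 卷積的計算量約爲 $H \cdot W \cdot M \cdot (25 + N)$,兩個 3x3 卷積的計算量約爲 $H \cdot W \cdot M \cdot (18 + 2N)$,其中輸入特徵圖尺寸爲 (H, W, M),輸出特徵圖尺寸爲 (H, W, N)。因此只要 N > 7,5x5 的計算量就更小。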
第四處區別是降採樣後的特徵圖尺寸減半,但是 channel 沒有擴大兩倍;第 6 個 stage 特徵圖尺寸沒變,但是 channel 也擴大了。這些可能都是手工設計很難搞定的。我能想到的解釋是,MnasNet 在搜網絡結構的時候帶上了運算量的約束,可以理解成網絡在訓練的時候就考慮了 pruning(剪枝),因此纔會出現一些不規則的 channel 數。同時這帶來另外一個好處:網絡可以更好地訓練出更有意義的權重,因此這些搜出來的網絡結構的上限更高。
通過對模型效率的顯著改進,預計 EfficientNets 可能成爲未來計算機視覺任務的新基礎。
import tensorflow as tf
import math
# Number of target classes; sets the unit count of the classifier's final Dense layer.
NUM_CLASSES = 10
def swish(x):
    """Apply the Swish activation, ``x * sigmoid(x)``.

    Args:
        x: Input tensor (anything ``tf.nn.sigmoid`` accepts).

    Returns:
        The input multiplied by its own sigmoid.
    """
    gate = tf.nn.sigmoid(x)
    return x * gate
def round_filters(filters, multiplier, depth_divisor=8, min_depth=None):
    """Scale a channel count by the width multiplier and snap it to a multiple.

    The scaled value is rounded to the nearest multiple of ``depth_divisor``
    (never below ``min_depth``). If rounding would shrink the scaled value by
    more than 10%, one extra ``depth_divisor`` is added.

    Args:
        filters: Base number of filters.
        multiplier: Width multiplier. A falsy value (``None`` or ``0``) means
            "no scaling" and ``filters`` is returned unchanged, mirroring
            ``round_repeats``.
        depth_divisor: The result is a multiple of this value. Defaults to 8.
        min_depth: Lower bound for the result; falls back to
            ``depth_divisor`` when falsy.

    Returns:
        The adjusted filter count as an ``int`` (or ``filters`` itself when
        no multiplier is given).
    """
    if not multiplier:
        return filters
    lower_bound = min_depth or depth_divisor
    scaled = filters * multiplier
    # Adding half a divisor before the floor-division rounds to nearest.
    rounded = int(scaled + depth_divisor / 2) // depth_divisor * depth_divisor
    rounded = max(lower_bound, rounded)
    # Do not let rounding remove more than 10% of the requested width.
    if rounded < 0.9 * scaled:
        rounded += depth_divisor
    return int(rounded)
def round_repeats(repeats, multiplier):
    """Scale a layer-repeat count by the depth multiplier, rounding up.

    Args:
        repeats: Base number of repeats.
        multiplier: Depth multiplier; a falsy value leaves ``repeats`` as is.

    Returns:
        ``ceil(multiplier * repeats)`` as an ``int``, or ``repeats`` unchanged
        when ``multiplier`` is falsy.
    """
    if multiplier:
        scaled = multiplier * repeats
        return int(math.ceil(scaled))
    return repeats
class SEBlock(tf.keras.layers.Layer):
    """Squeeze-and-excitation gate that rescales the channels of its input.

    The input is globally average-pooled, passed through a 1x1 "reduce"
    convolution with swish, then a 1x1 "expand" convolution with sigmoid, and
    the resulting gate is multiplied back onto the input.

    Args:
        input_channels: Filter count of the expand convolution (the gate's
            channel count).
        ratio: Fraction of ``input_channels`` used for the reduce
            convolution; the reduced width is at least 1.
    """

    def __init__(self, input_channels, ratio=0.25):
        super().__init__()
        squeezed = int(input_channels * ratio)
        self.num_reduced_filters = squeezed if squeezed > 1 else 1
        self.pool = tf.keras.layers.GlobalAveragePooling2D()
        # Both convolutions are 1x1, stride 1, "same" padding.
        pointwise = dict(kernel_size=(1, 1), strides=1, padding="same")
        self.reduce_conv = tf.keras.layers.Conv2D(filters=self.num_reduced_filters,
                                                  **pointwise)
        self.expand_conv = tf.keras.layers.Conv2D(filters=input_channels,
                                                  **pointwise)

    def call(self, inputs, **kwargs):
        """Return ``inputs`` multiplied by the learned sigmoid gate."""
        gate = self.pool(inputs)
        # Insert two singleton axes at position 1 so the pooled tensor is
        # 4-D again and can be fed to the 1x1 convolutions.
        for _ in range(2):
            gate = tf.expand_dims(input=gate, axis=1)
        gate = swish(self.reduce_conv(gate))
        gate = tf.nn.sigmoid(self.expand_conv(gate))
        return inputs * gate
class MBConv(tf.keras.layers.Layer):
    """Mobile inverted-bottleneck block with squeeze-and-excitation.

    Pipeline: 1x1 expansion conv -> BN -> swish -> k x k depthwise conv ->
    BN -> swish -> SE -> 1x1 projection conv -> BN. When ``stride == 1`` and
    ``in_channels == out_channels`` the input is added back as a residual,
    after optional drop-connect on the block branch.

    Args:
        in_channels: Channel count of the block input.
        out_channels: Channel count produced by the projection conv.
        expansion_factor: Multiplier applied to ``in_channels`` for the
            expansion conv width.
        stride: Stride of the depthwise convolution.
        k: Side length of the square depthwise kernel.
        drop_connect_rate: Drop rate for the residual branch; a falsy value
            disables drop-connect.
    """

    def __init__(self, in_channels, out_channels, expansion_factor, stride, k, drop_connect_rate):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.stride = stride
        self.drop_connect_rate = drop_connect_rate
        expanded_channels = in_channels * expansion_factor
        self.conv1 = tf.keras.layers.Conv2D(filters=expanded_channels,
                                            kernel_size=(1, 1),
                                            strides=1,
                                            padding="same")
        self.bn1 = tf.keras.layers.BatchNormalization()
        self.dwconv = tf.keras.layers.DepthwiseConv2D(kernel_size=(k, k),
                                                      strides=stride,
                                                      padding="same")
        self.bn2 = tf.keras.layers.BatchNormalization()
        # NOTE(review): the reference EfficientNet sizes the SE bottleneck from
        # the block's input channels rather than the expanded channels; left
        # unchanged here so existing weight shapes stay valid -- confirm intent.
        self.se = SEBlock(input_channels=expanded_channels)
        self.conv2 = tf.keras.layers.Conv2D(filters=out_channels,
                                            kernel_size=(1, 1),
                                            strides=1,
                                            padding="same")
        self.bn3 = tf.keras.layers.BatchNormalization()
        # Drop-connect drops the whole residual branch per sample, so the
        # dropout mask is broadcast over height, width and channels instead of
        # being sampled independently for every element.
        self.dropout = tf.keras.layers.Dropout(rate=drop_connect_rate,
                                               noise_shape=(None, 1, 1, 1))

    def call(self, inputs, training=None, **kwargs):
        """Run the block; ``training`` is forwarded to BN and dropout."""
        x = swish(self.bn1(self.conv1(inputs), training=training))
        # Activation comes before squeeze-and-excitation, matching the
        # reference MBConv ordering (depthwise -> BN -> swish -> SE).
        x = swish(self.bn2(self.dwconv(x), training=training))
        x = self.se(x)
        x = self.bn3(self.conv2(x), training=training)
        if self.stride == 1 and self.in_channels == self.out_channels:
            if self.drop_connect_rate:
                x = self.dropout(x, training=training)
            # Plain tensor addition; avoids building a new Add layer per call.
            x = x + inputs
        return x
def build_mbconv_block(in_channels, out_channels, layers, stride, expansion_factor, k, drop_connect_rate):
    """Stack ``layers`` MBConv blocks into one ``tf.keras.Sequential`` stage.

    Only the first block takes ``in_channels`` and the requested ``stride``;
    every following block maps ``out_channels`` to ``out_channels`` with
    stride 1. All blocks share ``expansion_factor``, ``k`` and
    ``drop_connect_rate``.

    Returns:
        The ``tf.keras.Sequential`` holding the blocks (empty when
        ``layers`` is 0).
    """
    stage = tf.keras.Sequential()
    for index in range(layers):
        is_first = index == 0
        stage.add(MBConv(in_channels=in_channels if is_first else out_channels,
                         out_channels=out_channels,
                         expansion_factor=expansion_factor,
                         stride=stride if is_first else 1,
                         k=k,
                         drop_connect_rate=drop_connect_rate))
    return stage
class EfficientNet(tf.keras.Model):
    """EfficientNet classifier built from seven MBConv stages.

    Layout: 3x3 stride-2 stem conv -> BN -> swish -> block1..block7 ->
    1x1 head conv -> BN -> swish -> global average pool -> dropout ->
    softmax Dense layer.

    Args:
        width_coefficient: Multiplier applied to every base channel count
            through ``round_filters``.
        depth_coefficient: Multiplier applied to every base repeat count
            through ``round_repeats``.
        dropout_rate: Dropout rate applied before the final Dense layer.
        drop_connect_rate: Drop-connect rate handed to every MBConv block.
        num_classes: Number of units of the final softmax layer. Defaults to
            the module-level ``NUM_CLASSES``.
    """

    def __init__(self, width_coefficient, depth_coefficient, dropout_rate, drop_connect_rate=0.2,
                 num_classes=NUM_CLASSES):
        super().__init__()

        def scaled_stage(in_filters, out_filters, repeats, stride, expansion_factor, k):
            # Apply the width/depth scaling to the base stage settings and
            # build the corresponding stack of MBConv blocks.
            return build_mbconv_block(in_channels=round_filters(in_filters, width_coefficient),
                                      out_channels=round_filters(out_filters, width_coefficient),
                                      layers=round_repeats(repeats, depth_coefficient),
                                      stride=stride,
                                      expansion_factor=expansion_factor,
                                      k=k,
                                      drop_connect_rate=drop_connect_rate)

        self.conv1 = tf.keras.layers.Conv2D(filters=round_filters(32, width_coefficient),
                                            kernel_size=(3, 3),
                                            strides=2,
                                            padding="same")
        self.bn1 = tf.keras.layers.BatchNormalization()
        # Base settings per stage:
        # (in, out, repeats, stride, expansion_factor, kernel size).
        self.block1 = scaled_stage(32, 16, 1, 1, 1, 3)
        self.block2 = scaled_stage(16, 24, 2, 2, 6, 3)
        self.block3 = scaled_stage(24, 40, 2, 2, 6, 5)
        self.block4 = scaled_stage(40, 80, 3, 2, 6, 3)
        self.block5 = scaled_stage(80, 112, 3, 1, 6, 5)
        self.block6 = scaled_stage(112, 192, 4, 2, 6, 5)
        self.block7 = scaled_stage(192, 320, 1, 1, 6, 3)
        self.conv2 = tf.keras.layers.Conv2D(filters=round_filters(1280, width_coefficient),
                                            kernel_size=(1, 1),
                                            strides=1,
                                            padding="same")
        self.bn2 = tf.keras.layers.BatchNormalization()
        self.pool = tf.keras.layers.GlobalAveragePooling2D()
        self.dropout = tf.keras.layers.Dropout(rate=dropout_rate)
        self.fc = tf.keras.layers.Dense(units=num_classes,
                                        activation=tf.keras.activations.softmax)

    def call(self, inputs, training=None, mask=None):
        """Compute the softmax output of the final Dense layer for ``inputs``.

        ``training`` is forwarded to every batch-norm layer, MBConv stage and
        the dropout layer; ``mask`` is accepted but not used.
        """
        x = swish(self.bn1(self.conv1(inputs), training=training))
        stages = (self.block1, self.block2, self.block3, self.block4,
                  self.block5, self.block6, self.block7)
        for stage in stages:
            # Pass the mode explicitly instead of relying on implicit
            # propagation through the Keras call context.
            x = stage(x, training=training)
        x = swish(self.bn2(self.conv2(x), training=training))
        x = self.pool(x)
        x = self.dropout(x, training=training)
        return self.fc(x)
def get_efficient_net(width_coefficient, depth_coefficient, resolution, dropout_rate):
    """Create an ``EfficientNet``, build it for square RGB input and print its summary.

    Args:
        width_coefficient: Width multiplier forwarded to ``EfficientNet``.
        depth_coefficient: Depth multiplier forwarded to ``EfficientNet``.
        resolution: Height and width used for the build input shape.
        dropout_rate: Dropout rate forwarded to ``EfficientNet``.

    Returns:
        The built model. As a side effect, ``summary()`` output is printed.
    """
    model = EfficientNet(width_coefficient=width_coefficient,
                         depth_coefficient=depth_coefficient,
                         dropout_rate=dropout_rate)
    # Batch dimension left unspecified; three input channels.
    batch_input_shape = (None, resolution, resolution, 3)
    model.build(input_shape=batch_input_shape)
    model.summary()
    return model
def efficient_net_b0():
    """Build the B0 variant: width 1.0, depth 1.0, resolution 224, dropout 0.2."""
    return get_efficient_net(width_coefficient=1.0,
                             depth_coefficient=1.0,
                             resolution=224,
                             dropout_rate=0.2)
def efficient_net_b1():
    """Build the B1 variant: width 1.0, depth 1.1, resolution 240, dropout 0.2."""
    return get_efficient_net(width_coefficient=1.0,
                             depth_coefficient=1.1,
                             resolution=240,
                             dropout_rate=0.2)
def efficient_net_b2():
    """Build the B2 variant: width 1.1, depth 1.2, resolution 260, dropout 0.3."""
    return get_efficient_net(width_coefficient=1.1,
                             depth_coefficient=1.2,
                             resolution=260,
                             dropout_rate=0.3)
def efficient_net_b3():
    """Build the B3 variant: width 1.2, depth 1.4, resolution 300, dropout 0.3."""
    return get_efficient_net(width_coefficient=1.2,
                             depth_coefficient=1.4,
                             resolution=300,
                             dropout_rate=0.3)
def efficient_net_b4():
    """Build the B4 variant: width 1.4, depth 1.8, resolution 380, dropout 0.4."""
    return get_efficient_net(width_coefficient=1.4,
                             depth_coefficient=1.8,
                             resolution=380,
                             dropout_rate=0.4)
def efficient_net_b5():
    """Build the B5 variant: width 1.6, depth 2.2, resolution 456, dropout 0.4."""
    return get_efficient_net(width_coefficient=1.6,
                             depth_coefficient=2.2,
                             resolution=456,
                             dropout_rate=0.4)
def efficient_net_b6():
    """Build the B6 variant: width 1.8, depth 2.6, resolution 528, dropout 0.5."""
    return get_efficient_net(width_coefficient=1.8,
                             depth_coefficient=2.6,
                             resolution=528,
                             dropout_rate=0.5)
def efficient_net_b7():
    """Build the B7 variant: width 2.0, depth 3.1, resolution 600, dropout 0.5."""
    return get_efficient_net(width_coefficient=2.0,
                             depth_coefficient=3.1,
                             resolution=600,
                             dropout_rate=0.5)