論文:Segmentation-Based Deep-Learning Approach for Surface-Defect Detection
環境:python3.5, cuda10 ,cudnn7.6 , tensorflow1.13
參考:https://blog.csdn.net/jinxiaonian11/article/details/94316567
記錄原因:
- SegDecNet爲two-stage網絡
- 熟悉tensorflow同步的數據並行方法
數據同步的原理
每個gpu中都存有模型,並共享所有的變量,分別在不同的gpu中計算不同batch的數據,得到loss_1,loss_2,…,loss_num_gpus,最後將所有loss放到cpu中進行mean,最後update。
代碼
def average_gradients(self, tower_grads):
    """Average gradients across GPU towers (synchronous data parallelism).

    Args:
        tower_grads: list (length = number of towers) of lists of
            (gradient, variable) tuples, as returned by
            Optimizer.compute_gradients on each tower.

    Returns:
        A single list of (gradient, variable) tuples in which each
        gradient is the mean of that variable's gradients over all towers.
    """
    average_grads = []
    # zip(*tower_grads) regroups the entries variable-by-variable:
    # each grad_and_vars is ((grad_gpu0, var), (grad_gpu1, var), ...).
    for grad_and_vars in zip(*tower_grads):
        # Stack per-tower gradients along a new leading axis, then average
        # over it. BUGFIX: skip None gradients — compute_gradients returns
        # None for variables the loss does not depend on, and the original
        # tf.expand_dims(None, 0) would raise.
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars if g is not None]
        if not grads:
            continue
        grad = tf.reduce_mean(tf.concat(grads, 0), 0)
        # All towers share the same variable object; take it from tower 0.
        v = grad_and_vars[0][1]
        average_grads.append((grad, v))
    return average_grads
def build_model(self):
    """Build the two-stage SegDecNet graph with synchronous multi-GPU
    data parallelism.

    Each GPU holds a full replica of the model (variables are shared via
    scope reuse and pinned to the CPU parameter server), computes losses
    on its own slice of the input batch, and the per-tower gradients are
    averaged on the CPU before a single apply_gradients update.
    """

    def SegmentNet(input, scope, is_training, reuse=None):
        """Stage 1: fully-convolutional segmentation network.

        Returns (features, logits_pixel, mask); the mask is 1/8 of the
        input resolution because of the three 2x2 max-pool layers.
        NOTE(review): is_training is currently unused — batch_norm picks
        up its default mode; confirm whether it should be threaded through.
        """
        with tf.variable_scope(scope, reuse=reuse):
            with slim.arg_scope([slim.conv2d],
                                padding='SAME',
                                activation_fn=tf.nn.relu,
                                normalizer_fn=slim.batch_norm):
                net = slim.conv2d(input, 32, [5, 5], scope='conv1')
                net = slim.conv2d(net, 32, [5, 5], scope='conv2')
                net = slim.max_pool2d(net, [2, 2], [2, 2], scope='pool1')
                net = slim.conv2d(net, 64, [5, 5], scope='conv3')
                net = slim.conv2d(net, 64, [5, 5], scope='conv4')
                net = slim.conv2d(net, 64, [5, 5], scope='conv5')
                net = slim.max_pool2d(net, [2, 2], [2, 2], scope='pool2')
                net = slim.conv2d(net, 64, [5, 5], scope='conv6')
                net = slim.conv2d(net, 64, [5, 5], scope='conv7')
                net = slim.conv2d(net, 64, [5, 5], scope='conv8')
                net = slim.conv2d(net, 64, [5, 5], scope='conv9')
                net = slim.max_pool2d(net, [2, 2], [2, 2], scope='pool3')
                net = slim.conv2d(net, 1024, [15, 15], scope='conv10')
                features = net
                # 1x1 conv without activation: per-pixel defect logits.
                net = slim.conv2d(net, 1, [1, 1], activation_fn=None, scope='conv11')
                logits_pixel = net
                # Sigmoid turns logits into a [0, 1] defect probability mask.
                mask = tf.sigmoid(net, name=None)
        return features, logits_pixel, mask

    def DecisionNet(feature, mask, scope, is_training, num_classes=2, reuse=None):
        """Stage 2: per-image defect/no-defect classifier.

        Consumes the stage-1 features concatenated with the mask and
        returns (logits, argmax class output).
        """
        with tf.variable_scope(scope, reuse=reuse):
            with slim.arg_scope([slim.conv2d],
                                padding='SAME',
                                activation_fn=tf.nn.relu,
                                normalizer_fn=slim.batch_norm):
                net = tf.concat([feature, mask], axis=3)
                net = slim.max_pool2d(net, [2, 2], [2, 2], scope='pool1')
                net = slim.conv2d(net, 8, [5, 5], scope='conv1')
                net = slim.max_pool2d(net, [2, 2], [2, 2], scope='pool2')
                net = slim.conv2d(net, 16, [5, 5], scope='conv2')
                net = slim.max_pool2d(net, [2, 2], [2, 2], scope='pool3')
                net = slim.conv2d(net, 32, [5, 5], scope='conv3')
                # Global average/max pooling over the spatial axes, applied
                # to both the conv features and the raw mask.
                vector1 = math_ops.reduce_mean(net, [1, 2], name='pool4', keepdims=True)
                vector2 = math_ops.reduce_max(net, [1, 2], name='pool5', keepdims=True)
                vector3 = math_ops.reduce_mean(mask, [1, 2], name='pool6', keepdims=True)
                vector4 = math_ops.reduce_max(mask, [1, 2], name='pool7', keepdims=True)
                vector = tf.concat([vector1, vector2, vector3, vector4], axis=3)
                vector = tf.squeeze(vector, axis=[1, 2])
                logits = slim.fully_connected(vector, num_classes, activation_fn=None)
                output = tf.argmax(logits, axis=1, name='output')
        return logits, output

    # Per-tower gradient collections (total / segmentation / decision).
    tower_grads = []
    tower_grads_seg = []
    tower_grads_dec = []
    # Graph-wide inputs; each tower slices out its own sub-batch below.
    Image = tf.placeholder(tf.float32, shape=(None, IMAGE_SIZE[0], IMAGE_SIZE[1], 1), name='Image')
    # BUGFIX: integer division — placeholder dims must be ints, and the
    # segmentation label is 1/8 resolution (three 2x2 pools in SegmentNet).
    PixelLabel = tf.placeholder(tf.float32, shape=(None, IMAGE_SIZE[0] // 8, IMAGE_SIZE[1] // 8, 1), name='PixelLabel')
    # BUGFIX: shape=(None) is just None (fully unknown rank); (None,)
    # declares the intended 1-D vector of per-image class labels.
    Label = tf.placeholder(tf.int32, shape=(None,), name='Label')
    opt = tf.train.GradientDescentOptimizer(self.__learn_rate)

    # Accumulators for monitoring/visualization. BUGFIX: initialized once
    # BEFORE the tower loop — the original reset them inside the loop, so
    # only the last GPU's losses and outputs ever survived.
    loss_pixel_global = 0
    loss_class_global = 0
    loss_total_global = 0
    mask_lst = []
    logits_pixel_lst = []

    with tf.variable_scope(tf.get_variable_scope()):
        # At test time only a single tower is built, regardless of num_gpus.
        itera = self.__num_gpus if self.__mode == 'training' else 1
        for i in range(itera):
            # Build tower i on GPU i; variable-creating ops are pinned to
            # the CPU parameter server by assign_to_device so all towers
            # share one copy of the weights.
            with tf.device(utils.assign_to_device('/gpu:{}'.format(i), ps_device='/cpu:0')):
                # Slice out this tower's sub-batch.
                _Image = Image[i * self.__batch_size:(i + 1) * self.__batch_size]
                _PixelLabel = PixelLabel[i * self.__batch_size:(i + 1) * self.__batch_size]
                _Label = Label[i * self.__batch_size:(i + 1) * self.__batch_size]
                features, logits_pixel, mask = SegmentNet(_Image, 'segment', self.is_training)
                logits_class, output_class = DecisionNet(features, mask, 'decision', self.is_training)
                logits_pixel = tf.reshape(logits_pixel, [self.__batch_size, -1])
                PixelLabel_reshape = tf.reshape(_PixelLabel, [self.__batch_size, -1])
                mask_lst.append(mask)
                logits_pixel_lst.append(logits_pixel)
                # Two-stage losses: pixel-wise sigmoid cross-entropy for
                # segmentation, softmax cross-entropy for the decision net.
                loss_pixel = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits_pixel, labels=PixelLabel_reshape))
                loss_class = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits_class, labels=_Label))
                loss_total = loss_pixel + loss_class
                loss_pixel_global += loss_pixel
                loss_class_global += loss_class
                loss_total_global += loss_total
                # Variable subsets for stage-wise training.
                train_var_list = [v for v in tf.trainable_variables()]
                train_segment_var_list = [v for v in tf.trainable_variables() if 'segment' in v.name]
                train_decision_var_list = [v for v in tf.trainable_variables() if 'decision' in v.name]
                # Per-tower gradients (minimize() split into compute/apply).
                grads_pixel = opt.compute_gradients(loss_pixel, train_segment_var_list)
                grads_class = opt.compute_gradients(loss_class, train_decision_var_list)
                grads_total = opt.compute_gradients(loss_total, train_var_list)
                # Record each tower's gradients for the later CPU-side mean.
                tower_grads.append(grads_total)
                tower_grads_dec.append(grads_class)
                tower_grads_seg.append(grads_pixel)
                # Share variables with the next tower.
                tf.get_variable_scope().reuse_variables()
    # Average the gradients across towers.
    grads = self.average_gradients(tower_grads)
    grads_seg = self.average_gradients(tower_grads_seg)
    grads_dec = self.average_gradients(tower_grads_dec)
    # One synchronized parameter update per training step.
    train_op = opt.apply_gradients(grads)
    train_op_seg = opt.apply_gradients(grads_seg)
    train_op_dec = opt.apply_gradients(grads_dec)
    init_op = tf.global_variables_initializer()
    # Tensors from the last-built tower (used for visualization only).
    self.Image = Image
    self.PixelLabel = PixelLabel
    self.Label = Label
    self.features = features
    self.logits_class = logits_class
    self.output_class = output_class
    # Summed losses / per-tower outputs over all num_gpus towers
    # (num_gpus * batch_size samples per step).
    self.loss_pixel = loss_pixel_global
    self.loss_class = loss_class_global
    self.loss_total = loss_total_global
    self.logits_pixel = logits_pixel_lst
    self.mask = mask_lst
    self.init_op = init_op
    self.train_op = train_op
    self.train_op_seg = train_op_seg
    self.train_op_dec = train_op_dec
utils.py中的assign_to_device方法
# Op types that create/hold variables and therefore belong on the
# parameter-server device.
PS_OPS = ['Variable', 'VariableV2', 'AutoReloadVariable']

def assign_to_device(device, ps_device='/cpu:0'):
    """Build a device-placement function for use with tf.device().

    Variable-creating ops are placed on `ps_device` (the parameter
    server, CPU by default) so every tower shares a single copy of the
    weights; all other ops run on `device` (the worker GPU).
    """
    def _assign(op):
        # tf.device may hand us either a raw NodeDef or a full Operation.
        node_def = op if isinstance(op, tf.NodeDef) else op.node_def
        return ps_device if node_def.op in PS_OPS else device
    return _assign
opt.compute和opt.apply_gradient
前者用來計算梯度,後者用來更新對應的variable,兩者配合使用,和optimizer.minimize作用相同。
def minimize(self, loss, global_step=None, var_list=None, name=None):
    """Combine compute_gradients and apply_gradients into one step.

    Args:
        loss: the scalar loss tensor to minimize.
        global_step: optional counter incremented by apply_gradients.
        var_list: optional list of variables to differentiate w.r.t.
        name: optional name for the returned training op.

    Returns:
        The op produced by apply_gradients.

    Raises:
        ValueError: if no variable receives a gradient.
    """
    # BUGFIX: the tf.train.Optimizer method is compute_gradients (plural);
    # it returns [(gradient, variable), (gradient, variable), ...].
    grads_and_vars = self.compute_gradients(loss, var_list=var_list)
    vars_with_grad = [v for g, v in grads_and_vars if g is not None]
    if not vars_with_grad:
        raise ValueError(
            "No gradients provided for any variable, check your graph "
            "for ops that do not support gradients.")
    return self.apply_gradients(grads_and_vars, global_step=global_step, name=name)
average_gradients方法
tower_grads的list長度爲gpu個數
tower_grads[i]代表第i個gpu的所有變量和對應的梯度信息,是元組的list,每個元組的形式是(tf.Tensor (gradient for …), tf.Variable(…)),即每個variable以及對應的梯度。所以命名爲“梯度塔”。
注意點:
- 論文的其他部分也要有對應的小修改
- build_model()最後設置了一系列的成員變量,不管是基於哪個範圍的,都是用於展示作用的(我是用來可視化過程loss和過程mask圖)
總的來說,相對於源碼的改動就是寫了個循環,然後將minimize拆分,最後計算出一個num_gpus*batchsize的數據量後,算一個mean,然後再利用apply_gradients更新參數的值。