Writing network structures in TensorFlow 2: ridiculously elegant

原創: [email protected]
時間: 2020/06/10
coding: google-research/kws_streaming/models/cnn.py

  • step1. Use argparse to set the hyperparameters (convolution kernels and so on); a minimal driver sketch follows the snippet below

    # sub parser for model settings
    subparsers = parser.add_subparsers(dest='model_name', help='NN model name')
    
    # CNN model settings
    parser_cnn = subparsers.add_parser('cnn')
    cnn.model_parameters(parser_cnn)
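
    To make the wiring concrete, here is a minimal, hypothetical driver
    (only the subparser wiring is from the repo; the flag values and the
    print statements are mine, and the real train script defines many more
    flags):

    import argparse
    from kws_streaming.models import cnn

    parser = argparse.ArgumentParser(description='KWS model settings')
    subparsers = parser.add_subparsers(dest='model_name', help='NN model name')

    # CNN model settings, exactly as in the snippet above
    parser_cnn = subparsers.add_parser('cnn')
    cnn.model_parameters(parser_cnn)  # attaches --cnn_filters, --cnn_act, ...

    flags = parser.parse_args(['cnn', '--cnn_filters', '64,64,64,64,128,64,128'])
    print(flags.model_name)   # 'cnn'
    print(flags.cnn_filters)  # still a string; parse() (step2) converts it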
    
  • step2. Build the CNN

    model = cnn.model(flags)
    
    # Using zip to unpack the hyperparameters; this step is a stroke of
    # genius and saves a lot of repeated calls
    for filters, kernel_size, activation, dilation_rate, strides in zip(
        parse(flags.cnn_filters), parse(flags.cnn_kernel_size),
        parse(flags.cnn_act), parse(flags.cnn_dilation_rate),
        parse(flags.cnn_strides)):
    

    The parse function:

    import ast

    def parse(text):
      """Safely parse a flag string into a list/tuple of Python literals."""
      if not text:
        return []

      # ast.literal_eval() is a very interesting function, and worth
      # spending a little time on: it safely evaluates a string back
      # into the Python data type it represents.
      res = ast.literal_eval(text)
      if isinstance(res, tuple):
        return res
      else:
        return [res]
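
    A few concrete examples of what parse() returns for the default flag
    strings above (these follow directly from ast.literal_eval semantics):

    parse('64,64,64,64,128,64,128')  # -> (64, 64, 64, 64, 128, 64, 128)
    parse('(3,3),(5,3)')             # -> ((3, 3), (5, 3))
    parse("'relu','relu'")           # -> ('relu', 'relu')
    parse('0.5')                     # -> [0.5]  (single literal, wrapped in a list)
    parse('')                        # -> []

    So each iteration of the zip above receives one layer's worth of
    settings, e.g. (64, (3, 3), 'relu', (1, 1), (1, 1)) for the first
    convolution.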
    

    Next comes building the network itself: a for loop stacks the 7
    convolution layers (ignore Stream() for now; each wrapped cell is just
    an ordinary convolution), followed by a few more layers:

    • Flatten()
    • Dropout()
    • Two FC layers, also built with a for loop
    • A final output layer
  • step3. Inspect the network structure

    logging.info(model.summary())
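
    Note: model.summary() prints to stdout and returns None, so the line
    above actually logs "None". If you want the summary text in the log,
    Keras's print_fn argument does the trick:

    lines = []
    model.summary(print_fn=lines.append)
    logging.info('\n'.join(lines))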
    
# coding=utf-8
# Copyright 2020 The Google Research Authors.

# cnn.py
"""CNN model with Mel spectrum."""
from kws_streaming.layers import modes
from kws_streaming.layers import speech_features
from kws_streaming.layers.compat import tf
from kws_streaming.layers.stream import Stream
from kws_streaming.models.utils import parse


def model_parameters(parser_nn):
  """Covolutional Neural Network(CNN) model parameters."""

  parser_nn.add_argument(
      '--cnn_filters',
      type=str,
      default='64,64,64,64,128,64,128',
      help='Number of output filters in the convolution layers',
  )
  parser_nn.add_argument(
      '--cnn_kernel_size',
      type=str,
      default='(3,3),(5,3),(5,3),(5,3),(5,2),(5,1),(10,1)',
      help='Heights and widths of the 2D convolution window',
  )
  parser_nn.add_argument(
      '--cnn_act',
      type=str,
      default="'relu','relu','relu','relu','relu','relu','relu'",
      help='Activation function in the convolution layers',
  )
  parser_nn.add_argument(
      '--cnn_dilation_rate',
      type=str,
      default='(1,1),(1,1),(2,1),(1,1),(2,1),(1,1),(2,1)',
      help='Dilation rate to use for dilated convolutions',
  )
  parser_nn.add_argument(
      '--cnn_strides',
      type=str,
      default='(1,1),(1,1),(1,1),(1,1),(1,1),(1,1),(1,1)',
      help='Strides of the convolution layers along the height and width',
  )
  parser_nn.add_argument(
      '--dropout1',
      type=float,
      default=0.5,
      help='Percentage of data dropped',
  )
  parser_nn.add_argument(
      '--units2',
      type=str,
      default='128,256',
      help='Number of units in the last set of hidden layers',
  )
  parser_nn.add_argument(
      '--act2',
      type=str,
      default="'linear','relu'",
      help='Activation function of the last set of hidden layers',
  )


def model(flags):
  """CNN model.

  It is based on paper:
  Convolutional Neural Networks for Small-footprint Keyword Spotting
  http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """

  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(
          flags, modes.Modes.STREAM_INTERNAL_STATE_INFERENCE),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self contained model, user need to feed raw audio only
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(
            net)

  net = tf.keras.backend.expand_dims(net)
  for filters, kernel_size, activation, dilation_rate, strides in zip(
      parse(flags.cnn_filters), parse(flags.cnn_kernel_size),
      parse(flags.cnn_act), parse(flags.cnn_dilation_rate),
      parse(flags.cnn_strides)):
    net = Stream(
        cell=tf.keras.layers.Conv2D(
            filters=filters,
            kernel_size=kernel_size,
            activation=activation,
            dilation_rate=dilation_rate,
            strides=strides))(
                net)

  net = Stream(cell=tf.keras.layers.Flatten())(net)
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(parse(flags.units2), parse(flags.act2)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  return tf.keras.Model(input_audio, net)

Summary:
The (for + zip) pattern saves a lot of repetitive code and makes the network construction a pleasure to read. With argparse supplying the network parameters on top, this is the most elegant CNN-building code I have seen, bar none.


# Attached below: my earlier CNN construction code, `2 Conv2d + linear + 2 FC`
# (TF1-style; needs `import math` and the TF1 `tf.*`/`tf.contrib` APIs)
import math
import tensorflow as tf


def create_cnn_model2(fingerprint_input, model_settings, model_size_info,
                      is_training):
  """Builds a model with 2 convolution layers followed by a linear layer and 
      a hidden fully-connected layer.
  model_size_info: defines the first and second convolution parameters in
      {number of conv features, conv filter height, width, stride in y,x dir.},
      followed by linear layer size and fully-connected layer size.
  """
  if is_training:
    dropout_prob = tf.placeholder(tf.float32, name='dropout_prob')
  input_frequency_size = model_settings['dct_coefficient_count']
  input_time_size = model_settings['spectrogram_length']
  fingerprint_4d = tf.reshape(fingerprint_input,
                              [-1, input_time_size, input_frequency_size, 1])

  first_filter_count = model_size_info[0] 
  first_filter_height = model_size_info[1]   #time axis
  first_filter_width = model_size_info[2]    #frequency axis
  first_filter_stride_y = model_size_info[3] #time axis
  first_filter_stride_x = model_size_info[4] #frequency_axis

  second_filter_count = model_size_info[5] 
  second_filter_height = model_size_info[6]   #time axis
  second_filter_width = model_size_info[7]    #frequency axis
  second_filter_stride_y = model_size_info[8] #time axis
  second_filter_stride_x = model_size_info[9] #frequency_axis
 
  linear_layer_size = model_size_info[10]
  fc_size = model_size_info[11]

  # first conv
  first_weights = tf.Variable(
      tf.truncated_normal(
          [first_filter_height, first_filter_width, 1, first_filter_count],
          stddev=0.01))
  first_bias = tf.Variable(tf.zeros([first_filter_count]))
  first_conv = tf.nn.conv2d(fingerprint_4d, first_weights, [
      1, first_filter_stride_y, first_filter_stride_x, 1
  ], 'VALID') + first_bias

  first_conv = tf.layers.batch_normalization(first_conv, training=is_training,
                 name='bn1')

  first_relu = tf.nn.relu(first_conv)
  if is_training:
    first_dropout = tf.nn.dropout(first_relu, dropout_prob)
  else:
    first_dropout = first_relu
  first_conv_output_width = math.ceil(
      (input_frequency_size - first_filter_width + 1) /
      first_filter_stride_x)
  first_conv_output_height = math.ceil(
      (input_time_size - first_filter_height + 1) /
      first_filter_stride_y)

  # second conv
  second_weights = tf.Variable(
      tf.truncated_normal(
          [second_filter_height, second_filter_width, first_filter_count, 
             second_filter_count],
          stddev=0.01))
  second_bias = tf.Variable(tf.zeros([second_filter_count]))
  second_conv = tf.nn.conv2d(first_dropout, second_weights, [
      1, second_filter_stride_y, second_filter_stride_x, 1
  ], 'VALID') + second_bias

  ##### update 2020/05/21 #######
  second_conv = tf.layers.batch_normalization(second_conv, training=is_training,
                  name='bn2')

  second_relu = tf.nn.relu(second_conv)
  if is_training:
    second_dropout = tf.nn.dropout(second_relu, dropout_prob)
  else:
    second_dropout = second_relu
  second_conv_output_width = math.ceil(
      (first_conv_output_width - second_filter_width + 1) /
      second_filter_stride_x)
  second_conv_output_height = math.ceil(
      (first_conv_output_height - second_filter_height + 1) /
      second_filter_stride_y)
  second_conv_element_count = int(
      second_conv_output_width*second_conv_output_height*second_filter_count)
  flattened_second_conv = tf.reshape(second_dropout,
                                    [-1, second_conv_element_count])

  # linear layer
  W = tf.get_variable('W', shape=[second_conv_element_count, linear_layer_size],
        initializer=tf.contrib.layers.xavier_initializer())
  b = tf.get_variable('b', shape=[linear_layer_size])
  flow = tf.matmul(flattened_second_conv, W) + b

  # first fc
  first_fc_output_channels = fc_size
  first_fc_weights = tf.Variable(
      tf.truncated_normal(
          [linear_layer_size, first_fc_output_channels], stddev=0.01))
  first_fc_bias = tf.Variable(tf.zeros([first_fc_output_channels]))
  first_fc = tf.matmul(flow, first_fc_weights) + first_fc_bias

  first_fc = tf.layers.batch_normalization(first_fc, training=is_training,
               name='bn3')


  first_fc = tf.nn.relu(first_fc)
  if is_training:
    final_fc_input = tf.nn.dropout(first_fc, dropout_prob)
  else:
    final_fc_input = first_fc
  label_count = model_settings['label_count']
  final_fc_weights = tf.Variable(
      tf.truncated_normal(
          [first_fc_output_channels, label_count], stddev=0.01))
  final_fc_bias = tf.Variable(tf.zeros([label_count]))
  final_fc = tf.matmul(final_fc_input, final_fc_weights) + final_fc_bias
  if is_training:
    return final_fc, dropout_prob
  else:
    return final_fc
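
For contrast, here is a rough tf.keras re-sketch of the same
`2 Conv2d + linear + 2 FC` topology. This is my own sketch, not code from
either repo: the model_size_info layout is assumed to match the TF1 function
above, and the dropout rate is a placeholder.

def create_cnn_model2_keras(model_settings, model_size_info, dropout_rate=0.5):
  """Keras sketch of create_cnn_model2 above, for comparison only."""
  time_size = model_settings['spectrogram_length']
  freq_size = model_settings['dct_coefficient_count']
  inputs = tf.keras.layers.Input(shape=(time_size, freq_size, 1))
  net = inputs

  # Two conv blocks (Conv2D -> BatchNorm -> ReLU -> Dropout), unpacking
  # model_size_info the same way as the TF1 code above.
  for i in range(2):
    count, height, width, stride_y, stride_x = model_size_info[5 * i:5 * i + 5]
    net = tf.keras.layers.Conv2D(
        count, (height, width), strides=(stride_y, stride_x),
        padding='valid')(net)
    net = tf.keras.layers.BatchNormalization()(net)
    net = tf.keras.layers.ReLU()(net)
    net = tf.keras.layers.Dropout(dropout_rate)(net)

  net = tf.keras.layers.Flatten()(net)
  net = tf.keras.layers.Dense(model_size_info[10])(net)  # linear layer
  net = tf.keras.layers.Dense(model_size_info[11])(net)  # first FC
  net = tf.keras.layers.BatchNormalization()(net)
  net = tf.keras.layers.ReLU()(net)
  net = tf.keras.layers.Dropout(dropout_rate)(net)
  net = tf.keras.layers.Dense(model_settings['label_count'])(net)
  return tf.keras.Model(inputs, net)

Keras's BatchNormalization and Dropout layers switch behavior via the
`training` argument at call time, which is exactly what eliminates all the
`if is_training:` boilerplate in the TF1 version.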