本部分剖析Caffe中Net::Backward()函數，即反向傳播計算過程。從LeNet網絡角度出發，且調試網絡爲訓練網絡，共9層網絡。

入口信息

Net::Backward()函數中調用BackwardFromTo函數，從網絡最後一層到網絡第一層反向調用每個網絡層的Backward。

// Runs the backward pass over layers start, start-1, ..., end (inclusive),
// i.e. from the later layer index down to the earlier one.
// Layers whose layer_need_backward_ flag is false are skipped entirely.
void Net<Dtype>::BackwardFromTo(int start, int end) {
  for (int layer_id = start; layer_id >= end; --layer_id) {
    if (!layer_need_backward_[layer_id]) {
      continue;
    }
    layers_[layer_id]->Backward(top_vecs_[layer_id],
                                bottom_need_backward_[layer_id],
                                bottom_vecs_[layer_id]);
    // Optional per-layer diagnostics after the gradients were computed.
    if (debug_info_) {
      BackwardDebugInfo(layer_id);
    }
  }
}

第九層 SoftmaxWithLossLayer

代碼實現如下：

// Backward pass of SoftmaxWithLossLayer on the GPU (abridged walkthrough
// version; the kernel body is expanded inline for illustration and the
// snippet is not compilable as shown).
//
// The layer has no learnable parameters, so only bottom[0]'s diff is written:
// it starts as a copy of the softmax probabilities, the entry of the true
// class of every sample is decremented by 1, and the result is scaled by the
// loss weight divided by the normalizer.
//
// top:            top[0] holds the scalar loss; its diff[0] is the loss weight.
// propagate_down: not consulted in this abridged version.
// bottom:         bottom[0] = scores (its diff is written), bottom[1] = labels.
template <typename Dtype>
void SoftmaxWithLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {

    // bottom_diff shape: 64*10
    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
    // prob_data shape: 64*10
    const Dtype* prob_data = prob_.gpu_data();
    // top_data shape: (1)
    const Dtype* top_data = top[0]->gpu_data();
    // Copy the probabilities predicted by the softmax (prob_) into bottom_diff.
    caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff);
    // label shape: 64*1
    const Dtype* label = bottom[1]->gpu_data();
    // dim = 640 / 64 = 10
    const int dim = prob_.count() / outer_num_;
    // nthreads = 64 * 1 = 64
    const int nthreads = outer_num_ * inner_num_;
    // Since this memory is never used for anything else,
    // we use it to avoid allocating new GPU memory.
    Dtype* counts = prob_.mutable_gpu_diff();

    // The kernel subtracts 1 from the bottom_diff entry (currently the
    // predicted probability) that belongs to the correct class (label) of
    // each sample; all other entries are left unchanged.
    SoftmaxLossBackwardGPU<Dtype><<<CAFFE_GET_BLOCKS(nthreads),
        CAFFE_CUDA_NUM_THREADS>>>(nthreads, top_data, label, bottom_diff,
        outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
    // Expanded kernel begins (simplified relative to the real kernel)
    __global__ void SoftmaxLossBackwardGPU(...) {
      CUDA_KERNEL_LOOP(index, nthreads) {
        const int label_value = static_cast<int>(label[index]);
        bottom_diff[index * dim + label_value] -= 1;
        // Mark this sample as having contributed to the gradient.
        counts[index] = 1;
      }
    }
    // Expanded kernel ends

    // Scale gradient: multiply by the loss weight (usually 1 or 0) and
    // normalize (divide by 64 in this walkthrough).
    Dtype valid_count = -1;
    const Dtype loss_weight = top[0]->cpu_diff()[0];
    if (normalize_) {
      // The kernel wrote 1 into counts[] for every processed sample, so the
      // absolute sum of counts is the number of contributing samples.
      caffe_gpu_asum(nthreads, counts, &valid_count);
      // bottom_diff is device memory, hence the GPU scaling routine.
      caffe_gpu_scal(prob_.count(), loss_weight / valid_count, bottom_diff);
    } else {
      caffe_gpu_scal(prob_.count(), loss_weight / outer_num_, bottom_diff);
    }

}

說明：

SoftmaxWithLossLayer是沒有學習參數的，因此不需要對該層的參數做調整，只需要計算bottom_diff（根據反向傳播算法的鏈式求導，bottom_diff是loss對該層輸入（即上一層輸出）的導數，計算它是爲了進一步計算並調整上一層的權值）
以上代碼核心部分在SoftmaxLossBackwardGPU。該函數將bottom_diff（此時爲每個類的預測概率）對應的正確類別（label）的概率值-1，其他數據沒變。

第八層 InnerProduct

// Backward pass of the fully connected (inner product) layer on the GPU.
// Produces up to three gradients, each guarded by its own flag:
//   1. weight gradient  -> blobs_[0] diff (accumulated)
//   2. bias gradient    -> blobs_[1] diff (accumulated)
//   3. input gradient   -> bottom[0] diff (overwritten)
template <typename Dtype>
void InnerProductLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  const Dtype kOne = static_cast<Dtype>(1);
  const Dtype kZero = static_cast<Dtype>(0);

  // 1. Gradient with respect to the weights, built from top_diff and
  //    bottom_data. The trailing scale factor of 1 keeps whatever is already
  //    stored in the weight diff, so gradients from several iter_size passes
  //    inside one solver iteration add up instead of being reset.
  if (this->param_propagate_down_[0]) {
    const Dtype* out_grad = top[0]->gpu_diff();
    const Dtype* in_data = bottom[0]->gpu_data();
    Dtype* weight_grad = this->blobs_[0]->mutable_gpu_diff();
    caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_, kOne,
        out_grad, in_data, kOne, weight_grad);
  }

  // 2. Gradient with respect to the bias, built from top_diff and the bias
  //    multiplier vector; accumulated in the same way as the weights.
  if (bias_term_ && this->param_propagate_down_[1]) {
    const Dtype* out_grad = top[0]->gpu_diff();
    const Dtype* multiplier = bias_multiplier_.gpu_data();
    Dtype* bias_grad = this->blobs_[1]->mutable_gpu_diff();
    caffe_gpu_gemv<Dtype>(CblasTrans, M_, N_, kOne, out_grad,
        multiplier, kOne, bias_grad);
  }

  // 3. Gradient with respect to the layer input, built from top_diff and the
  //    weights. The trailing scale factor of 0 overwrites bottom_diff.
  if (propagate_down[0]) {
    const Dtype* out_grad = top[0]->gpu_diff();
    const Dtype* weights = this->blobs_[0]->gpu_data();
    Dtype* in_grad = bottom[0]->mutable_gpu_diff();
    caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, K_, N_, kOne,
        out_grad, weights, kZero, in_grad);
  }
}

第七層 ReLU

cpu代碼分析如下，注，該層沒有參數，只需對輸入求導

// Backward pass of ReLU on the CPU. The layer has no parameters, so the only
// gradient produced is the one with respect to the input (bottom[0] diff).
// Nothing is written when propagate_down[0] is false.
void ReLULayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  if (!propagate_down[0]) {
    return;
  }

  const Dtype* in_data = bottom[0]->cpu_data();
  const Dtype* out_grad = top[0]->cpu_diff();
  Dtype* in_grad = bottom[0]->mutable_cpu_diff();
  const int num_elements = bottom[0]->count();
  Dtype negative_slope = this->layer_param_.relu_param().negative_slope();

  // Element-wise: the incoming gradient passes through with factor 1 where
  // the input was positive and with factor negative_slope where it was <= 0.
  for (int idx = 0; idx < num_elements; ++idx) {
    const bool is_positive = in_data[idx] > 0;
    const bool is_non_positive = in_data[idx] <= 0;
    in_grad[idx] = out_grad[idx] * (is_positive
        + negative_slope * is_non_positive);
  }
}

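公式推導

設ReLU的輸入爲 $x_i$（bottom_data），輸出爲 $y_i=\max(0,x_i)+\text{negative\_slope}\cdot\min(0,x_i)$，則

$$\frac{\partial L}{\partial x_i}=\frac{\partial L}{\partial y_i}\cdot\begin{cases}1, & x_i>0\\ \text{negative\_slope}, & x_i\le 0\end{cases}$$

即 bottom_diff[i] = top_diff[i] × (1 或 negative_slope)，與上面的代碼一致。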

第五層 Pooling

Maxpooling的cpu代碼分析如下，注，該層沒有參數，只需對輸入求導

// Backward pass of max pooling on the CPU (article excerpt: the "..." line
// below marks code the author left out, so this is not compilable as shown).
// The layer has no parameters; only bottom[0]'s diff is produced, by adding
// each top gradient onto the bottom position recorded in the mask.
void PoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {

  const Dtype* top_diff = top[0]->cpu_diff();
  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
  // Zero-initialize bottom_diff, because the loop below accumulates with +=.
  caffe_set(bottom[0]->count(), Dtype(0), bottom_diff);
  const int* mask = NULL;  // suppress warnings about uninitialized variables

  ...
    // Per the author's note: during the forward pass max_idx_ stored, for
    // every top_data element, the position inside the same feature map of the
    // bottom_data element it was taken from.
    mask = max_idx_.cpu_data();
    // Main loop: visit every element of top in (N, C, H, W) order.
    for (int n = 0; n < top[0]->num(); ++n) {
      for (int c = 0; c < channels_; ++c) {
        for (int ph = 0; ph < pooled_height_; ++ph) {
          for (int pw = 0; pw < pooled_width_; ++pw) {
            // Index of the pooled output inside the current feature map.
            const int index = ph * pooled_width_ + pw;
            const int bottom_index = mask[index];
            // Route the gradient to the input position recorded in the mask.
            bottom_diff[bottom_index] += top_diff[index];
          }
        }
        // Advance all three pointers by offset(0, 1) of their blob, so that
        // index and bottom_index stay relative to the current feature map.
        bottom_diff += bottom[0]->offset(0, 1);
        top_diff += top[0]->offset(0, 1);
        mask += top[0]->offset(0, 1);

      }
    }

}

第四層 Convolution

// Backward pass of the convolution layer on the CPU (article version: the
// weight/input gradient helpers are expanded inline, so this is illustrative
// rather than runnable code).
// For every top/bottom pair it computes, per sample of the batch, the bias
// gradient, the weight gradient and the gradient with respect to the input.
void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  const Dtype* weight = this->blobs_[0]->cpu_data();
  Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
  for (int i = 0; i < top.size(); ++i) {
    const Dtype* top_diff = top[i]->cpu_diff();
    const Dtype* bottom_data = bottom[i]->cpu_data();
    Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
    // Bias gradient, if necessary.
    if (this->bias_term_ && this->param_propagate_down_[1]) {
      Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff();
      // Compute the bias gradient contribution of every sample in the batch.
      for (int n = 0; n < this->num_; ++n) {
        this->backward_cpu_bias(bias_diff, top_diff + n * this->top_dim_);
      }
    }
    if (this->param_propagate_down_[0] || propagate_down[i]) {
      // For every sample in the batch. The helper functions for the weight
      // and input gradients have been expanded inline here (not runnable).
      // NOTE(review): as written both gemm calls run whenever either flag is
      // set; the original helpers are presumably guarded separately - confirm.
      for (int n = 0; n < this->num_; ++n) {

        // gradient w.r.t. weight. Note that we will accumulate diffs.
        // top_diff(50*64) * bottom_data(500*64, transposed) = weight_diff(50*500)
        // NOTE(review): the 500*64 operand presumably stands for the im2col
        // column buffer rather than the raw bottom blob - confirm.
        // The scale factor (Dtype)1. applied to weight_diff keeps the values
        // already stored there, so the gradients of several iter_size passes
        // within one solver iteration are accumulated instead of reset.
        caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, conv_out_channels_ / group_,
          kernel_dim_, conv_out_spatial_dim_,
          (Dtype)1., top_diff + n * this->top_dim_, bottom_data + n * this->bottom_dim_,
          (Dtype)1., weight_diff);

        // gradient w.r.t. bottom data, if necessary.
        // weight(50*500, transposed) * top_diff(50*64) = bottom_diff(500*64)
        // The scale factor (Dtype)0. overwrites the destination.
        caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, kernel_dim_,
          conv_out_spatial_dim_, conv_out_channels_ ,
          (Dtype)1., weight, top_diff + n * this->top_dim_,
          (Dtype)0., bottom_diff + n * this->bottom_dim_);

      }
    }
  }
}

第四層的bottom維度(N,C,H,W)=(64,20,12,12)，top的維度(N,C,H,W)=(64,50,8,8)，由於每個樣本單獨處理，所以只需要關注(C,H,W)的維度，分別爲(20,12,12)和(50,8,8)
根據（Caffe）卷積的實現，該層可以寫成矩陣相乘的形式 $Weight\_data \times Bottom\_data^{T} = Top\_data$
Weight_data的維度爲Cout×(C∗K∗K)=50×500
Bottom_data的維度爲(H∗W)×(C∗K∗K)=64×500，64爲8∗8個卷積核的位置，500=C∗K∗K=20∗5∗5
Top_data的維度爲Cout×(H∗W)=50×64（與上式及代碼註釋中的top_diff(50*64)一致）
寫成矩陣表示後，從某種角度上與全連接層（也是表示成矩陣相乘）相同，因此，可以借鑑全連接層的推導。

【caffe源碼研究】第四章：完整案例源碼篇(5) ：LeNet反向過程

入口信息

第九層 SoftmaxWithLossLayer

第八層 InnerProduct

第七層 ReLU

第五層 Pooling

第四層 Convolution

【caffe源碼研究】第三章：源碼篇(5) ：Net

【caffe源碼研究】第四章：完整案例源碼篇(5) ：LeNet反向過程

【caffe源碼研究】第四章：完整案例源碼篇(2) ：LeNet初始化訓練網絡

【caffe源碼研究】第四章：完整案例源碼篇(1) ：LeNetSolver初始化

【caffe源碼研究】第三章：源碼篇(12) ：激活函數層

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結