本部分剖析Caffe中Net::Backward()函數，即反向傳播計算過程。從LeNet網絡角度出發，且調試網絡為訓練網絡，共9層網絡。具體網絡層信息見 (Caffe，LeNet)初始化訓練網絡(三) 第2部分
1 入口信息
void Net<Dtype>::BackwardFromTo(int start, int end) {
for (int i = start; i >= end; --i) {
if (layer_need_backward_[i]) {
top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]);
if (debug_info_) { BackwardDebugInfo(i); }
2 第九層SoftmaxWithLossLayer
2.1 代碼分析
void SoftmaxWithLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
// bottom_diff shape:64*10
Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
// prob_data shape:64*10
const Dtype* prob_data = prob_.gpu_data();
// top_data shape:(1)
const Dtype* top_data = top[0]->gpu_data();
// 將Softmax層預測的結果prob復制到bottom_diff中
caffe_gpu_memcpy(prob_.count() * sizeof(Dtype), prob_data, bottom_diff);
// label shape:64*1
const Dtype* label = bottom[1]->gpu_data();
// dim = 640 / 64 = 10
const int dim = prob_.count() / outer_num_;
// nthreads = 64 / 1 = 64
const int nthreads = outer_num_ * inner_num_;
// Since this memory is never used for anything else,
// we use to to avoid allocating new GPU memory.
Dtype* counts = prob_.mutable_gpu_diff();
// 該函數將bottom_diff(此時為每個類的預測概率)中對應正確類別(label)的概率值減1，其他數據不變。見公式推導。
CAFFE_CUDA_NUM_THREADS>>>(nthreads, top_data, label, bottom_diff,
outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
// 代碼展開開始,代碼有修改
__global__ void SoftmaxLossBackwardGPU(...) {
CUDA_KERNEL_LOOP(index, nthreads) {
const int label_value = static_cast<int>(label[index]);
bottom_diff[index * dim + label_value] -= 1;
counts[index] = 1;
// 代碼展開結(jié)束
Dtype valid_count = -1;
// 注意top[0]->cpu_diff()[0]為loss的權值，對該權值(一般為1或者0)歸一化(除以64)
const Dtype loss_weight = top[0]->cpu_diff()[0] /
get_normalizer(normalization_, valid_count);
caffe_gpu_scal(prob_.count(), loss_weight , bottom_diff);
- SoftmaxWithLossLayer是沒有學習參數的(見前向計算(五))，因此不需要對該層的參數做調整，只需要計算bottom_diff(理解反向傳播算法的鏈式求導：求bottom_diff，即loss對上一層的輸出求導，是為了進一步計算調整上一層權值)
- 以上代碼核心部分在SoftmaxLossBackwardGPU。該函數將bottom_diff(此時為每個類的預測概率)中對應正確類別(label)的概率值減1，其他數據不變。
2.2 公式推導
所以$\frac{d loss}{d\mathbf{z}}$結果如下，其中$f(z_i)$為Softmax輸出的第$i$類預測概率，$y$為正確類別(label):
$$
\frac{\partial loss}{\partial z_i}=
\left\{
\begin{aligned}
& f(z_y)-1, & z_i = z_y \\
& f(z_i), & z_i \ne z_y
\end{aligned}
\right.
$$
3 第八層InnerProduct
3.1 代碼分析
template <typename Dtype>
void InnerProductLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down,
const vector<Blob<Dtype>*>& bottom) {
if (this->param_propagate_down_[0]) {
const Dtype* top_diff = top[0]->gpu_diff();
const Dtype* bottom_data = bottom[0]->gpu_data();
// Gradient with respect to weight
caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1.,
top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_gpu_diff());
// 對偏置求偏導top_diff*bias=blobs_diff
if (bias_term_ && this->param_propagate_down_[1]) {
const Dtype* top_diff = top[0]->gpu_diff();
// Gradient with respect to bias
caffe_gpu_gemv<Dtype>(CblasTrans, M_, N_, (Dtype)1., top_diff,
bias_multiplier_.gpu_data(), (Dtype)1.,
if (propagate_down[0]) {
const Dtype* top_diff = top[0]->gpu_diff();
// Gradient with respect to bottom data
caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1.,
top_diff, this->blobs_[0]->gpu_data(), (Dtype)0.,
3.2 公式推導
1. 對上一層輸出求偏導
$\frac{\partial loss}{\partial u_j}$存放在ip2層的bottom_blob_diff($64 \times 500$)中，計算公式如下，其中$\frac{\partial loss}{\partial z_k}$存放在top_blob_diff($64 \times 10$)中:
$$
\frac{\partial z_k}{\partial u_j} = \frac{\partial \sum_{j=1}^{500}{w_{kj}u_j}}{\partial u_j}=w_{kj}
$$
$$
\frac{\partial loss}{\partial u_j}=\sum_{k=1}^{10}{\frac{\partial loss}{\partial z_k}\frac{\partial z_k}{\partial u_j}}=\sum_{k=1}^{10}{\frac{\partial loss}{\partial z_k}w_{kj}}
$$
$$
\frac{\partial loss}{\partial u_j}=\frac{\partial loss}{\partial \mathbf{z^T}} \cdot \mathbf{w_{j}}
$$
進一步，寫成矩陣的形式，其中$\mathbf{u}$為500維，$\mathbf{z}$為10維，$\mathbf{W}$為$10 \times 500$:
$$
\frac{\partial loss}{\partial \mathbf{u^T}}=\frac{\partial loss}{\partial \mathbf{z^T}} \cdot \mathbf{W}
$$
再進一步，考慮到一個batch有64個樣本，表達式可以寫成如下形式，其中$\mathbf{U}$為$64 \times 500$；$\mathbf{Z}$為$64 \times 10$；$\mathbf{W}$為$10 \times 500$:
$$
\frac{\partial loss}{\partial \mathbf{U}}=\frac{\partial loss}{\partial \mathbf{Z}} \cdot \mathbf{W}
$$
2. 對參數求偏導
$$
\frac{\partial loss}{\partial w_{kj}}=\frac{\partial loss}{\partial z_k}\frac{\partial z_k}{\partial w_{kj}}=\frac{\partial loss}{\partial z_k} u_{j}
$$
$$
\frac{\partial loss}{\partial \mathbf{w_{j}}}=\frac{\partial loss}{\partial \mathbf{z}} u_{j}
$$
進一步，可以寫成矩陣形式，其中$\mathbf{W}$為$10 \times 500$；$\mathbf{z}$為10維；$\mathbf{u}$為500維。
$$
\frac{\partial loss}{\partial \mathbf{W}}=\frac{\partial loss}{\partial \mathbf{z}} \mathbf{u^T}
$$
再進一步，考慮到一個batch有64個樣本，表達式可以寫成如下形式，其中$\mathbf{W}$為$10 \times 500$；$\mathbf{Z}$為$64 \times 10$；$\mathbf{U}$為$64 \times 500$:
$$
\frac{\partial loss}{\partial \mathbf{W}}=\frac{\partial loss}{\partial \mathbf{Z^T}} \cdot \mathbf{U}
$$
4 第七層ReLU
4.1 代碼分析
void ReLULayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down,
const vector<Blob<Dtype>*>& bottom) {
if (propagate_down[0]) {
const Dtype* bottom_data = bottom[0]->cpu_data();
const Dtype* top_diff = top[0]->cpu_diff();
Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
const int count = bottom[0]->count();
Dtype negative_slope = this->layer_param_.relu_param().negative_slope();
for (int i = 0; i < count; ++i) {
bottom_diff[i] = top_diff[i] * ((bottom_data[i] > 0)
+ negative_slope * (bottom_data[i] <= 0));
4.2 公式推導
前向計算:
$$
top\_data_i =
\left\{
\begin{aligned}
& bottom\_data_i, & bottom\_data_i \gt 0 \\
& bottom\_data_i \cdot slope, & bottom\_data_i \le 0
\end{aligned}
\right.
$$
反向傳播:
$$
\frac{\partial loss}{\partial bottom\_data_i}=\frac{\partial loss}{\partial top\_data_i} \cdot \frac{\partial top\_data_i}{\partial bottom\_data_i}
= \left\{
\begin{aligned}
& top\_diff_i, & bottom\_data_i \gt 0 \\
& top\_diff_i \cdot slope, & bottom\_data_i \le 0
\end{aligned}
\right.
$$
5 第五層Pooling
5.1 代碼分析
void PoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
const Dtype* top_diff = top[0]->cpu_diff();
Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
// bottom_diff初始化置0
caffe_set(bottom[0]->count(), Dtype(0), bottom_diff);
const int* mask = NULL; // suppress warnings about uninitialized variables
// 在前向計算時，max_idx中保存了top_data中每個點是由bottom_data中哪個點得來的(該點在所屬feature map中的坐標)
mask = max_idx_.cpu_data();
// 主循環，按(N,C,H,W)方式遍歷top_data中每個點
for (int n = 0; n < top[0]->num(); ++n) {
for (int c = 0; c < channels_; ++c) {
for (int ph = 0; ph < pooled_height_; ++ph) {
for (int pw = 0; pw < pooled_width_; ++pw) {
const int index = ph * pooled_width_ + pw;
const int bottom_index = mask[index];
// 見公式推導
bottom_diff[bottom_index] += top_diff[index];
bottom_diff += bottom[0]->offset(0, 1);
top_diff += top[0]->offset(0, 1);
mask += top[0]->offset(0, 1);
5.2 公式推導
$$
bottom\_diff_j = \frac{\partial loss}{\partial bottom\_data_j}=\frac{\partial loss}{\partial top\_data_i} \cdot \frac{\partial top\_data_i}{\partial bottom\_data_j} = top\_diff_i \cdot 1
$$
(注意下標：$j$為前向計算時mask中記錄的、產生$top\_data_i$的最大值位置，即$j = mask_i$)
6 第四層Convolution
void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
const Dtype* weight = this->blobs_[0]->cpu_data();
Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
for (int i = 0; i < top.size(); ++i) {
const Dtype* top_diff = top[i]->cpu_diff();
const Dtype* bottom_data = bottom[i]->cpu_data();
Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
// Bias gradient, if necessary.
if (this->bias_term_ && this->param_propagate_down_[1]) {
Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff();
// 對于每個Batch中的樣本，計算偏置的偏導
for (int n = 0; n < this->num_; ++n) {
this->backward_cpu_bias(bias_diff, top_diff + n * this->top_dim_);
if (this->param_propagate_down_[0] || propagate_down[i]) {
// 對于每個Batch中的樣本，關于權值及輸入求導；此部分代碼已將函數展開(非可運行代碼)
for (int n = 0; n < this->num_; ++n) {
// gradient w.r.t. weight. Note that we will accumulate diffs.
//top_diff(50*64) * bottom_data(500*64,Transpose) = weight_diff(50*500)
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, conv_out_channels_ / group_,
kernel_dim_, conv_out_spatial_dim_,
(Dtype)1., top_diff + n * this->top_dim_, bottom_data + n * this->bottom_dim_,
(Dtype)1., weight_diff);
// gradient w.r.t. bottom data, if necessary.
// weight(50*500,Transpose) * top_diff(50*64) = bottom_diff(500*64)
caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, kernel_dim_,
conv_out_spatial_dim_, conv_out_channels_ ,
(Dtype)1., weight, top_diff + n * this->top_dim_,
(Dtype)0., bottom_diff + n * this->bottom_dim_);
- 第四層的bottom維度$(N,C,H,W)=(64,20,12,12)$，top的維度$(N,C,H,W)=(64,50,8,8)$，由于每個樣本單獨處理，所以只需要關注$(C,H,W)$的維度，分別為$(20,12,12)$和$(50,8,8)$
- 根據(Caffe)卷積的實現，該層可以寫成矩陣相乘的形式$Weight\_data \times Bottom\_data^T = Top\_data^T$
- $Weight_data$的維度為$C_{out} \times (CKK)=50 \times 500$
- $Bottom\_data$的維度為$(HW) \times (CKK)=64 \times 500$，$64$為$8 \times 8$個卷積核的位置，$500=C \times K \times K=20 \times 5 \times 5$
- $Top_data$的維度為$64 \times 50$
- 寫成矩陣表示後，從某種角度上與全連接層(也是表示成矩陣相乘)相同，因此，可以借鑒全連接層的推導。