From e7f543bb264ad8597a3edaf3b938e9c3cc57bf33 Mon Sep 17 00:00:00 2001 From: ih4cku Date: Wed, 17 Jun 2015 12:15:28 +0800 Subject: [PATCH 001/144] register a dummy reducer to prevent mincepie runtime error --- tools/extra/resize_and_crop_images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/extra/resize_and_crop_images.py b/tools/extra/resize_and_crop_images.py index c844f590c06..fd2c3134edb 100755 --- a/tools/extra/resize_and_crop_images.py +++ b/tools/extra/resize_and_crop_images.py @@ -101,7 +101,7 @@ def map(self, key, value): yield value, FLAGS.output_folder mapreducer.REGISTER_DEFAULT_MAPPER(ResizeCropImagesMapper) - +mapreducer.REGISTER_DEFAULT_REDUCER(mapreducer.NoPassReducer) mapreducer.REGISTER_DEFAULT_READER(mapreducer.FileReader) mapreducer.REGISTER_DEFAULT_WRITER(mapreducer.FileWriter) From 10725393518df14b9b6976686f72fae792c3f393 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Mon, 5 Oct 2015 15:46:54 -0700 Subject: [PATCH 002/144] NetSpec: type-check Function inputs (they must be Top instances) --- python/caffe/net_spec.py | 4 ++++ python/caffe/test/test_net_spec.py | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/python/caffe/net_spec.py b/python/caffe/net_spec.py index 93fc01927db..b6520627a4b 100644 --- a/python/caffe/net_spec.py +++ b/python/caffe/net_spec.py @@ -103,6 +103,10 @@ class Function(object): def __init__(self, type_name, inputs, params): self.type_name = type_name + for index, input in enumerate(inputs): + if not isinstance(input, Top): + raise TypeError('%s input %d is not a Top (type is %s)' % + (type_name, index, type(input))) self.inputs = inputs self.params = params self.ntop = self.params.get('ntop', 1) diff --git a/python/caffe/test/test_net_spec.py b/python/caffe/test/test_net_spec.py index fee3c0aaebe..ffe71bacb08 100644 --- a/python/caffe/test/test_net_spec.py +++ b/python/caffe/test/test_net_spec.py @@ -79,3 +79,11 @@ def test_zero_tops(self): net_proto = silent_net() net = 
self.load_net(net_proto) self.assertEqual(len(net.forward()), 0) + + def test_type_error(self): + """Test that a TypeError is raised when a Function input isn't a Top.""" + data = L.DummyData(ntop=2) # data is a 2-tuple of Tops + r = r"^Silence input 0 is not a Top \(type is <(type|class) 'tuple'>\)$" + with self.assertRaisesRegexp(TypeError, r): + L.Silence(data, ntop=0) # should raise: data is a tuple, not a Top + L.Silence(*data, ntop=0) # shouldn't raise: each elt of data is a Top From 52dcf4801dddf05df3ddef238895cabbc6c4384a Mon Sep 17 00:00:00 2001 From: Azat Date: Thu, 3 Dec 2015 13:56:48 +0300 Subject: [PATCH 003/144] sigmoid fix (cu) Previous implementation caused FP overflow for x less than -90 --- src/caffe/layers/sigmoid_layer.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/caffe/layers/sigmoid_layer.cu b/src/caffe/layers/sigmoid_layer.cu index 184c61ede83..8a4ea6616e0 100644 --- a/src/caffe/layers/sigmoid_layer.cu +++ b/src/caffe/layers/sigmoid_layer.cu @@ -8,7 +8,7 @@ namespace caffe { template __global__ void SigmoidForward(const int n, const Dtype* in, Dtype* out) { CUDA_KERNEL_LOOP(index, n) { - out[index] = 1. / (1. + exp(-in[index])); + out[index] = 0.5 * tanh(0.5 * in[index]) + 0.5; } } From 0f61cc09467afa35835dc09617f1042e4f77c9fb Mon Sep 17 00:00:00 2001 From: Azat Date: Thu, 3 Dec 2015 14:00:08 +0300 Subject: [PATCH 004/144] sigmoid fix (cpp) Previous implementation caused FP overflow for x less than -90 --- src/caffe/layers/sigmoid_layer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/caffe/layers/sigmoid_layer.cpp b/src/caffe/layers/sigmoid_layer.cpp index 85fd9676812..f8aa769a174 100644 --- a/src/caffe/layers/sigmoid_layer.cpp +++ b/src/caffe/layers/sigmoid_layer.cpp @@ -7,7 +7,7 @@ namespace caffe { template inline Dtype sigmoid(Dtype x) { - return 1. / (1. 
+ exp(-x)); + return 0.5 * tanh(0.5 * x) + 0.5; } template From 337b07589f4e44761bdb9ef4c242f83ca40c9da5 Mon Sep 17 00:00:00 2001 From: shai Date: Mon, 21 Mar 2016 09:08:02 +0200 Subject: [PATCH 005/144] upgrading InfogainLoss layer: (1) incorporating Softmax layer to make the gradeint computation robust, much like SoftmaxWithLoss layer (see: http://stackoverflow.com/a/34917052/1714410 for more information). (2) supporting loss along axis --- include/caffe/layers/infogain_loss_layer.hpp | 35 ++++ src/caffe/layers/infogain_loss_layer.cpp | 172 ++++++++++++++++--- src/caffe/proto/caffe.proto | 1 + src/caffe/test/test_infogain_loss_layer.cpp | 83 ++++++++- 4 files changed, 257 insertions(+), 34 deletions(-) diff --git a/include/caffe/layers/infogain_loss_layer.hpp b/include/caffe/layers/infogain_loss_layer.hpp index 633f339a28e..edecde829ad 100644 --- a/include/caffe/layers/infogain_loss_layer.hpp +++ b/include/caffe/layers/infogain_loss_layer.hpp @@ -8,6 +8,7 @@ #include "caffe/proto/caffe.pb.h" #include "caffe/layers/loss_layer.hpp" +#include "caffe/layers/softmax_layer.hpp" namespace caffe { @@ -60,6 +61,12 @@ class InfogainLossLayer : public LossLayer { virtual inline int MinBottomBlobs() const { return 2; } virtual inline int MaxBottomBlobs() const { return 3; } + // InfogainLossLayer computes softmax prob internally. + // optional second "top" outputs the softmax prob + virtual inline int ExactNumTopBlobs() const { return -1; } + virtual inline int MinTopBlobs() const { return 1; } + virtual inline int MaxTopBlobs() const { return 2; } + virtual inline const char* type() const { return "InfogainLoss"; } protected: @@ -102,7 +109,35 @@ class InfogainLossLayer : public LossLayer { virtual void Backward_cpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); + /// Read the normalization mode parameter and compute the normalizer based + /// on the blob size. 
If normalization_mode is VALID, the count of valid + /// outputs will be read from valid_count, unless it is -1 in which case + /// all outputs are assumed to be valid. + virtual Dtype get_normalizer( + LossParameter_NormalizationMode normalization_mode, int valid_count); + /// fill sum_rows_H_ according to matrix H + virtual void sum_rows_of_H(const Blob* H); + + /// The internal SoftmaxLayer used to map predictions to a distribution. + shared_ptr > softmax_layer_; + /// prob stores the output probability predictions from the SoftmaxLayer. + Blob prob_; + /// bottom vector holder used in call to the underlying SoftmaxLayer::Forward + vector*> softmax_bottom_vec_; + /// top vector holder used in call to the underlying SoftmaxLayer::Forward + vector*> softmax_top_vec_; + Blob infogain_; + Blob sum_rows_H_; // cache the row sums of H. + + /// Whether to ignore instances with a certain label. + bool has_ignore_label_; + /// The label indicating that an instance should be ignored. + int ignore_label_; + /// How to normalize the output loss. 
+ LossParameter_NormalizationMode normalization_; + + int infogain_axis_, outer_num_, inner_num_, num_labels_; }; } // namespace caffe diff --git a/src/caffe/layers/infogain_loss_layer.cpp b/src/caffe/layers/infogain_loss_layer.cpp index 624d3118124..3c3f460ec34 100644 --- a/src/caffe/layers/infogain_loss_layer.cpp +++ b/src/caffe/layers/infogain_loss_layer.cpp @@ -3,7 +3,8 @@ #include #include "caffe/layers/infogain_loss_layer.hpp" -#include "caffe/util/io.hpp" +#include "caffe/util/io.hpp" // for bolb reading of matrix H +#include "caffe/util/math_functions.hpp" namespace caffe { @@ -11,6 +12,31 @@ template void InfogainLossLayer::LayerSetUp( const vector*>& bottom, const vector*>& top) { LossLayer::LayerSetUp(bottom, top); + // internal softmax layer + LayerParameter softmax_layer_param(this->layer_param_); + SoftmaxParameter* softmax_param = softmax_layer_param.mutable_softmax_param(); + softmax_param->set_axis(this->layer_param_.infogain_loss_param().axis()); + softmax_layer_param.set_type("Softmax"); + softmax_layer_param.clear_loss_weight(); + softmax_layer_param.add_loss_weight(1); + softmax_layer_ = LayerRegistry::CreateLayer(softmax_layer_param); + softmax_bottom_vec_.clear(); + softmax_bottom_vec_.push_back(bottom[0]); + softmax_top_vec_.clear(); + softmax_top_vec_.push_back(&prob_); + softmax_layer_->SetUp(softmax_bottom_vec_, softmax_top_vec_); + + // ignore label + has_ignore_label_ = + this->layer_param_.loss_param().has_ignore_label(); + if (has_ignore_label_) { + ignore_label_ = this->layer_param_.loss_param().ignore_label(); + } + // normalization + CHECK(!this->layer_param_.loss_param().has_normalize()) + << "normalize is deprecated. 
use \"normalization\""; + normalization_ = this->layer_param_.loss_param().normalization(); + // matrix H if (bottom.size() < 3) { CHECK(this->layer_param_.infogain_loss_param().has_source()) << "Infogain matrix source must be specified."; @@ -25,28 +51,86 @@ template void InfogainLossLayer::Reshape( const vector*>& bottom, const vector*>& top) { LossLayer::Reshape(bottom, top); + softmax_layer_->Reshape(softmax_bottom_vec_, softmax_top_vec_); + infogain_axis_ = + bottom[0]->CanonicalAxisIndex( + this->layer_param_.infogain_loss_param().axis()); + outer_num_ = bottom[0]->count(0, infogain_axis_); + inner_num_ = bottom[0]->count(infogain_axis_ + 1); + CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count()) + << "Number of labels must match number of predictions; " + << "e.g., if infogain axis == 1 and prediction shape is (N, C, H, W), " + << "label count (number of labels) must be N*H*W, " + << "with integer values in {0, 1, ..., C-1}."; + num_labels_ = bottom[0]->shape(infogain_axis_); Blob* infogain = NULL; if (bottom.size() < 3) { infogain = &infogain_; } else { infogain = bottom[2]; } - CHECK_EQ(bottom[1]->channels(), 1); - CHECK_EQ(bottom[1]->height(), 1); - CHECK_EQ(bottom[1]->width(), 1); - const int num = bottom[0]->num(); - const int dim = bottom[0]->count() / num; - CHECK_EQ(infogain->num(), 1); - CHECK_EQ(infogain->channels(), 1); - CHECK_EQ(infogain->height(), dim); - CHECK_EQ(infogain->width(), dim); + CHECK_EQ(infogain->count(), num_labels_*num_labels_); + sum_rows_H_.Reshape(vector(1, num_labels_)); + if (bottom.size() == 2) { + // H is provided as a parameter and will not change. 
sum rows once + sum_rows_of_H(infogain); + } + if (top.size() >= 2) { + // softmax output + top[1]->ReshapeLike(*bottom[0]); + } +} + +template +Dtype InfogainLossLayer::get_normalizer( + LossParameter_NormalizationMode normalization_mode, int valid_count) { + Dtype normalizer; + switch (normalization_mode) { + case LossParameter_NormalizationMode_FULL: + normalizer = Dtype(outer_num_ * inner_num_); + break; + case LossParameter_NormalizationMode_VALID: + if (valid_count == -1) { + normalizer = Dtype(outer_num_ * inner_num_); + } else { + normalizer = Dtype(valid_count); + } + break; + case LossParameter_NormalizationMode_BATCH_SIZE: + normalizer = Dtype(outer_num_); + break; + case LossParameter_NormalizationMode_NONE: + normalizer = Dtype(1); + break; + default: + LOG(FATAL) << "Unknown normalization mode: " + << LossParameter_NormalizationMode_Name(normalization_mode); + } + // Some users will have no labels for some examples in order to 'turn off' a + // particular loss in a multi-task setup. The max prevents NaNs in that case. + return std::max(Dtype(1.0), normalizer); } +template +void InfogainLossLayer::sum_rows_of_H(const Blob* H) { + CHECK_EQ(H->count(), num_labels_*num_labels_) + << "H must be " << num_labels_ << "x" << num_labels_; + const Dtype* infogain_mat = H->cpu_data(); + Dtype* sum = sum_rows_H_.mutable_cpu_data(); + for ( int row = 0; row < num_labels_ ; row++ ) { + sum[row] = 0; + for ( int col = 0; col < num_labels_ ; col++ ) { + sum[row] += infogain_mat[row*num_labels_+col]; + } + } +} template void InfogainLossLayer::Forward_cpu(const vector*>& bottom, const vector*>& top) { - const Dtype* bottom_data = bottom[0]->cpu_data(); + // The forward pass computes the softmax prob values. 
+ softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_); + const Dtype* prob_data = prob_.cpu_data(); const Dtype* bottom_label = bottom[1]->cpu_data(); const Dtype* infogain_mat = NULL; if (bottom.size() < 3) { @@ -54,17 +138,30 @@ void InfogainLossLayer::Forward_cpu(const vector*>& bottom, } else { infogain_mat = bottom[2]->cpu_data(); } - int num = bottom[0]->num(); - int dim = bottom[0]->count() / bottom[0]->num(); + int count = 0; Dtype loss = 0; - for (int i = 0; i < num; ++i) { - int label = static_cast(bottom_label[i]); - for (int j = 0; j < dim; ++j) { - Dtype prob = std::max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD)); - loss -= infogain_mat[label * dim + j] * log(prob); + for (int i = 0; i < outer_num_; ++i) { + for (int j = 0; j < inner_num_; j++) { + const int label_value = + static_cast(bottom_label[i * inner_num_ + j]); + if (has_ignore_label_ && label_value == ignore_label_) { + continue; + } + DCHECK_GE(label_value, 0); + DCHECK_LT(label_value, num_labels_); + for (int l = 0; l < num_labels_; l++) { + loss -= infogain_mat[label_value * num_labels_ + l] * + log(std::max( + prob_data[i * inner_num_*num_labels_ + l * inner_num_ + j], + Dtype(kLOG_THRESHOLD))); + } + ++count; } } - top[0]->mutable_cpu_data()[0] = loss / num; + top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_, count); + if (top.size() == 2) { + top[1]->ShareData(prob_); + } } template @@ -80,25 +177,44 @@ void InfogainLossLayer::Backward_cpu(const vector*>& top, << " Layer cannot backpropagate to infogain inputs."; } if (propagate_down[0]) { - const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* prob_data = prob_.cpu_data(); const Dtype* bottom_label = bottom[1]->cpu_data(); const Dtype* infogain_mat = NULL; if (bottom.size() < 3) { infogain_mat = infogain_.cpu_data(); } else { infogain_mat = bottom[2]->cpu_data(); + // H is provided as a "bottom" and might change. sum rows every time. 
+ sum_rows_of_H(bottom[2]); } + const Dtype* sum_rows_H = sum_rows_H_.cpu_data(); Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); - int num = bottom[0]->num(); - int dim = bottom[0]->count() / bottom[0]->num(); - const Dtype scale = - top[0]->cpu_diff()[0] / num; - for (int i = 0; i < num; ++i) { - const int label = static_cast(bottom_label[i]); - for (int j = 0; j < dim; ++j) { - Dtype prob = std::max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD)); - bottom_diff[i * dim + j] = scale * infogain_mat[label * dim + j] / prob; + const int dim = bottom[0]->count() / outer_num_; + int count = 0; + for (int i = 0; i < outer_num_; ++i) { + for (int j = 0; j < inner_num_; ++j) { + const int label_value = + static_cast(bottom_label[i * inner_num_ + j]); + DCHECK_GE(label_value, 0); + DCHECK_LT(label_value, num_labels_); + if (has_ignore_label_ && label_value == ignore_label_) { + for (int l = 0; l < num_labels_; ++l) { + bottom_diff[i * dim + l * inner_num_ + j] = 0; + } + } else { + for (int l = 0; l < num_labels_; ++l) { + bottom_diff[i * dim + l * inner_num_ + j] = + prob_data[i*dim + l*inner_num_ + j]*sum_rows_H[label_value] + - infogain_mat[label_value * num_labels_ + l]; + } + ++count; + } } } + // Scale gradient + Dtype loss_weight = top[0]->cpu_diff()[0] / + get_normalizer(normalization_, count); + caffe_scal(bottom[0]->count(), loss_weight, bottom_diff); } } diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 6900bb71482..591e9647258 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -794,6 +794,7 @@ message ImageDataParameter { message InfogainLossParameter { // Specify the infogain matrix source. 
optional string source = 1; + optional int32 axis = 2 [default = 1]; // axis of prob } message InnerProductParameter { diff --git a/src/caffe/test/test_infogain_loss_layer.cpp b/src/caffe/test/test_infogain_loss_layer.cpp index a24ac683dc5..34f21271a62 100644 --- a/src/caffe/test/test_infogain_loss_layer.cpp +++ b/src/caffe/test/test_infogain_loss_layer.cpp @@ -1,3 +1,4 @@ +#include #include #include "gtest/gtest.h" @@ -18,17 +19,22 @@ class InfogainLossLayerTest : public MultiDeviceTest { protected: InfogainLossLayerTest() - : blob_bottom_data_(new Blob(10, 5, 1, 1)), - blob_bottom_label_(new Blob(10, 1, 1, 1)), + : blob_bottom_data_(new Blob(4, 2, 5, 2)), + blob_bottom_label_(new Blob(4, 2, 1, 2)), blob_bottom_infogain_(new Blob(1, 1, 5, 5)), - blob_top_loss_(new Blob()) { + blob_top_loss_(new Blob()), + blob_top_prob_(new Blob()), + inner_(2), outer_(4*2), num_labels_(5) { Caffe::set_random_seed(1701); FillerParameter filler_param; - PositiveUnitballFiller filler(filler_param); + filler_param.set_min(-0.5); + filler_param.set_max(2.0); + UniformFiller filler(filler_param); filler.Fill(this->blob_bottom_data_); blob_bottom_vec_.push_back(blob_bottom_data_); for (int i = 0; i < blob_bottom_label_->count(); ++i) { - blob_bottom_label_->mutable_cpu_data()[i] = caffe_rng_rand() % 5; + blob_bottom_label_->mutable_cpu_data()[i] = + caffe_rng_rand() % num_labels_; } blob_bottom_vec_.push_back(blob_bottom_label_); filler_param.set_min(0.1); @@ -37,29 +43,94 @@ class InfogainLossLayerTest : public MultiDeviceTest { infogain_filler.Fill(this->blob_bottom_infogain_); blob_bottom_vec_.push_back(blob_bottom_infogain_); blob_top_vec_.push_back(blob_top_loss_); + blob_top_vec_.push_back(blob_top_prob_); } virtual ~InfogainLossLayerTest() { delete blob_bottom_data_; delete blob_bottom_label_; delete blob_bottom_infogain_; delete blob_top_loss_; + delete blob_top_prob_; } Blob* const blob_bottom_data_; Blob* const blob_bottom_label_; Blob* const blob_bottom_infogain_; Blob* const 
blob_top_loss_; + Blob* const blob_top_prob_; vector*> blob_bottom_vec_; vector*> blob_top_vec_; + int inner_, outer_, num_labels_; }; TYPED_TEST_CASE(InfogainLossLayerTest, TestDtypesAndDevices); +TYPED_TEST(InfogainLossLayerTest, TestInfogainLoss) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + layer_param.mutable_infogain_loss_param()->set_axis(2); + layer_param.clear_loss_weight(); + layer_param.add_loss_weight(1); + layer_param.add_loss_weight(0); + /*vector* lw = layer_param.mutable_loss_weight(); + lw->clear(); + lw->push_back(1); + lw->push_back(1);*/ + InfogainLossLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + // Now, check values + const Dtype* data = this->blob_bottom_vec_[0]->cpu_data(); + const Dtype* prob = this->blob_top_vec_[1]->cpu_data(); + const Dtype* labels = this->blob_bottom_vec_[1]->cpu_data(); + const Dtype* H = this->blob_bottom_vec_[2]->cpu_data(); + // first. 
test the prob top + CHECK_EQ(this->blob_bottom_vec_[0]->num_axes(), + this->blob_top_vec_[1]->num_axes()) + << "prob top shape not match bottom data"; + for (int ai = 0 ; ai < this->blob_bottom_vec_[0]->num_axes(); ai++) { + CHECK_EQ(this->blob_bottom_vec_[0]->shape(ai), + this->blob_top_vec_[1]->shape(ai)) + << "prob top shape not match bottom data"; + } + vector est_prob(this->num_labels_, 0); + for ( int i = 0 ; i < this->outer_; i++ ) { + for ( int j = 0; j < this->inner_; j++ ) { + Dtype den = 0; + for ( int l = 0; l < this->num_labels_; l++ ) { + est_prob[l] = std::exp( + data[i*this->num_labels_*this->inner_ + l*this->inner_ + j]); + den += est_prob[l]; + } + for ( int l = 0; l < this->num_labels_; l++ ) { + EXPECT_NEAR(prob[i*this->num_labels_*this->inner_ + l*this->inner_ + j], + est_prob[l]/den, 1e-6); + } + } + } + Dtype loss = 0; // loss from prob top + for ( int i = 0 ; i < this->outer_; i++ ) { + for ( int j = 0; j < this->inner_; j++ ) { + int gt = static_cast(labels[i*this->inner_+j]); + for ( int l = 0; l < this->num_labels_; l++ ) { + loss -= H[gt*this->num_labels_ + l] * + log(std::max( + prob[i*this->num_labels_*this->inner_ + l*this->inner_ + j], + Dtype(kLOG_THRESHOLD))); + } + } + } + EXPECT_NEAR(this->blob_top_loss_->cpu_data()[0], + loss/(this->outer_*this->inner_), 1e-6); +} TYPED_TEST(InfogainLossLayerTest, TestGradient) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; + layer_param.mutable_infogain_loss_param()->set_axis(2); InfogainLossLayer layer(layer_param); - GradientChecker checker(1e-4, 2e-2, 1701, 1, 0.01); + this->blob_top_vec_.clear(); // ignore prob top. 
+ this->blob_top_vec_.push_back(this->blob_top_loss_); + GradientChecker checker(1e-4, 2e-2, 1701); // no "kink" checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, this->blob_top_vec_, 0); } From a66bea30d6c0706f106b355c7cafc9e7ffae7bb5 Mon Sep 17 00:00:00 2001 From: An Tran Date: Wed, 30 Mar 2016 17:32:10 +0800 Subject: [PATCH 006/144] small bug in pooling_layer.cu --- src/caffe/layers/pooling_layer.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/caffe/layers/pooling_layer.cu b/src/caffe/layers/pooling_layer.cu index 1ea46cc81b1..81ead1e8686 100644 --- a/src/caffe/layers/pooling_layer.cu +++ b/src/caffe/layers/pooling_layer.cu @@ -138,7 +138,7 @@ __global__ void StoPoolForwardTest(const int nthreads, const int wstart = pw * stride_w; const int wend = min(wstart + kernel_w, width); // We set cumsum to be 0 to avoid divide-by-zero problems - Dtype cumsum = FLT_MIN; + Dtype cumsum = 0.; Dtype cumvalues = 0.; const Dtype* const bottom_slice = bottom_data + (n * channels + c) * height * width; From d17fbea6aad122c3818d5ef3593487869948b4b7 Mon Sep 17 00:00:00 2001 From: An Tran Date: Thu, 31 Mar 2016 10:27:31 +0800 Subject: [PATCH 007/144] avoid divide by zeros, suggested by SeanBell --- src/caffe/layers/pooling_layer.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/caffe/layers/pooling_layer.cu b/src/caffe/layers/pooling_layer.cu index 81ead1e8686..46eddb94924 100644 --- a/src/caffe/layers/pooling_layer.cu +++ b/src/caffe/layers/pooling_layer.cu @@ -149,7 +149,7 @@ __global__ void StoPoolForwardTest(const int nthreads, cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w]; } } - top_data[index] = cumvalues / cumsum; + top_data[index] = (cumsum > 0.) ? 
cumvalues / cumsum : 0.; } } From d4e7c93a6873f75a53d7618e82343e4b5b8a239e Mon Sep 17 00:00:00 2001 From: Aaron Schumacher Date: Thu, 19 May 2016 14:04:22 -0500 Subject: [PATCH 008/144] convert non-uint8 dtypes to float; refs #2391 As recommended by @longjon, this will allow `caffe.io.array_to_datum` to handle, for example, numpy.float32 arrays. It might be worth noting that `datum.float_data` is stored as protobuf type 2, which is float32, as opposed to protobuf type 1, which is float64. It is a little unintuitive that caffe currently requires data to be passed in as float64 but then writes float32 to LMDB. To demonstrate this: ```python datum = caffe.io.array_to_datum(np.array([[[0.9]]])) caffe.io.datum_to_array(datum) # array([[[ 0.9]]]) datum_str = datum.SerializeToString() new_datum = caffe.proto.caffe_pb2.Datum() new_datum.ParseFromString(datum_str) caffe.io.datum_to_array(new_datum) # array([[[ 0.89999998]]]) ``` This behavior is somewhat hidden because `datum_to_array` returns type float64, even though the data doesn't actually have that resolution if it has been stored as protobuf text anywhere (for example in LMDB). Alternative solutions: * Require and return float32, consistent with the protobuf representation. * Change the protobuf to allow float32 or float64 and update surrounding code to support this. 
--- python/caffe/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/caffe/io.py b/python/caffe/io.py index e1759beb587..966c164cffd 100644 --- a/python/caffe/io.py +++ b/python/caffe/io.py @@ -75,7 +75,7 @@ def array_to_datum(arr, label=None): if arr.dtype == np.uint8: datum.data = arr.tostring() else: - datum.float_data.extend(arr.flat) + datum.float_data.extend(arr.astype(float).flat) if label is not None: datum.label = label return datum From 5d7a71ae108f86c05bc03eb542155b30bd28ca74 Mon Sep 17 00:00:00 2001 From: Lumin Zhou Date: Mon, 30 May 2016 04:19:16 +0000 Subject: [PATCH 009/144] using GNUInstallDirs in root cmake file --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index da7142c9b3c..c765889e99c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,6 +18,7 @@ add_definitions(-DCAFFE_VERSION=${CAFFE_TARGET_VERSION}) list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) include(ExternalProject) +include(GNUInstallDirs) include(cmake/Utils.cmake) include(cmake/Targets.cmake) From 90b98ce76fe8613d345932f47a6250dc772f7b8f Mon Sep 17 00:00:00 2001 From: Lumin Zhou Date: Mon, 30 May 2016 04:21:27 +0000 Subject: [PATCH 010/144] fix install path with GNUInstallDir support --- src/caffe/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/caffe/CMakeLists.txt b/src/caffe/CMakeLists.txt index 8a80c940488..5a1b73f7493 100644 --- a/src/caffe/CMakeLists.txt +++ b/src/caffe/CMakeLists.txt @@ -29,9 +29,9 @@ set_target_properties(caffe PROPERTIES add_subdirectory(test) # ---[ Install -install(DIRECTORY ${Caffe_INCLUDE_DIR}/caffe DESTINATION include) -install(FILES ${proto_hdrs} DESTINATION include/caffe/proto) -install(TARGETS caffe proto EXPORT CaffeTargets DESTINATION lib) +install(DIRECTORY ${Caffe_INCLUDE_DIR}/caffe DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) +install(FILES ${proto_hdrs} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/caffe/proto) 
+install(TARGETS caffe proto EXPORT CaffeTargets DESTINATION ${CMAKE_INSTALL_LIBDIR}) file(WRITE ${PROJECT_BINARY_DIR}/__init__.py) list(APPEND proto_python ${PROJECT_BINARY_DIR}/__init__.py) From 581650b18d7580df726d1d6d54d83c397d1379bb Mon Sep 17 00:00:00 2001 From: Lumin Zhou Date: Mon, 30 May 2016 04:22:42 +0000 Subject: [PATCH 011/144] fix install path with GNUInstallDir support --- tools/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 02fbd5cadd8..3789450555e 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -25,5 +25,6 @@ foreach(source ${srcs}) endif() # Install - install(TARGETS ${name} DESTINATION bin) + install(TARGETS ${name} DESTINATION ${CMAKE_INSTALL_BINDIR}) + endforeach(source) From f710ef5e89d3ec22891b24099c66b7a6e9f06c45 Mon Sep 17 00:00:00 2001 From: Lumin Zhou Date: Mon, 30 May 2016 04:24:13 +0000 Subject: [PATCH 012/144] fix install path with GNUInstallDir support --- examples/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 663d7360b7d..2a2300332ad 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -19,7 +19,8 @@ foreach(source_file ${examples_srcs}) caffe_set_solution_folder(${name} examples) # install - install(TARGETS ${name} DESTINATION bin) + install(TARGETS ${name} DESTINATION ${CMAKE_INSTALL_BINDIR}) + if(UNIX OR APPLE) # Funny command to make tutorials work From 9376bde1beba649e4c522b742064223ac9d2cab4 Mon Sep 17 00:00:00 2001 From: jasjuang Date: Thu, 21 Jul 2016 12:04:41 -0700 Subject: [PATCH 013/144] add in sudo make uninstall for cmake --- CMakeLists.txt | 11 +++++++++++ cmake/Uninstall.cmake.in | 26 ++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 cmake/Uninstall.cmake.in diff --git a/CMakeLists.txt b/CMakeLists.txt index da7142c9b3c..7b8dab2bb24 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt 
@@ -85,8 +85,19 @@ if(BUILD_python) add_dependencies(pytest pycaffe) endif() +# ---[ uninstall target +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Uninstall.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/cmake/Uninstall.cmake + IMMEDIATE @ONLY) + +add_custom_target(uninstall + COMMAND ${CMAKE_COMMAND} -P + ${CMAKE_CURRENT_BINARY_DIR}/cmake/Uninstall.cmake) + # ---[ Configuration summary caffe_print_configuration_summary() # ---[ Export configs generation caffe_generate_export_configs() + diff --git a/cmake/Uninstall.cmake.in b/cmake/Uninstall.cmake.in new file mode 100644 index 00000000000..bb8e2964e46 --- /dev/null +++ b/cmake/Uninstall.cmake.in @@ -0,0 +1,26 @@ +if(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") + message(FATAL_ERROR "Cannot find install manifest: @CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") +endif(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") + +if (NOT DEFINED CMAKE_INSTALL_PREFIX) + set (CMAKE_INSTALL_PREFIX "@CMAKE_INSTALL_PREFIX@") +endif () + message(${CMAKE_INSTALL_PREFIX}) + +file(READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files) +string(REGEX REPLACE "\n" ";" files "${files}") +foreach(file ${files}) + message(STATUS "Uninstalling $ENV{DESTDIR}${file}") + if(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") + exec_program( + "@CMAKE_COMMAND@" ARGS "-E remove \"$ENV{DESTDIR}${file}\"" + OUTPUT_VARIABLE rm_out + RETURN_VALUE rm_retval + ) + if(NOT "${rm_retval}" STREQUAL 0) + message(FATAL_ERROR "Problem when removing $ENV{DESTDIR}${file}") + endif(NOT "${rm_retval}" STREQUAL 0) + else(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") + message(STATUS "File $ENV{DESTDIR}${file} does not exist.") + endif(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") +endforeach(file) \ No newline at end of file From d607858b90b645d8177c3970d782f0ab5c529558 Mon Sep 17 00:00:00 2001 From: Zhou Mo Date: Tue, 9 Aug 2016 15:13:47 +0000 Subject: [PATCH 
014/144] Fix more float comparison precision issue With reference to this commit: f1a8470aa21e35a5b2bb83007f8fb7680a354815 This fix changes some EXPECT_EQ into EXPECT_FLOAT_EQ . --- src/caffe/test/test_convolution_layer.cpp | 2 +- src/caffe/test/test_gradient_based_solver.cpp | 8 ++++---- src/caffe/test/test_neuron_layer.cpp | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/caffe/test/test_convolution_layer.cpp b/src/caffe/test/test_convolution_layer.cpp index 9bb19d13592..85c10a29483 100644 --- a/src/caffe/test/test_convolution_layer.cpp +++ b/src/caffe/test/test_convolution_layer.cpp @@ -695,7 +695,7 @@ TYPED_TEST(ConvolutionLayerTest, TestNDAgainst2D) { } ASSERT_EQ(backward_result_nd.count(), backward_result_2d.count()); for (int i = 0; i < backward_result_2d.count(); ++i) { - EXPECT_EQ(backward_result_2d.cpu_diff()[i], + EXPECT_FLOAT_EQ(backward_result_2d.cpu_diff()[i], backward_result_nd.cpu_diff()[i]); } ASSERT_EQ(backward_weight_result_nd.count(), diff --git a/src/caffe/test/test_gradient_based_solver.cpp b/src/caffe/test/test_gradient_based_solver.cpp index 975a8f0f88a..9395f4e95c6 100644 --- a/src/caffe/test/test_gradient_based_solver.cpp +++ b/src/caffe/test/test_gradient_based_solver.cpp @@ -538,9 +538,9 @@ class GradientBasedSolverTest : public MultiDeviceTest { const vector*>& params = solver_->net()->learnable_params(); for (int i = 0; i < params.size(); ++i) { for (int j = 0; j < params[i]->count(); ++j) { - EXPECT_EQ(param_copies[i]->cpu_data()[j], params[i]->cpu_data()[j]) + EXPECT_FLOAT_EQ(param_copies[i]->cpu_data()[j], params[i]->cpu_data()[j]) << "param " << i << " data differed at dim " << j; - EXPECT_EQ(param_copies[i]->cpu_diff()[j], params[i]->cpu_diff()[j]) + EXPECT_FLOAT_EQ(param_copies[i]->cpu_diff()[j], params[i]->cpu_diff()[j]) << "param " << i << " diff differed at dim " << j; } } @@ -549,9 +549,9 @@ class GradientBasedSolverTest : public MultiDeviceTest { const vector > >& history = solver_->history(); 
for (int i = 0; i < history.size(); ++i) { for (int j = 0; j < history[i]->count(); ++j) { - EXPECT_EQ(history_copies[i]->cpu_data()[j], history[i]->cpu_data()[j]) + EXPECT_FLOAT_EQ(history_copies[i]->cpu_data()[j], history[i]->cpu_data()[j]) << "history blob " << i << " data differed at dim " << j; - EXPECT_EQ(history_copies[i]->cpu_diff()[j], history[i]->cpu_diff()[j]) + EXPECT_FLOAT_EQ(history_copies[i]->cpu_diff()[j], history[i]->cpu_diff()[j]) << "history blob " << i << " diff differed at dim " << j; } } diff --git a/src/caffe/test/test_neuron_layer.cpp b/src/caffe/test/test_neuron_layer.cpp index 342f825cec3..57bd47b3a2e 100644 --- a/src/caffe/test/test_neuron_layer.cpp +++ b/src/caffe/test/test_neuron_layer.cpp @@ -791,16 +791,16 @@ TYPED_TEST(NeuronLayerTest, TestPReLUInPlace) { ip2.Backward(blob_middle_vec_2, propagate_down, blob_bottom_vec_2); // Check numbers for (int s = 0; s < blob_bottom_2->count(); ++s) { - EXPECT_EQ(this->blob_bottom_->cpu_diff()[s], blob_bottom_2->cpu_diff()[s]); + EXPECT_FLOAT_EQ(this->blob_bottom_->cpu_diff()[s], blob_bottom_2->cpu_diff()[s]); } for (int s = 0; s < ip.blobs()[0]->count(); ++s) { - EXPECT_EQ(ip.blobs()[0]->cpu_diff()[s], ip2.blobs()[0]->cpu_diff()[s]); + EXPECT_FLOAT_EQ(ip.blobs()[0]->cpu_diff()[s], ip2.blobs()[0]->cpu_diff()[s]); } for (int s = 0; s < ip.blobs()[1]->count(); ++s) { - EXPECT_EQ(ip.blobs()[1]->cpu_diff()[s], ip2.blobs()[1]->cpu_diff()[s]); + EXPECT_FLOAT_EQ(ip.blobs()[1]->cpu_diff()[s], ip2.blobs()[1]->cpu_diff()[s]); } for (int s = 0; s < prelu.blobs()[0]->count(); ++s) { - EXPECT_EQ(prelu.blobs()[0]->cpu_diff()[s], + EXPECT_FLOAT_EQ(prelu.blobs()[0]->cpu_diff()[s], prelu2.blobs()[0]->cpu_diff()[s]); } } From 42d20fe21eeb8067b09ef5e935bb4c235dbf9f3f Mon Sep 17 00:00:00 2001 From: Zhou Mo Date: Wed, 10 Aug 2016 14:36:33 +0000 Subject: [PATCH 015/144] Import bash completion script for caffe from Debian Package. Imported from Debian Package caffe (1.0.0~rc3+20160715-g42cd785-2). 
--- scripts/caffe | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 scripts/caffe diff --git a/scripts/caffe b/scripts/caffe new file mode 100644 index 00000000000..8a0b22af6ac --- /dev/null +++ b/scripts/caffe @@ -0,0 +1,73 @@ +# bash completion for Caffe's command line utility -*- shell-script -*- +# COPYRIGHT (C) 2015,2016 Zhou Mo +# License: BSD-2-Clause +# Originally appeard at https://github.com/BVLC/caffe/issues/3149 + +# Updated for caffe (1.0.0~rc3+20160715-g42cd785) +_caffe() +{ + local cur prev words cword + _init_completion -s || return + + local prototxts='@(prototxt)' + local caffemodels='@(caffemodel,binaryproto)' + local solverstates='@(solverstate)' + local caffefiles='@(prototxt|caffemodel|solverstate)' + + local flags='-gpu -iterations -model -snapshot -solver -weights -sighup_effect -sigint_effect -level -stage -phase' + + if [[ $cword -eq 1 ]]; then + COMPREPLY=( $( compgen -W 'train test time device_query' -- "$cur" ) ) + return 0 + fi + + if [[ $cword -eq 2 ]]; then + case ${words[1]} in + train|test|device_query|time) + COMPREPLY=( $( compgen -W "$flags" -- "$cur") ) + return 0 + ;; + *) + return 0 + ;; + esac + fi + + case $prev in + -gpu|-iterations|-version|-level|-stage) + return 0 + ;; + -solver|-model) + _filedir $prototxts + return 0 + ;; + -weights) + _filedir $caffemodels + return 0 + ;; + -snapshot) + _filedir $solverstates + return 0 + ;; + -sighup_effect|-sigint_effect) + COMPREPLY=( $( compgen -W 'snapshot stop none' -- "$cur") ) + return 0 + ;; + -phase) + COMPREPLY=( $( compgen -W 'TRAIN TEST' -- "$cur") ) + return 0 + ;; + *) + COMPREPLY=( $( compgen -W "$flags" -- "$cur") ) + return 0 + ;; + esac + + # file completion on relevant files + _filedir "$caffefiles" + + return 0 +} +complete -F _caffe caffe + +# vim From 6382d67da1d2b5d9ebe92df8a20a8ac1947366ea Mon Sep 17 00:00:00 2001 From: An Tran Date: Fri, 12 Aug 2016 16:39:11 +0800 Subject: [PATCH 016/144] small 
improments in compute_image_mean --- tools/compute_image_mean.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/compute_image_mean.cpp b/tools/compute_image_mean.cpp index 2035d515195..417f5e4c622 100644 --- a/tools/compute_image_mean.cpp +++ b/tools/compute_image_mean.cpp @@ -22,9 +22,11 @@ DEFINE_string(backend, "lmdb", "The backend {leveldb, lmdb} containing the images"); int main(int argc, char** argv) { +#ifdef USE_OPENCV ::google::InitGoogleLogging(argv[0]); + // Print output to stderr (while still logging) + FLAGS_alsologtostderr = 1; -#ifdef USE_OPENCV #ifndef GFLAGS_GFLAGS_H_ namespace gflags = google; #endif @@ -65,7 +67,7 @@ int main(int argc, char** argv) { for (int i = 0; i < size_in_datum; ++i) { sum_blob.add_data(0.); } - LOG(INFO) << "Starting Iteration"; + LOG(INFO) << "Starting iteration"; while (cursor->valid()) { Datum datum; datum.ParseFromString(cursor->value()); @@ -114,7 +116,7 @@ int main(int argc, char** argv) { for (int i = 0; i < dim; ++i) { mean_values[c] += sum_blob.data(dim * c + i); } - LOG(INFO) << "mean_value channel [" << c << "]:" << mean_values[c] / dim; + LOG(INFO) << "mean_value channel [" << c << "]: " << mean_values[c] / dim; } #else LOG(FATAL) << "This tool requires OpenCV; compile with USE_OPENCV."; From fe9e58d6360d99cde0a883a06590631bb11911e0 Mon Sep 17 00:00:00 2001 From: zhuyuanhao Date: Wed, 1 Mar 2017 20:42:30 +0800 Subject: [PATCH 017/144] Remove not used variable in base_conv_layer.cpp --- src/caffe/layers/base_conv_layer.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index 4a4c68e009a..35c90145e31 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -19,7 +19,6 @@ void BaseConvolutionLayer::LayerSetUp(const vector*>& bottom, const int num_axes = bottom[0]->num_axes(); num_spatial_axes_ = num_axes - first_spatial_axis; CHECK_GE(num_spatial_axes_, 0); - vector 
bottom_dim_blob_shape(1, num_spatial_axes_ + 1); vector spatial_dim_blob_shape(1, std::max(num_spatial_axes_, 1)); // Setup filter kernel dimensions (kernel_shape_). kernel_shape_.Reshape(spatial_dim_blob_shape); From 4529f12bdcd27d74655473b6665f5a23cd1214b1 Mon Sep 17 00:00:00 2001 From: gineshidalgo99 Date: Thu, 9 Mar 2017 19:24:06 -0500 Subject: [PATCH 018/144] =?UTF-8?q?Removed=20some=20'warning:=20extra=20?= =?UTF-8?q?=E2=80=98;=E2=80=99=20[-Wpedantic]'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/caffe/util/math_functions.hpp | 6 +++--- include/caffe/util/mkl_alternate.hpp | 18 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index 51068fe2b80..37abce5eccc 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -128,16 +128,16 @@ inline int8_t caffe_sign(Dtype val) { } // output is 1 for the positives, 0 for zero, and -1 for the negatives -DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign(x[i])); +DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign(x[i])) // This returns a nonzero value if the input has its sign bit set. // The name sngbit is meant to avoid conflicts with std::signbit in the macro. // The extra parens are needed because CUDA < 6.5 defines signbit as a macro, // and we don't want that to expand here when CUDA headers are also included. 
DEFINE_CAFFE_CPU_UNARY_FUNC(sgnbit, \ - y[i] = static_cast((std::signbit)(x[i]))); + y[i] = static_cast((std::signbit)(x[i]))) -DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i])); +DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i])) template void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); diff --git a/include/caffe/util/mkl_alternate.hpp b/include/caffe/util/mkl_alternate.hpp index 95df0f93b5e..79b2c32de94 100644 --- a/include/caffe/util/mkl_alternate.hpp +++ b/include/caffe/util/mkl_alternate.hpp @@ -36,10 +36,10 @@ extern "C" { v##name(n, a, y); \ } -DEFINE_VSL_UNARY_FUNC(Sqr, y[i] = a[i] * a[i]); -DEFINE_VSL_UNARY_FUNC(Exp, y[i] = exp(a[i])); -DEFINE_VSL_UNARY_FUNC(Ln, y[i] = log(a[i])); -DEFINE_VSL_UNARY_FUNC(Abs, y[i] = fabs(a[i])); +DEFINE_VSL_UNARY_FUNC(Sqr, y[i] = a[i] * a[i]) +DEFINE_VSL_UNARY_FUNC(Exp, y[i] = exp(a[i])) +DEFINE_VSL_UNARY_FUNC(Ln, y[i] = log(a[i])) +DEFINE_VSL_UNARY_FUNC(Abs, y[i] = fabs(a[i])) // A simple way to define the vsl unary functions with singular parameter b. // The operation should be in the form e.g. y[i] = pow(a[i], b) @@ -58,7 +58,7 @@ DEFINE_VSL_UNARY_FUNC(Abs, y[i] = fabs(a[i])); v##name(n, a, b, y); \ } -DEFINE_VSL_UNARY_FUNC_WITH_PARAM(Powx, y[i] = pow(a[i], b)); +DEFINE_VSL_UNARY_FUNC_WITH_PARAM(Powx, y[i] = pow(a[i], b)) // A simple way to define the vsl binary functions. The operation should // be in the form e.g. 
y[i] = a[i] + b[i] @@ -77,10 +77,10 @@ DEFINE_VSL_UNARY_FUNC_WITH_PARAM(Powx, y[i] = pow(a[i], b)); v##name(n, a, b, y); \ } -DEFINE_VSL_BINARY_FUNC(Add, y[i] = a[i] + b[i]); -DEFINE_VSL_BINARY_FUNC(Sub, y[i] = a[i] - b[i]); -DEFINE_VSL_BINARY_FUNC(Mul, y[i] = a[i] * b[i]); -DEFINE_VSL_BINARY_FUNC(Div, y[i] = a[i] / b[i]); +DEFINE_VSL_BINARY_FUNC(Add, y[i] = a[i] + b[i]) +DEFINE_VSL_BINARY_FUNC(Sub, y[i] = a[i] - b[i]) +DEFINE_VSL_BINARY_FUNC(Mul, y[i] = a[i] * b[i]) +DEFINE_VSL_BINARY_FUNC(Div, y[i] = a[i] / b[i]) // In addition, MKL comes with an additional function axpby that is not present // in standard blas. We will simply use a two-step (inefficient, of course) way From 93993a3c2b25ad683dbf13ef3085b0ea5912911f Mon Sep 17 00:00:00 2001 From: Cyprien Noel Date: Tue, 14 Mar 2017 15:41:40 -0700 Subject: [PATCH 019/144] Init test net on all GPUs, allows parallel inference --- src/caffe/solver.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index fd4c03724ef..044269371ad 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -51,8 +51,8 @@ void Solver::Init(const SolverParameter& param) { } // Scaffolding code InitTrainNet(); + InitTestNets(); if (Caffe::root_solver()) { - InitTestNets(); LOG(INFO) << "Solver scaffolding done."; } iter_ = 0; @@ -102,7 +102,6 @@ void Solver::InitTrainNet() { template void Solver::InitTestNets() { - CHECK(Caffe::root_solver()); const bool has_net_param = param_.has_net_param(); const bool has_net_file = param_.has_net(); const int num_generic_nets = has_net_param + has_net_file; From 802d90fe81f04e5e9c28c088da0f1b22e1b9fed2 Mon Sep 17 00:00:00 2001 From: Guillaume Dumont Date: Thu, 16 Mar 2017 23:08:20 -0400 Subject: [PATCH 020/144] Added python 3 compatibility to cpp_lint.py --- scripts/cpp_lint.py | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/scripts/cpp_lint.py b/scripts/cpp_lint.py index 
6ec4fb76e2c..b2016d4b6dd 100755 --- a/scripts/cpp_lint.py +++ b/scripts/cpp_lint.py @@ -1,4 +1,4 @@ -#!/usr/bin/python2 +#!/usr/bin/env python # # Copyright (c) 2009 Google Inc. All rights reserved. # @@ -52,6 +52,10 @@ import sys import unicodedata +import six + +from six import iteritems, itervalues +from six.moves import xrange _USAGE = """ Syntax: cpp_lint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...] @@ -756,7 +760,7 @@ def IncrementErrorCount(self, category): def PrintErrorCounts(self): """Print a summary of errors by category, and the total.""" - for category, count in self.errors_by_category.iteritems(): + for category, count in iteritems(self.errors_by_category): sys.stderr.write('Category \'%s\' errors found: %d\n' % (category, count)) sys.stderr.write('Total errors found: %d\n' % self.error_count) @@ -3444,16 +3448,16 @@ def GetLineWidth(line): The width of the line in column positions, accounting for Unicode combining characters and wide characters. """ - if isinstance(line, unicode): - width = 0 - for uc in unicodedata.normalize('NFC', line): - if unicodedata.east_asian_width(uc) in ('W', 'F'): - width += 2 - elif not unicodedata.combining(uc): - width += 1 - return width - else: - return len(line) + if six.PY2: + if isinstance(line, unicode): + width = 0 + for uc in unicodedata.normalize('NFC', line): + if unicodedata.east_asian_width(uc) in ('W', 'F'): + width += 2 + elif not unicodedata.combining(uc): + width += 1 + return width + return len(line) def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, @@ -3774,7 +3778,7 @@ def _GetTextInside(text, start_pattern): # Give opening punctuations to get the matching close-punctuations. matching_punctuation = {'(': ')', '{': '}', '[': ']'} - closing_punctuation = set(matching_punctuation.itervalues()) + closing_punctuation = set(itervalues(matching_punctuation)) # Find the position to start extracting text. 
match = re.search(start_pattern, text, re.M) @@ -4851,10 +4855,11 @@ def main(): # Change stderr to write with replacement characters so we don't die # if we try to print something containing non-ASCII characters. - sys.stderr = codecs.StreamReaderWriter(sys.stderr, - codecs.getreader('utf8'), - codecs.getwriter('utf8'), - 'replace') + if six.PY2: + sys.stderr = codecs.StreamReaderWriter(sys.stderr, + codecs.getreader('utf8'), + codecs.getwriter('utf8'), + 'replace') _cpplint_state.ResetErrorCounts() for filename in filenames: From accd188d3241c27a6d24b95cd95a4dca4f4078bc Mon Sep 17 00:00:00 2001 From: max argus Date: Wed, 8 Mar 2017 15:04:29 +0000 Subject: [PATCH 021/144] sane h5df file type check for weights --- src/caffe/net.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 70d51806d8a..353c2f95b9e 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -769,8 +769,7 @@ void Net::CopyTrainedLayersFrom(const NetParameter& param) { template void Net::CopyTrainedLayersFrom(const string trained_filename) { - if (trained_filename.size() >= 3 && - trained_filename.compare(trained_filename.size() - 3, 3, ".h5") == 0) { + if (H5Fis_hdf5(trained_filename.c_str())) { CopyTrainedLayersFromHDF5(trained_filename); } else { CopyTrainedLayersFromBinaryProto(trained_filename); From 11930f1416efb66795e1fabc5e362a568446d37d Mon Sep 17 00:00:00 2001 From: "Jonathan R. Williford" Date: Wed, 22 Mar 2017 22:36:14 +0100 Subject: [PATCH 022/144] Clarify batch norm parameter documentation. --- src/caffe/proto/caffe.proto | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index a145c541957..02e0ddf57c1 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -502,11 +502,21 @@ message ConcatParameter { } message BatchNormParameter { - // If false, accumulate global mean/variance values via a moving average. 
If - // true, use those accumulated values instead of computing mean/variance - // across the batch. + // If false, normalization is performed over the current mini-batch + // and global statistics are accumulated (but not yet used) by a moving + // average. + // If true, those accumulated mean and variance values are used for the + // normalization. + // By default, it is set to false when the network is in the training + // phase and true when the network is in the testing phase. optional bool use_global_stats = 1; - // How much does the moving average decay each iteration? + // What fraction of the moving average remains each iteration? + // Smaller values make the moving average decay faster, giving more + // weight to the recent values. + // Each iteration updates the moving average @f$S_{t-1}@f$ with the + // current mean @f$ Y_t @f$ by + // @f$ S_t = (1-\beta)Y_t + \beta \cdot S_{t-1} @f$, where @f$ \beta @f$ + // is the moving_average_fraction parameter. optional float moving_average_fraction = 2 [default = .999]; // Small value to add to the variance estimate so that we don't divide by // zero. 
From 5c8e3545c650e9d3924f707334bde7cd67cf4e07 Mon Sep 17 00:00:00 2001 From: max argus Date: Wed, 22 Mar 2017 23:15:34 +0000 Subject: [PATCH 023/144] [caffe][build] added Atlas lapack Library name atllapack --- cmake/Modules/FindAtlas.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/Modules/FindAtlas.cmake b/cmake/Modules/FindAtlas.cmake index 9c665a47bd5..7ffa6393bbc 100644 --- a/cmake/Modules/FindAtlas.cmake +++ b/cmake/Modules/FindAtlas.cmake @@ -28,7 +28,7 @@ find_path(Atlas_CLAPACK_INCLUDE_DIR NAMES clapack.h PATHS ${Atlas_INCLUDE_SEARCH find_library(Atlas_CBLAS_LIBRARY NAMES ptcblas_r ptcblas cblas_r cblas PATHS ${Atlas_LIB_SEARCH_PATHS}) find_library(Atlas_BLAS_LIBRARY NAMES atlas_r atlas PATHS ${Atlas_LIB_SEARCH_PATHS}) -find_library(Atlas_LAPACK_LIBRARY NAMES lapack alapack_r alapack lapack_atlas PATHS ${Atlas_LIB_SEARCH_PATHS}) +find_library(Atlas_LAPACK_LIBRARY NAMES lapack alapack_r alapack lapack_atlas atllapack PATHS ${Atlas_LIB_SEARCH_PATHS}) set(LOOKED_FOR Atlas_CBLAS_INCLUDE_DIR @@ -47,6 +47,6 @@ if(ATLAS_FOUND) set(Atlas_LIBRARIES ${Atlas_LAPACK_LIBRARY} ${Atlas_CBLAS_LIBRARY} ${Atlas_BLAS_LIBRARY}) mark_as_advanced(${LOOKED_FOR}) - message(STATUS "Found Atlas (include: ${Atlas_CBLAS_INCLUDE_DIR}, library: ${Atlas_BLAS_LIBRARY})") + message(STATUS "Found Atlas (include: ${Atlas_CBLAS_INCLUDE_DIR} library: ${Atlas_BLAS_LIBRARY} lapack: ${Atlas_LAPACK_LIBRARY}") endif(ATLAS_FOUND) From 1e02d622da5aa01fbcf1185bced8e4b0daa0a50b Mon Sep 17 00:00:00 2001 From: max argus Date: Wed, 22 Mar 2017 23:24:13 +0000 Subject: [PATCH 024/144] [caffe][build] added ABS_TEST_DATA_DIR var. 
--- cmake/Templates/caffe_config.h.in | 15 ++++----------- include/caffe/test/test_caffe_main.hpp | 3 +-- src/caffe/test/test_gradient_based_solver.cpp | 2 +- src/caffe/test/test_hdf5_output_layer.cpp | 3 +-- src/caffe/test/test_hdf5data_layer.cpp | 3 +-- 5 files changed, 8 insertions(+), 18 deletions(-) diff --git a/cmake/Templates/caffe_config.h.in b/cmake/Templates/caffe_config.h.in index 45465b98305..2080c63df36 100644 --- a/cmake/Templates/caffe_config.h.in +++ b/cmake/Templates/caffe_config.h.in @@ -4,16 +4,9 @@ /* Binaries directory */ #define BINARY_FOLDER "${PROJECT_BINARY_DIR}" +/* This is an absolute path so that we can run test from any build + * directory */ +#define ABS_TEST_DATA_DIR "${PROJECT_SOURCE_DIR}/src/caffe/test/test_data/" + /* Test device */ #define CUDA_TEST_DEVICE ${CUDA_TEST_DEVICE} - -/* Temporary (TODO: remove) */ -#if 1 - #define CMAKE_SOURCE_DIR SOURCE_FOLDER "/src/" - #define EXAMPLES_SOURCE_DIR BINARY_FOLDER "/examples/" - #define CMAKE_EXT ".gen.cmake" -#else - #define CMAKE_SOURCE_DIR "src/" - #define EXAMPLES_SOURCE_DIR "examples/" - #define CMAKE_EXT "" -#endif diff --git a/include/caffe/test/test_caffe_main.hpp b/include/caffe/test/test_caffe_main.hpp index fc156091476..294f7e5011a 100644 --- a/include/caffe/test/test_caffe_main.hpp +++ b/include/caffe/test/test_caffe_main.hpp @@ -18,9 +18,8 @@ using std::endl; #include "caffe_config.h" #else #define CUDA_TEST_DEVICE -1 - #define CMAKE_SOURCE_DIR "src/" #define EXAMPLES_SOURCE_DIR "examples/" - #define CMAKE_EXT "" + #define ABS_TEST_DATA_DIR "src/caffe/test/test_data" #endif int main(int argc, char** argv); diff --git a/src/caffe/test/test_gradient_based_solver.cpp b/src/caffe/test/test_gradient_based_solver.cpp index 6ad0d8f6544..465140f28a6 100644 --- a/src/caffe/test/test_gradient_based_solver.cpp +++ b/src/caffe/test/test_gradient_based_solver.cpp @@ -28,7 +28,7 @@ class GradientBasedSolverTest : public MultiDeviceTest { seed_(1701), num_(4), channels_(3), height_(10), 
width_(10), share_(false) { input_file_ = new string( - CMAKE_SOURCE_DIR "caffe/test/test_data/solver_data_list.txt" CMAKE_EXT); + ABS_TEST_DATA_DIR "/solver_data_list.txt"); } ~GradientBasedSolverTest() { delete input_file_; diff --git a/src/caffe/test/test_hdf5_output_layer.cpp b/src/caffe/test/test_hdf5_output_layer.cpp index 2bc2de1e647..f94dd57e7de 100644 --- a/src/caffe/test/test_hdf5_output_layer.cpp +++ b/src/caffe/test/test_hdf5_output_layer.cpp @@ -20,8 +20,7 @@ class HDF5OutputLayerTest : public MultiDeviceTest { protected: HDF5OutputLayerTest() - : input_file_name_( - CMAKE_SOURCE_DIR "caffe/test/test_data/sample_data.h5"), + : input_file_name_(ABS_TEST_DATA_DIR "/sample_data.h5"), blob_data_(new Blob()), blob_label_(new Blob()), num_(5), diff --git a/src/caffe/test/test_hdf5data_layer.cpp b/src/caffe/test/test_hdf5data_layer.cpp index 487f5176caf..3977c4866c7 100644 --- a/src/caffe/test/test_hdf5data_layer.cpp +++ b/src/caffe/test/test_hdf5data_layer.cpp @@ -30,8 +30,7 @@ class HDF5DataLayerTest : public MultiDeviceTest { blob_top_vec_.push_back(blob_top_label2_); // Check out generate_sample_data.py in the same directory. 
- filename = new string( - CMAKE_SOURCE_DIR "caffe/test/test_data/sample_data_list.txt" CMAKE_EXT); + filename = new string(ABS_TEST_DATA_DIR "/sample_data_list.txt"); LOG(INFO)<< "Using sample HDF5 data file " << filename; } From 8602a238a712d50ac5a2d7dffadee2f34d755e3f Mon Sep 17 00:00:00 2001 From: Cyprien Noel Date: Mon, 27 Mar 2017 11:33:06 -0700 Subject: [PATCH 025/144] Expose share_weights to python to allow running test nets --- python/caffe/_caffe.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp index be011699098..276f21f85a5 100644 --- a/python/caffe/_caffe.cpp +++ b/python/caffe/_caffe.cpp @@ -298,6 +298,10 @@ void Solver_add_nccl(Solver* solver #endif } +void share_weights(Solver* solver, Net* net) { + net->ShareTrainedLayersWith(solver->net().get()); +} + template class NetCallback: public Net::Callback { public: @@ -459,6 +463,7 @@ BOOST_PYTHON_MODULE(_caffe) { .def("step", &Solver::Step) .def("restore", &Solver::Restore) .def("snapshot", &Solver::Snapshot) + .def("share_weights", &share_weights) .add_property("param", bp::make_function(&Solver::param, bp::return_value_policy())); BP_REGISTER_SHARED_PTR_TO_PYTHON(Solver); From 850ffd8d1cf18cabe36eb269b63d693db2b167ef Mon Sep 17 00:00:00 2001 From: Cyprien Noel Date: Mon, 27 Mar 2017 13:15:18 -0700 Subject: [PATCH 026/144] Remove missed legacy parallel code --- include/caffe/layers/base_data_layer.hpp | 2 -- include/caffe/layers/data_layer.hpp | 2 -- include/caffe/layers/dummy_data_layer.hpp | 2 -- include/caffe/layers/hdf5_data_layer.hpp | 2 -- include/caffe/layers/hdf5_output_layer.hpp | 2 -- include/caffe/layers/input_layer.hpp | 2 -- include/caffe/layers/python_layer.hpp | 4 ---- src/caffe/proto/caffe.proto | 4 +--- 8 files changed, 1 insertion(+), 19 deletions(-) diff --git a/include/caffe/layers/base_data_layer.hpp b/include/caffe/layers/base_data_layer.hpp index 21d3ada50d0..c8b6998c8f2 100644 --- 
a/include/caffe/layers/base_data_layer.hpp +++ b/include/caffe/layers/base_data_layer.hpp @@ -26,8 +26,6 @@ class BaseDataLayer : public Layer { // This method may not be overridden except by the BasePrefetchingDataLayer. virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); - // Data layers should be shared by multiple solvers in parallel - virtual inline bool ShareInParallel() const { return true; } virtual void DataLayerSetUp(const vector*>& bottom, const vector*>& top) {} // Data layers have no bottoms, so reshaping is trivial. diff --git a/include/caffe/layers/data_layer.hpp b/include/caffe/layers/data_layer.hpp index dec58180976..667a4ae43a5 100644 --- a/include/caffe/layers/data_layer.hpp +++ b/include/caffe/layers/data_layer.hpp @@ -20,8 +20,6 @@ class DataLayer : public BasePrefetchingDataLayer { virtual ~DataLayer(); virtual void DataLayerSetUp(const vector*>& bottom, const vector*>& top); - // DataLayer uses DataReader instead for sharing for parallelism - virtual inline bool ShareInParallel() const { return false; } virtual inline const char* type() const { return "Data"; } virtual inline int ExactNumBottomBlobs() const { return 0; } virtual inline int MinTopBlobs() const { return 1; } diff --git a/include/caffe/layers/dummy_data_layer.hpp b/include/caffe/layers/dummy_data_layer.hpp index 4180f1d01e4..13a63d47ec4 100644 --- a/include/caffe/layers/dummy_data_layer.hpp +++ b/include/caffe/layers/dummy_data_layer.hpp @@ -22,8 +22,6 @@ class DummyDataLayer : public Layer { : Layer(param) {} virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); - // Data layers should be shared by multiple solvers in parallel - virtual inline bool ShareInParallel() const { return true; } // Data layers have no bottoms, so reshaping is trivial. 
virtual void Reshape(const vector*>& bottom, const vector*>& top) {} diff --git a/include/caffe/layers/hdf5_data_layer.hpp b/include/caffe/layers/hdf5_data_layer.hpp index 650a3fb0c87..601b36c6b89 100644 --- a/include/caffe/layers/hdf5_data_layer.hpp +++ b/include/caffe/layers/hdf5_data_layer.hpp @@ -27,8 +27,6 @@ class HDF5DataLayer : public Layer { virtual ~HDF5DataLayer(); virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); - // Data layers should be shared by multiple solvers in parallel - virtual inline bool ShareInParallel() const { return true; } // Data layers have no bottoms, so reshaping is trivial. virtual void Reshape(const vector*>& bottom, const vector*>& top) {} diff --git a/include/caffe/layers/hdf5_output_layer.hpp b/include/caffe/layers/hdf5_output_layer.hpp index 487d08fc06c..061e279d7a0 100644 --- a/include/caffe/layers/hdf5_output_layer.hpp +++ b/include/caffe/layers/hdf5_output_layer.hpp @@ -28,8 +28,6 @@ class HDF5OutputLayer : public Layer { virtual ~HDF5OutputLayer(); virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); - // Data layers should be shared by multiple solvers in parallel - virtual inline bool ShareInParallel() const { return true; } // Data layers have no bottoms, so reshaping is trivial. virtual void Reshape(const vector*>& bottom, const vector*>& top) {} diff --git a/include/caffe/layers/input_layer.hpp b/include/caffe/layers/input_layer.hpp index f4472678c69..0ffdc724894 100644 --- a/include/caffe/layers/input_layer.hpp +++ b/include/caffe/layers/input_layer.hpp @@ -22,8 +22,6 @@ class InputLayer : public Layer { : Layer(param) {} virtual void LayerSetUp(const vector*>& bottom, const vector*>& top); - // Data layers should be shared by multiple solvers in parallel - virtual inline bool ShareInParallel() const { return true; } // Data layers have no bottoms, so reshaping is trivial. 
virtual void Reshape(const vector*>& bottom, const vector*>& top) {} diff --git a/include/caffe/layers/python_layer.hpp b/include/caffe/layers/python_layer.hpp index 10c4bfd0250..1407d9217aa 100644 --- a/include/caffe/layers/python_layer.hpp +++ b/include/caffe/layers/python_layer.hpp @@ -34,10 +34,6 @@ class PythonLayer : public Layer { self_.attr("reshape")(bottom, top); } - virtual inline bool ShareInParallel() const { - return this->layer_param_.python_param().share_in_parallel(); - } - virtual inline const char* type() const { return "Python"; } protected: diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 02e0ddf57c1..8e528e8e083 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -937,9 +937,7 @@ message PythonParameter { // string, dictionary in Python dict format, JSON, etc. You may parse this // string in `setup` method and use it in `forward` and `backward`. optional string param_str = 3 [default = '']; - // Whether this PythonLayer is shared among worker solvers during data parallelism. - // If true, each worker solver sequentially run forward from this layer. - // This value should be set true if you are using it as a data layer. + // DEPRECATED optional bool share_in_parallel = 4 [default = false]; } From 9bd80b2f12649c6336b64c8ebcc2d1210755d1c7 Mon Sep 17 00:00:00 2001 From: Yuduo Wu Date: Wed, 29 Mar 2017 14:42:36 -0700 Subject: [PATCH 027/144] Fix typo in test_caffe_main.cpp: defice -> device --- src/caffe/test/test_caffe_main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/caffe/test/test_caffe_main.cpp b/src/caffe/test/test_caffe_main.cpp index 6473b74d0a6..8f333bd7105 100644 --- a/src/caffe/test/test_caffe_main.cpp +++ b/src/caffe/test/test_caffe_main.cpp @@ -15,7 +15,7 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); caffe::GlobalInit(&argc, &argv); #ifndef CPU_ONLY - // Before starting testing, let's first print out a few cuda defice info. 
+ // Before starting testing, let's first print out a few cuda device info. int device; cudaGetDeviceCount(&device); cout << "Cuda number of devices: " << device << endl; From a32114e6b2e098e2fdef47e397542b105eb58b66 Mon Sep 17 00:00:00 2001 From: Will Crichton Date: Fri, 31 Mar 2017 11:22:22 -0400 Subject: [PATCH 028/144] Fixed memory leaks in cudnn conv and relu --- src/caffe/layers/cudnn_conv_layer.cpp | 1 + src/caffe/layers/cudnn_relu_layer.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp index 1987fb096b0..efc9e04e8c0 100644 --- a/src/caffe/layers/cudnn_conv_layer.cpp +++ b/src/caffe/layers/cudnn_conv_layer.cpp @@ -252,6 +252,7 @@ CuDNNConvolutionLayer::~CuDNNConvolutionLayer() { } cudaFree(workspaceData); + delete [] workspace; delete [] stream_; delete [] handle_; delete [] fwd_algo_; diff --git a/src/caffe/layers/cudnn_relu_layer.cpp b/src/caffe/layers/cudnn_relu_layer.cpp index 795e0a9efb0..687c905763e 100644 --- a/src/caffe/layers/cudnn_relu_layer.cpp +++ b/src/caffe/layers/cudnn_relu_layer.cpp @@ -36,6 +36,7 @@ CuDNNReLULayer::~CuDNNReLULayer() { cudnnDestroyTensorDescriptor(this->bottom_desc_); cudnnDestroyTensorDescriptor(this->top_desc_); + cudnnDestroyActivationDescriptor(this->activ_desc_); cudnnDestroy(this->handle_); } From a2601eddf65bab54429244e350899b6d994f4f37 Mon Sep 17 00:00:00 2001 From: Luke Yeager Date: Fri, 31 Mar 2017 11:01:13 -0700 Subject: [PATCH 029/144] Revert "Fix Python net drawing script" This reverts commit db6cf0a728cad63c93b345f2203f3ad1f5d5c2f4. 
--- python/caffe/draw.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/caffe/draw.py b/python/caffe/draw.py index e4fd7aacce7..9eecf6d7b46 100644 --- a/python/caffe/draw.py +++ b/python/caffe/draw.py @@ -104,11 +104,11 @@ def get_layer_label(layer, rankdir): pooling_types_dict[layer.pooling_param.pool], layer.type, separator, - layer.pooling_param.kernel_size[0] if len(layer.pooling_param.kernel_size._values) else 1, + layer.pooling_param.kernel_size, separator, - layer.pooling_param.stride[0] if len(layer.pooling_param.stride._values) else 1, + layer.pooling_param.stride, separator, - layer.pooling_param.pad[0] if len(layer.pooling_param.pad._values) else 0) + layer.pooling_param.pad) else: node_label = '"%s%s(%s)"' % (layer.name, separator, layer.type) return node_label From 0096fe3d270a4833479076e18492de8b28564c80 Mon Sep 17 00:00:00 2001 From: Felix Abecassis Date: Fri, 31 Mar 2017 11:18:39 -0700 Subject: [PATCH 030/144] Add support for cuDNN v6 Support for cuDNN v4 and v5 is preserved. --- docs/installation.md | 4 ++-- include/caffe/util/cudnn.hpp | 10 ++++++++++ scripts/travis/install-deps.sh | 2 +- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 2e558027678..42f1d0ce09b 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -42,14 +42,14 @@ Optional dependencies: * [OpenCV](http://opencv.org/) >= 2.4 including 3.0 * IO libraries: `lmdb`, `leveldb` (note: leveldb requires `snappy`) -* cuDNN for GPU acceleration (v5) +* cuDNN for GPU acceleration (v6) Pycaffe and Matcaffe interfaces have their own natural needs. * For Python Caffe: `Python 2.7` or `Python 3.3+`, `numpy (>= 1.7)`, boost-provided `boost.python` * For MATLAB Caffe: MATLAB with the `mex` compiler. -**cuDNN Caffe**: for fastest operation Caffe is accelerated by drop-in integration of [NVIDIA cuDNN](https://developer.nvidia.com/cudnn). 
To speed up your Caffe models, install cuDNN then uncomment the `USE_CUDNN := 1` flag in `Makefile.config` when installing Caffe. Acceleration is automatic. The current version is cuDNN v5; older versions are supported in older Caffe. +**cuDNN Caffe**: for fastest operation Caffe is accelerated by drop-in integration of [NVIDIA cuDNN](https://developer.nvidia.com/cudnn). To speed up your Caffe models, install cuDNN then uncomment the `USE_CUDNN := 1` flag in `Makefile.config` when installing Caffe. Acceleration is automatic. The current version is cuDNN v6; older versions are supported in older Caffe. **CPU-only Caffe**: for cold-brewed CPU-only Caffe uncomment the `CPU_ONLY := 1` flag in `Makefile.config` to configure and build Caffe without CUDA. This is helpful for cloud or cluster deployment. diff --git a/include/caffe/util/cudnn.hpp b/include/caffe/util/cudnn.hpp index a7d8dbbad4c..498cfe385de 100644 --- a/include/caffe/util/cudnn.hpp +++ b/include/caffe/util/cudnn.hpp @@ -41,6 +41,10 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) { return "CUDNN_STATUS_NOT_SUPPORTED"; case CUDNN_STATUS_LICENSE_ERROR: return "CUDNN_STATUS_LICENSE_ERROR"; +#if CUDNN_VERSION_MIN(6, 0, 0) + case CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING: + return "CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING"; +#endif } return "Unknown cudnn status"; } @@ -109,8 +113,14 @@ template inline void setConvolutionDesc(cudnnConvolutionDescriptor_t* conv, cudnnTensorDescriptor_t bottom, cudnnFilterDescriptor_t filter, int pad_h, int pad_w, int stride_h, int stride_w) { +#if CUDNN_VERSION_MIN(6, 0, 0) CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv, + pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION, + dataType::type)); +#else + CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv, pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION)); +#endif } template diff --git a/scripts/travis/install-deps.sh b/scripts/travis/install-deps.sh index 1900b16df54..1593ed8b59a 
100755 --- a/scripts/travis/install-deps.sh +++ b/scripts/travis/install-deps.sh @@ -104,7 +104,7 @@ if $WITH_CUDA ; then ln -s /usr/local/cuda-$CUDA_VERSION /usr/local/cuda if $WITH_CUDNN ; then - apt-get install -y --no-install-recommends libcudnn5-dev + apt-get install -y --no-install-recommends libcudnn6-dev fi fi From 179dafdb1a930cf86ff0956618bf8411b8dcd90e Mon Sep 17 00:00:00 2001 From: Luke Yeager Date: Fri, 31 Mar 2017 11:24:56 -0700 Subject: [PATCH 031/144] Add test for caffe.draw.draw_net() --- python/caffe/test/test_draw.py | 33 +++++++++++++++++++++++++++ scripts/travis/install-deps.sh | 2 ++ scripts/travis/install-python-deps.sh | 1 + 3 files changed, 36 insertions(+) create mode 100644 python/caffe/test/test_draw.py diff --git a/python/caffe/test/test_draw.py b/python/caffe/test/test_draw.py new file mode 100644 index 00000000000..1634145ee9d --- /dev/null +++ b/python/caffe/test/test_draw.py @@ -0,0 +1,33 @@ +import os +import unittest + +from google import protobuf + +import caffe.draw +from caffe.proto import caffe_pb2 + +def getFilenames(): + """Yields files in the source tree which are Net prototxts.""" + result = [] + + root_dir = os.path.abspath(os.path.join( + os.path.dirname(__file__), '..', '..', '..')) + assert os.path.exists(root_dir) + + for dirname in ('models', 'examples'): + dirname = os.path.join(root_dir, dirname) + assert os.path.exists(dirname) + for cwd, _, filenames in os.walk(dirname): + for filename in filenames: + filename = os.path.join(cwd, filename) + if filename.endswith('.prototxt') and 'solver' not in filename: + yield os.path.join(dirname, filename) + + +class TestDraw(unittest.TestCase): + def test_draw_net(self): + for filename in getFilenames(): + net = caffe_pb2.NetParameter() + with open(filename) as infile: + protobuf.text_format.Merge(infile.read(), net) + caffe.draw.draw_net(net, 'LR') diff --git a/scripts/travis/install-deps.sh b/scripts/travis/install-deps.sh index 1900b16df54..59a9163d5fc 100755 --- 
a/scripts/travis/install-deps.sh +++ b/scripts/travis/install-deps.sh @@ -8,6 +8,7 @@ source $BASEDIR/defaults.sh apt-get -y update apt-get install -y --no-install-recommends \ build-essential \ + graphviz \ libboost-filesystem-dev \ libboost-python-dev \ libboost-system-dev \ @@ -31,6 +32,7 @@ if ! $WITH_PYTHON3 ; then python-dev \ python-numpy \ python-protobuf \ + python-pydot \ python-skimage else # Python3 diff --git a/scripts/travis/install-python-deps.sh b/scripts/travis/install-python-deps.sh index eeec302791f..910d35a93be 100755 --- a/scripts/travis/install-python-deps.sh +++ b/scripts/travis/install-python-deps.sh @@ -11,4 +11,5 @@ if ! $WITH_PYTHON3 ; then else # Python3 pip install --pre protobuf==3.0.0b3 + pip install pydot fi From 41e34c9061e9577c2b1dd56be65fd23ef26457fd Mon Sep 17 00:00:00 2001 From: Nitheesh Date: Tue, 4 Apr 2017 13:36:20 +0530 Subject: [PATCH 032/144] Minor fix for net drawing script --- python/caffe/draw.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/caffe/draw.py b/python/caffe/draw.py index 9eecf6d7b46..8411a41d1d4 100644 --- a/python/caffe/draw.py +++ b/python/caffe/draw.py @@ -91,11 +91,11 @@ def get_layer_label(layer, rankdir): separator, layer.type, separator, - layer.convolution_param.kernel_size[0] if len(layer.convolution_param.kernel_size._values) else 1, + layer.convolution_param.kernel_size[0] if len(layer.convolution_param.kernel_size) else 1, separator, - layer.convolution_param.stride[0] if len(layer.convolution_param.stride._values) else 1, + layer.convolution_param.stride[0] if len(layer.convolution_param.stride) else 1, separator, - layer.convolution_param.pad[0] if len(layer.convolution_param.pad._values) else 0) + layer.convolution_param.pad[0] if len(layer.convolution_param.pad) else 0) elif layer.type == 'Pooling': pooling_types_dict = get_pooling_types_dict() node_label = '"%s%s(%s %s)%skernel size: %d%sstride: %d%spad: %d"' %\ From 31bfe8fb498ea2e528da6463c9045b397992e028 
Mon Sep 17 00:00:00 2001 From: Nitheesh Date: Tue, 4 Apr 2017 13:40:31 +0530 Subject: [PATCH 033/144] Add main() for draw_net unittest, fix import errors --- python/caffe/test/test_draw.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/caffe/test/test_draw.py b/python/caffe/test/test_draw.py index 1634145ee9d..835bb5df010 100644 --- a/python/caffe/test/test_draw.py +++ b/python/caffe/test/test_draw.py @@ -1,7 +1,7 @@ import os import unittest -from google import protobuf +from google.protobuf import text_format import caffe.draw from caffe.proto import caffe_pb2 @@ -29,5 +29,9 @@ def test_draw_net(self): for filename in getFilenames(): net = caffe_pb2.NetParameter() with open(filename) as infile: - protobuf.text_format.Merge(infile.read(), net) + text_format.Merge(infile.read(), net) caffe.draw.draw_net(net, 'LR') + + +if __name__ == "__main__": + unittest.main() From 5f1ca848f8c9daa73f61f64413e15ab2cd6602e7 Mon Sep 17 00:00:00 2001 From: "Jonathan R. Williford" Date: Wed, 5 Apr 2017 10:03:31 +0000 Subject: [PATCH 034/144] Add example and small blurb about sigmoid layer. 
--- docs/tutorial/layers/sigmoid.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/tutorial/layers/sigmoid.md b/docs/tutorial/layers/sigmoid.md index 505318352c9..f18ac4b84ec 100644 --- a/docs/tutorial/layers/sigmoid.md +++ b/docs/tutorial/layers/sigmoid.md @@ -9,6 +9,16 @@ title: Sigmoid Layer * Header: [`./include/caffe/layers/sigmoid_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/sigmoid_layer.hpp) * CPU implementation: [`./src/caffe/layers/sigmoid_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/sigmoid_layer.cpp) * CUDA GPU implementation: [`./src/caffe/layers/sigmoid_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/sigmoid_layer.cu) +* Example (from [`./examples/mnist/mnist_autoencoder.prototxt`](https://github.com/BVLC/caffe/blob/master/examples/mnist/mnist_autoencoder.prototxt)): + + layer { + name: "encode1neuron" + bottom: "encode1" + top: "encode1neuron" + type: "Sigmoid" + } + +The `Sigmoid` layer computes `sigmoid(x)` for each element `x` in the bottom blob. 
## Parameters From ce7193c7385298825c8cabebd20f664f3f93f06a Mon Sep 17 00:00:00 2001 From: Guillaume Dumont Date: Sat, 8 Apr 2017 12:59:24 -0400 Subject: [PATCH 035/144] Removed repeated import Layer, get_solver --- python/caffe/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/caffe/__init__.py b/python/caffe/__init__.py index 43a0c49be63..80f51716d82 100644 --- a/python/caffe/__init__.py +++ b/python/caffe/__init__.py @@ -1,5 +1,5 @@ from .pycaffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, RMSPropSolver, AdaDeltaSolver, AdamSolver, NCCL, Timer -from ._caffe import init_log, log, set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed, solver_count, set_solver_count, solver_rank, set_solver_rank, set_multiprocess, Layer, get_solver +from ._caffe import init_log, log, set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed, solver_count, set_solver_count, solver_rank, set_solver_rank, set_multiprocess from ._caffe import __version__ from .proto.caffe_pb2 import TRAIN, TEST from .classifier import Classifier From b2a95fa7fcba2089b981eb30b47d9aeba2b89ce9 Mon Sep 17 00:00:00 2001 From: Bruno Bowden Date: Sat, 8 Apr 2017 15:54:04 -0700 Subject: [PATCH 036/144] Log shape dimensions for eltwise layer shape mismatch When layer shapes mismatch for the eltwise layer, caffe will fail a check but doesn't give any information on how the shapes mismatch. This logging information will make it easier to debug. Additionally this reorders the variables to CHECK(expected == actual), matching the JUnit convention. BEFORE: Check failed: bottom[i]->shape() == bottom[0]->shape() AFTER: Check failed: bottom[0]->shape() == bottom[i]->shape() bottom[0]: 1 4 (4), bottom[3]: 1 6 (6) NOTE: This removes use of CHECK_EQ in an earlier version of this PR, which caused a build warning due to include of glog/stl_logging.h. 
--- src/caffe/layers/eltwise_layer.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp index 21256166bfa..3d82b0e1cbf 100644 --- a/src/caffe/layers/eltwise_layer.cpp +++ b/src/caffe/layers/eltwise_layer.cpp @@ -31,7 +31,9 @@ template void EltwiseLayer::Reshape(const vector*>& bottom, const vector*>& top) { for (int i = 1; i < bottom.size(); ++i) { - CHECK(bottom[i]->shape() == bottom[0]->shape()); + CHECK(bottom[0]->shape() == bottom[i]->shape()) + << "bottom[0]: " << bottom[0]->shape_string() + << ", bottom[" << i << "]: " << bottom[i]->shape_string(); } top[0]->ReshapeLike(*bottom[0]); // If max operation, we will initialize the vector index part. From 51728d1532dbee2853acb89a8a9653e82219953b Mon Sep 17 00:00:00 2001 From: Cyprien Noel Date: Wed, 12 Apr 2017 01:42:59 -0700 Subject: [PATCH 037/144] Fix log parsing #5422 --- tools/extra/parse_log.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/extra/parse_log.sh b/tools/extra/parse_log.sh index 9892c897682..122eb9e6eed 100755 --- a/tools/extra/parse_log.sh +++ b/tools/extra/parse_log.sh @@ -39,7 +39,7 @@ rm aux.txt aux0.txt aux1.txt aux2.txt aux3.txt aux4.txt grep '] Solving ' $1 > aux.txt grep ', loss = ' $1 >> aux.txt grep 'Iteration ' aux.txt | sed 's/.*Iteration \([[:digit:]]*\).*/\1/g' > aux0.txt -grep ', loss = ' $1 | awk '{print $9}' > aux1.txt +grep ', loss = ' $1 | awk -F = '{print $2}' > aux1.txt grep ', lr = ' $1 | awk '{print $9}' > aux2.txt # Extracting elapsed seconds From bac59bed485dfa195600b5b12031401613fade05 Mon Sep 17 00:00:00 2001 From: Cyprien Noel Date: Wed, 12 Apr 2017 02:05:34 -0700 Subject: [PATCH 038/144] Allow using env vars for glog init from python --- python/caffe/_caffe.cpp | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp index 276f21f85a5..01b34b84190 100644 --- 
a/python/caffe/_caffe.cpp +++ b/python/caffe/_caffe.cpp @@ -51,14 +51,18 @@ const int NPY_DTYPE = NPY_FLOAT32; void set_mode_cpu() { Caffe::set_mode(Caffe::CPU); } void set_mode_gpu() { Caffe::set_mode(Caffe::GPU); } -void InitLog(int level) { - FLAGS_logtostderr = 1; - FLAGS_minloglevel = level; +void InitLog() { ::google::InitGoogleLogging(""); ::google::InstallFailureSignalHandler(); } -void InitLogInfo() { - InitLog(google::INFO); +void InitLogLevel(int level) { + FLAGS_minloglevel = level; + InitLog(); +} +void InitLogLevelPipe(int level, bool stderr) { + FLAGS_minloglevel = level; + FLAGS_logtostderr = stderr; + InitLog(); } void Log(const string& s) { LOG(INFO) << s; @@ -353,7 +357,8 @@ BOOST_PYTHON_MODULE(_caffe) { // Caffe utility functions bp::def("init_log", &InitLog); - bp::def("init_log", &InitLogInfo); + bp::def("init_log", &InitLogLevel); + bp::def("init_log", &InitLogLevelPipe); bp::def("log", &Log); bp::def("set_mode_cpu", &set_mode_cpu); bp::def("set_mode_gpu", &set_mode_gpu); From 35a7b87ad87457291dfc79bf8a7e7cf7ef278cbb Mon Sep 17 00:00:00 2001 From: Noiredd Date: Wed, 12 Apr 2017 11:59:06 +0200 Subject: [PATCH 039/144] fixes pycaffe forward() and backward() behavior for nets whose layer names do not match respective tops --- python/caffe/pycaffe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/caffe/pycaffe.py b/python/caffe/pycaffe.py index 63606591bb4..4a7b5a24c46 100644 --- a/python/caffe/pycaffe.py +++ b/python/caffe/pycaffe.py @@ -113,7 +113,7 @@ def _Net_forward(self, blobs=None, start=None, end=None, **kwargs): if end is not None: end_ind = list(self._layer_names).index(end) - outputs = set([end] + blobs) + outputs = set(self.top_names[end] + blobs) else: end_ind = len(self.layers) - 1 outputs = set(self.outputs + blobs) @@ -161,7 +161,7 @@ def _Net_backward(self, diffs=None, start=None, end=None, **kwargs): if end is not None: end_ind = list(self._layer_names).index(end) - outputs = set([end] + diffs) + 
outputs = set(self.bottom_names[end] + diffs) else: end_ind = 0 outputs = set(self.inputs + diffs) From 3a987960d6a08b179eb6c0c526b27ab761ea2d6e Mon Sep 17 00:00:00 2001 From: Kang Kim Date: Thu, 13 Apr 2017 15:23:26 +0900 Subject: [PATCH 040/144] remove redundant check in LSTMUnitLayer --- src/caffe/layers/lstm_unit_layer.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/caffe/layers/lstm_unit_layer.cpp b/src/caffe/layers/lstm_unit_layer.cpp index 277c031ad15..d1ab59c4bd1 100644 --- a/src/caffe/layers/lstm_unit_layer.cpp +++ b/src/caffe/layers/lstm_unit_layer.cpp @@ -31,7 +31,6 @@ void LSTMUnitLayer::Reshape(const vector*>& bottom, CHECK_EQ(num_instances, bottom[i]->shape(1)); } hidden_dim_ = bottom[0]->shape(2); - CHECK_EQ(num_instances, bottom[1]->shape(1)); CHECK_EQ(4 * hidden_dim_, bottom[1]->shape(2)); top[0]->ReshapeLike(*bottom[0]); top[1]->ReshapeLike(*bottom[0]); From 96870628698090813d92a9b1f8af9a8311469354 Mon Sep 17 00:00:00 2001 From: Guillaume Dumont Date: Thu, 13 Apr 2017 13:15:24 -0400 Subject: [PATCH 041/144] Bump boost version to 1.55 in CMake build --- cmake/Dependencies.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 02c81525bce..4a5bac471b4 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -5,7 +5,7 @@ set(Caffe_DEFINITIONS "") set(Caffe_COMPILE_OPTIONS "") # ---[ Boost -find_package(Boost 1.46 REQUIRED COMPONENTS system thread filesystem) +find_package(Boost 1.55 REQUIRED COMPONENTS system thread filesystem) list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${Boost_INCLUDE_DIRS}) list(APPEND Caffe_LINKER_LIBS PUBLIC ${Boost_LIBRARIES}) From 0c9cc62379e4061b58b0dfa257d79c2ecaeb2be8 Mon Sep 17 00:00:00 2001 From: Guillaume Dumont Date: Sat, 11 Mar 2017 20:12:40 -0500 Subject: [PATCH 042/144] Added support for python 3 and NCCL --- python/caffe/__init__.py | 2 +- python/caffe/_caffe.cpp | 32 +++++++++++++++++++++++++++++++- 
python/caffe/test/test_nccl.py | 19 +++++++++++++++++++ 3 files changed, 51 insertions(+), 2 deletions(-) create mode 100644 python/caffe/test/test_nccl.py diff --git a/python/caffe/__init__.py b/python/caffe/__init__.py index 80f51716d82..776945eec88 100644 --- a/python/caffe/__init__.py +++ b/python/caffe/__init__.py @@ -1,5 +1,5 @@ from .pycaffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, RMSPropSolver, AdaDeltaSolver, AdamSolver, NCCL, Timer -from ._caffe import init_log, log, set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed, solver_count, set_solver_count, solver_rank, set_solver_rank, set_multiprocess +from ._caffe import init_log, log, set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed, solver_count, set_solver_count, solver_rank, set_solver_rank, set_multiprocess, has_nccl from ._caffe import __version__ from .proto.caffe_pb2 import TRAIN, TEST from .classifier import Classifier diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp index 01b34b84190..7fc06c08f73 100644 --- a/python/caffe/_caffe.cpp +++ b/python/caffe/_caffe.cpp @@ -347,6 +347,35 @@ class NCCL { }; #endif +bool HasNCCL() { +#ifdef USE_NCCL + return true; +#else + return false; +#endif +} + +#ifdef USE_NCCL +bp::object NCCL_New_Uid() { + std::string uid = NCCL::new_uid(); +#if PY_MAJOR_VERSION >= 3 + // Convert std::string to bytes so that Python does not + // try to decode the string using the current locale. + + // Since boost 1.53 boost.python will convert str and bytes + // to std::string but will convert std::string to str. Here we + // force a bytes object to be returned. When this object + // is passed back to the NCCL constructor boost.python will + // correctly convert the bytes to std::string automatically + PyObject* py_uid = PyBytes_FromString(uid.c_str()); + return bp::object(bp::handle<>(py_uid)); +#else + // automatic conversion is correct for python 2. 
+ return uid; +#endif +} +#endif + BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(SolveOverloads, Solve, 0, 1); BOOST_PYTHON_MODULE(_caffe) { @@ -360,6 +389,7 @@ BOOST_PYTHON_MODULE(_caffe) { bp::def("init_log", &InitLogLevel); bp::def("init_log", &InitLogLevelPipe); bp::def("log", &Log); + bp::def("has_nccl", &HasNCCL); bp::def("set_mode_cpu", &set_mode_cpu); bp::def("set_mode_gpu", &set_mode_gpu); bp::def("set_random_seed", &set_random_seed); @@ -518,7 +548,7 @@ BOOST_PYTHON_MODULE(_caffe) { boost::noncopyable>("NCCL", bp::init >, const string&>()) #ifdef USE_NCCL - .def("new_uid", &NCCL::new_uid).staticmethod("new_uid") + .def("new_uid", NCCL_New_Uid).staticmethod("new_uid") .def("bcast", &NCCL::Broadcast) #endif /* NOLINT_NEXT_LINE(whitespace/semicolon) */ diff --git a/python/caffe/test/test_nccl.py b/python/caffe/test/test_nccl.py new file mode 100644 index 00000000000..127a9337040 --- /dev/null +++ b/python/caffe/test/test_nccl.py @@ -0,0 +1,19 @@ +import sys +import unittest + +import caffe + + +class TestNCCL(unittest.TestCase): + + def test_newuid(self): + """ + Test that NCCL uids are of the proper type + according to python version + """ + if caffe.has_nccl(): + uid = caffe.NCCL.new_uid() + if sys.version_info.major >= 3: + self.assertTrue(isinstance(uid, bytes)) + else: + self.assertTrue(isinstance(uid, str)) From e98023af4a570e3105486b661e4c4d1855c0dd79 Mon Sep 17 00:00:00 2001 From: Patrick Follmann Date: Thu, 29 Dec 2016 14:37:21 +0100 Subject: [PATCH 043/144] Add GPU sqrt functions --- include/caffe/util/math_functions.hpp | 3 +++ src/caffe/util/math_functions.cu | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index 37abce5eccc..60a8404a044 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -214,6 +214,9 @@ void caffe_gpu_log(const int n, const Dtype* a, Dtype* y); template void caffe_gpu_powx(const int n, 
const Dtype* a, const Dtype b, Dtype* y); +template +void caffe_gpu_sqrt(const int n, const Dtype* a, Dtype* y); + // caffe_gpu_rng_uniform with two arguments generates integers in the range // [0, UINT_MAX]. void caffe_gpu_rng_uniform(const int n, unsigned int* r); diff --git a/src/caffe/util/math_functions.cu b/src/caffe/util/math_functions.cu index 6d001026082..314e6ba0f63 100644 --- a/src/caffe/util/math_functions.cu +++ b/src/caffe/util/math_functions.cu @@ -387,6 +387,27 @@ void caffe_gpu_powx(const int N, const double* a, N, a, alpha, y); } +template +__global__ void sqrt_kernel(const int n, const Dtype* a, Dtype* y) { + CUDA_KERNEL_LOOP(index, n) { + y[index] = sqrt(a[index]); + } +} + +template <> +void caffe_gpu_sqrt(const int N, const float* a, float* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + sqrt_kernel<<>>( + N, a, y); +} + +template <> +void caffe_gpu_sqrt(const int N, const double* a, double* y) { + // NOLINT_NEXT_LINE(whitespace/operators) + sqrt_kernel<<>>( + N, a, y); +} + DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index]) - (x[index] < Dtype(0))); DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index])); From e93fcd267582888f960ca48d6e0c2e719d4ea09b Mon Sep 17 00:00:00 2001 From: Patrick Follmann Date: Thu, 29 Dec 2016 14:46:16 +0100 Subject: [PATCH 044/144] GPU BatchNormLayer: replace powx with mul and sqrt --- src/caffe/layers/batch_norm_layer.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/caffe/layers/batch_norm_layer.cu b/src/caffe/layers/batch_norm_layer.cu index c21713c81d9..a35e778e2f1 100644 --- a/src/caffe/layers/batch_norm_layer.cu +++ b/src/caffe/layers/batch_norm_layer.cu @@ -48,14 +48,14 @@ void BatchNormLayer::Forward_gpu(const vector*>& bottom, if (!use_global_stats_) { // compute variance using var(X) = E((X-EX)^2) - caffe_gpu_powx(top[0]->count(), top_data, Dtype(2), + caffe_gpu_mul(top[0]->count(), top[0]->gpu_data(), top[0]->gpu_data(), 
temp_.mutable_gpu_data()); // (X-EX)^2 caffe_gpu_gemv(CblasNoTrans, channels_ * num, spatial_dim, 1. / (num * spatial_dim), temp_.gpu_data(), spatial_sum_multiplier_.gpu_data(), 0., num_by_chans_.mutable_gpu_data()); - caffe_gpu_gemv(CblasTrans, num, channels_, 1., - num_by_chans_.gpu_data(), batch_sum_multiplier_.gpu_data(), 0., + caffe_gpu_gemv(CblasTrans, num, channels_, Dtype(1.), + num_by_chans_.gpu_data(), batch_sum_multiplier_.gpu_data(), Dtype(0.), variance_.mutable_gpu_data()); // E((X_EX)^2) // compute and save moving average @@ -72,7 +72,7 @@ void BatchNormLayer::Forward_gpu(const vector*>& bottom, // normalize variance caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data()); - caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5), + caffe_gpu_sqrt(variance_.count(), variance_.gpu_data(), variance_.mutable_gpu_data()); // replicate variance to input size From ab3398832964c1ff1bf6b78501e4e43a11f282a1 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Thu, 13 Apr 2017 13:25:16 -0700 Subject: [PATCH 045/144] Add CPU sqrt functions --- include/caffe/util/math_functions.hpp | 3 +++ src/caffe/util/math_functions.cpp | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp index 60a8404a044..e549120a933 100644 --- a/include/caffe/util/math_functions.hpp +++ b/include/caffe/util/math_functions.hpp @@ -52,6 +52,9 @@ void caffe_scal(const int N, const Dtype alpha, Dtype *X); template void caffe_sqr(const int N, const Dtype* a, Dtype* y); +template +void caffe_sqrt(const int N, const Dtype* a, Dtype* y); + template void caffe_add(const int N, const Dtype* a, const Dtype* b, Dtype* y); diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp index 71c02274a75..59625bc05ce 100644 --- a/src/caffe/util/math_functions.cpp +++ b/src/caffe/util/math_functions.cpp @@ -196,6 +196,16 @@ void caffe_sqr(const int n, const double* a, 
double* y) { vdSqr(n, a, y); } +template <> +void caffe_sqrt(const int n, const float* a, float* y) { + vsSqrt(n, a, y); +} + +template <> +void caffe_sqrt(const int n, const double* a, double* y) { + vdSqrt(n, a, y); +} + template <> void caffe_exp(const int n, const float* a, float* y) { vsExp(n, a, y); From 1c15d94f7da736945450e6ed321077f3045445b1 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Thu, 13 Apr 2017 13:26:16 -0700 Subject: [PATCH 046/144] CPU BatchNormLayer: replace powx with sqr and sqrt --- src/caffe/layers/batch_norm_layer.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/caffe/layers/batch_norm_layer.cpp b/src/caffe/layers/batch_norm_layer.cpp index 0a08ed4cb07..c6a1d5b1b2c 100644 --- a/src/caffe/layers/batch_norm_layer.cpp +++ b/src/caffe/layers/batch_norm_layer.cpp @@ -124,8 +124,8 @@ void BatchNormLayer::Forward_cpu(const vector*>& bottom, if (!use_global_stats_) { // compute variance using var(X) = E((X-EX)^2) - caffe_powx(top[0]->count(), top_data, Dtype(2), - temp_.mutable_cpu_data()); // (X-EX)^2 + caffe_sqr(top[0]->count(), top_data, + temp_.mutable_cpu_data()); // (X-EX)^2 caffe_cpu_gemv(CblasNoTrans, channels_ * num, spatial_dim, 1. 
/ (num * spatial_dim), temp_.cpu_data(), spatial_sum_multiplier_.cpu_data(), 0., @@ -148,7 +148,7 @@ void BatchNormLayer::Forward_cpu(const vector*>& bottom, // normalize variance caffe_add_scalar(variance_.count(), eps_, variance_.mutable_cpu_data()); - caffe_powx(variance_.count(), variance_.cpu_data(), Dtype(0.5), + caffe_sqrt(variance_.count(), variance_.cpu_data(), variance_.mutable_cpu_data()); // replicate variance to input size From 3d5bed06a9b6b8a5dfd3db8da33f2fa3bc9a1213 Mon Sep 17 00:00:00 2001 From: Jeff Donahue Date: Thu, 13 Apr 2017 14:15:16 -0700 Subject: [PATCH 047/144] fix: add non-MKL sqrt (should have been included in ab33988) --- include/caffe/util/mkl_alternate.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/caffe/util/mkl_alternate.hpp b/include/caffe/util/mkl_alternate.hpp index 79b2c32de94..8c2294c7c86 100644 --- a/include/caffe/util/mkl_alternate.hpp +++ b/include/caffe/util/mkl_alternate.hpp @@ -37,6 +37,7 @@ extern "C" { } DEFINE_VSL_UNARY_FUNC(Sqr, y[i] = a[i] * a[i]) +DEFINE_VSL_UNARY_FUNC(Sqrt, y[i] = sqrt(a[i])) DEFINE_VSL_UNARY_FUNC(Exp, y[i] = exp(a[i])) DEFINE_VSL_UNARY_FUNC(Ln, y[i] = log(a[i])) DEFINE_VSL_UNARY_FUNC(Abs, y[i] = fabs(a[i])) From 2ec19b6177111526d2df362d29d0e08aa5645a22 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Thu, 13 Apr 2017 14:22:30 -0700 Subject: [PATCH 048/144] deprecate WindowData layer type --- include/caffe/layers/window_data_layer.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/caffe/layers/window_data_layer.hpp b/include/caffe/layers/window_data_layer.hpp index 35f41b80e63..b9b66b7cf1d 100644 --- a/include/caffe/layers/window_data_layer.hpp +++ b/include/caffe/layers/window_data_layer.hpp @@ -16,7 +16,8 @@ namespace caffe { /** * @brief Provides data to the Net from windows of images files, specified - * by a window data file. + * by a window data file. This layer is *DEPRECATED* and only kept for + * archival purposes for use by the original R-CNN. 
* * TODO(dox): thorough documentation for Forward and proto params. */ From e7163f650885b9f7b9cae1c3253aa97d9fe30d86 Mon Sep 17 00:00:00 2001 From: Guillaume Dumont Date: Thu, 13 Apr 2017 20:32:40 -0400 Subject: [PATCH 049/144] Updated Travis boost dependencies --- scripts/travis/install-deps.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/travis/install-deps.sh b/scripts/travis/install-deps.sh index 2fa2a74a486..dac5d2f9d37 100755 --- a/scripts/travis/install-deps.sh +++ b/scripts/travis/install-deps.sh @@ -9,10 +9,10 @@ apt-get -y update apt-get install -y --no-install-recommends \ build-essential \ graphviz \ - libboost-filesystem-dev \ - libboost-python-dev \ - libboost-system-dev \ - libboost-thread-dev \ + libboost-filesystem1.55-dev \ + libboost-python1.55-dev \ + libboost-system1.55-dev \ + libboost-thread1.55-dev \ libgflags-dev \ libgoogle-glog-dev \ libhdf5-serial-dev \ From 8bc82c635914676d51ecd2849cc69f6fb6042496 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Thu, 13 Apr 2017 19:14:57 -0700 Subject: [PATCH 050/144] [examples] switch cifar-10 back to proto instead of h5 serialization (it's more common) --- examples/cifar10/cifar10_quick_solver.prototxt | 1 - examples/cifar10/train_full.sh | 4 ++-- examples/cifar10/train_quick.sh | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/cifar10/cifar10_quick_solver.prototxt b/examples/cifar10/cifar10_quick_solver.prototxt index 5de276f722f..14b4401ba16 100644 --- a/examples/cifar10/cifar10_quick_solver.prototxt +++ b/examples/cifar10/cifar10_quick_solver.prototxt @@ -20,7 +20,6 @@ display: 100 max_iter: 4000 # snapshot intermediate results snapshot: 4000 -snapshot_format: HDF5 snapshot_prefix: "examples/cifar10/cifar10_quick" # solver mode: CPU or GPU solver_mode: GPU diff --git a/examples/cifar10/train_full.sh b/examples/cifar10/train_full.sh index 06ecc2dccb0..fe46e60d795 100755 --- a/examples/cifar10/train_full.sh +++ 
b/examples/cifar10/train_full.sh @@ -9,9 +9,9 @@ $TOOLS/caffe train \ # reduce learning rate by factor of 10 $TOOLS/caffe train \ --solver=examples/cifar10/cifar10_full_solver_lr1.prototxt \ - --snapshot=examples/cifar10/cifar10_full_iter_60000.solverstate.h5 $@ + --snapshot=examples/cifar10/cifar10_full_iter_60000.solverstate $@ # reduce learning rate by factor of 10 $TOOLS/caffe train \ --solver=examples/cifar10/cifar10_full_solver_lr2.prototxt \ - --snapshot=examples/cifar10/cifar10_full_iter_65000.solverstate.h5 $@ + --snapshot=examples/cifar10/cifar10_full_iter_65000.solverstate $@ diff --git a/examples/cifar10/train_quick.sh b/examples/cifar10/train_quick.sh index d2b875340ee..257479e0d77 100755 --- a/examples/cifar10/train_quick.sh +++ b/examples/cifar10/train_quick.sh @@ -9,4 +9,4 @@ $TOOLS/caffe train \ # reduce learning rate by factor of 10 after 8 epochs $TOOLS/caffe train \ --solver=examples/cifar10/cifar10_quick_solver_lr1.prototxt \ - --snapshot=examples/cifar10/cifar10_quick_iter_4000.solverstate.h5 $@ + --snapshot=examples/cifar10/cifar10_quick_iter_4000.solverstate $@ From aa29eba26b781349174cb856b6ea96360ebbb3f2 Mon Sep 17 00:00:00 2001 From: Guillaume Dumont Date: Thu, 13 Apr 2017 22:37:13 -0400 Subject: [PATCH 051/144] Explicit std::string to bp::object conversion --- python/caffe/_caffe.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp index 7fc06c08f73..d7f43fff62d 100644 --- a/python/caffe/_caffe.cpp +++ b/python/caffe/_caffe.cpp @@ -371,7 +371,7 @@ bp::object NCCL_New_Uid() { return bp::object(bp::handle<>(py_uid)); #else // automatic conversion is correct for python 2. - return uid; + return bp::object(uid); #endif } #endif From c19c9602d031274ce77eb6a94ce2a9e8d843d98f Mon Sep 17 00:00:00 2001 From: Carl Doersch Date: Tue, 25 Aug 2015 11:26:14 -0700 Subject: [PATCH 052/144] Test for python forward and backward with start and end layer. 
--- python/caffe/test/test_net.py | 45 +++++++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/python/caffe/test/test_net.py b/python/caffe/test/test_net.py index 24391cc50c4..afd27690981 100644 --- a/python/caffe/test/test_net.py +++ b/python/caffe/test/test_net.py @@ -25,11 +25,11 @@ def simple_net_file(num_output): bias_filler { type: 'constant' value: 2 } } param { decay_mult: 1 } param { decay_mult: 0 } } - layer { type: 'InnerProduct' name: 'ip' bottom: 'conv' top: 'ip' + layer { type: 'InnerProduct' name: 'ip' bottom: 'conv' top: 'ip_blob' inner_product_param { num_output: """ + str(num_output) + """ weight_filler { type: 'gaussian' std: 2.5 } bias_filler { type: 'constant' value: -3 } } } - layer { type: 'SoftmaxWithLoss' name: 'loss' bottom: 'ip' bottom: 'label' + layer { type: 'SoftmaxWithLoss' name: 'loss' bottom: 'ip_blob' bottom: 'label' top: 'loss' }""") f.close() return f.name @@ -71,6 +71,43 @@ def test_forward_backward(self): self.net.forward() self.net.backward() + def test_forward_start_end(self): + conv_blob=self.net.blobs['conv']; + ip_blob=self.net.blobs['ip_blob']; + sample_data=np.random.uniform(size=conv_blob.data.shape); + sample_data=sample_data.astype(np.float32); + conv_blob.data[:]=sample_data; + forward_blob=self.net.forward(start='ip',end='ip'); + self.assertIn('ip_blob',forward_blob); + + manual_forward=[]; + for i in range(0,conv_blob.data.shape[0]): + dot=np.dot(self.net.params['ip'][0].data, + conv_blob.data[i].reshape(-1)); + manual_forward.append(dot+self.net.params['ip'][1].data); + manual_forward=np.array(manual_forward); + + np.testing.assert_allclose(ip_blob.data,manual_forward,rtol=1e-3); + + def test_backward_start_end(self): + conv_blob=self.net.blobs['conv']; + ip_blob=self.net.blobs['ip_blob']; + sample_data=np.random.uniform(size=ip_blob.data.shape) + sample_data=sample_data.astype(np.float32); + ip_blob.diff[:]=sample_data; + 
backward_blob=self.net.backward(start='ip',end='ip'); + self.assertIn('conv',backward_blob); + + manual_backward=[]; + for i in range(0,conv_blob.data.shape[0]): + dot=np.dot(self.net.params['ip'][0].data.transpose(), + sample_data[i].reshape(-1)); + manual_backward.append(dot); + manual_backward=np.array(manual_backward); + manual_backward=manual_backward.reshape(conv_blob.data.shape); + + np.testing.assert_allclose(conv_blob.diff,manual_backward,rtol=1e-3); + def test_clear_param_diffs(self): # Run a forward/backward step to have non-zero diffs self.net.forward() @@ -90,13 +127,13 @@ def test_top_bottom_names(self): self.assertEqual(self.net.top_names, OrderedDict([('data', ['data', 'label']), ('conv', ['conv']), - ('ip', ['ip']), + ('ip', ['ip_blob']), ('loss', ['loss'])])) self.assertEqual(self.net.bottom_names, OrderedDict([('data', []), ('conv', ['data']), ('ip', ['conv']), - ('loss', ['ip', 'label'])])) + ('loss', ['ip_blob', 'label'])])) def test_save_and_read(self): f = tempfile.NamedTemporaryFile(mode='w+', delete=False) From 451944333510e1ea9b0bdac11e4ec201e5284714 Mon Sep 17 00:00:00 2001 From: jgyllinsky Date: Fri, 14 Apr 2017 03:11:59 -0400 Subject: [PATCH 053/144] [docs] added apt command to install OpenBLAS (#4718) --- docs/install_apt.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/install_apt.md b/docs/install_apt.md index bc1566b0be9..ee2cd287701 100644 --- a/docs/install_apt.md +++ b/docs/install_apt.md @@ -14,7 +14,7 @@ The NVIDIA package tends to follow more recent library and driver versions, but If installing from packages, install the library and latest driver separately; the driver bundled with the library is usually out-of-date. This can be skipped for CPU-only installation. -**BLAS**: install ATLAS by `sudo apt-get install libatlas-base-dev` or install OpenBLAS or MKL for better CPU performance. 
+**BLAS**: install ATLAS by `sudo apt-get install libatlas-base-dev` or install OpenBLAS by `sudo apt-get install libopenblas-dev` or MKL for better CPU performance. **Python** (optional): if you use the default Python you will need to `sudo apt-get install` the `python-dev` package to have the Python headers for building the pycaffe interface. From 80073497045d3101492a28a8a2c87dff65d64ff4 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Fri, 14 Apr 2017 10:17:52 -0700 Subject: [PATCH 054/144] fix lint errors that snuck in by #4566 --- src/caffe/test/test_gradient_based_solver.cpp | 12 ++++++++---- src/caffe/test/test_neuron_layer.cpp | 9 ++++++--- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/caffe/test/test_gradient_based_solver.cpp b/src/caffe/test/test_gradient_based_solver.cpp index 05cab909798..f4395f5311c 100644 --- a/src/caffe/test/test_gradient_based_solver.cpp +++ b/src/caffe/test/test_gradient_based_solver.cpp @@ -558,9 +558,11 @@ class GradientBasedSolverTest : public MultiDeviceTest { const vector*>& params = solver_->net()->learnable_params(); for (int i = 0; i < params.size(); ++i) { for (int j = 0; j < params[i]->count(); ++j) { - EXPECT_FLOAT_EQ(param_copies[i]->cpu_data()[j], params[i]->cpu_data()[j]) + EXPECT_FLOAT_EQ(param_copies[i]->cpu_data()[j], + params[i]->cpu_data()[j]) << "param " << i << " data differed at dim " << j; - EXPECT_FLOAT_EQ(param_copies[i]->cpu_diff()[j], params[i]->cpu_diff()[j]) + EXPECT_FLOAT_EQ(param_copies[i]->cpu_diff()[j], + params[i]->cpu_diff()[j]) << "param " << i << " diff differed at dim " << j; } } @@ -569,9 +571,11 @@ class GradientBasedSolverTest : public MultiDeviceTest { const vector > >& history = solver_->history(); for (int i = 0; i < history.size(); ++i) { for (int j = 0; j < history[i]->count(); ++j) { - EXPECT_FLOAT_EQ(history_copies[i]->cpu_data()[j], history[i]->cpu_data()[j]) + EXPECT_FLOAT_EQ(history_copies[i]->cpu_data()[j], + history[i]->cpu_data()[j]) << "history blob " << 
i << " data differed at dim " << j; - EXPECT_FLOAT_EQ(history_copies[i]->cpu_diff()[j], history[i]->cpu_diff()[j]) + EXPECT_FLOAT_EQ(history_copies[i]->cpu_diff()[j], + history[i]->cpu_diff()[j]) << "history blob " << i << " diff differed at dim " << j; } } diff --git a/src/caffe/test/test_neuron_layer.cpp b/src/caffe/test/test_neuron_layer.cpp index 57bd47b3a2e..180871a29ee 100644 --- a/src/caffe/test/test_neuron_layer.cpp +++ b/src/caffe/test/test_neuron_layer.cpp @@ -791,13 +791,16 @@ TYPED_TEST(NeuronLayerTest, TestPReLUInPlace) { ip2.Backward(blob_middle_vec_2, propagate_down, blob_bottom_vec_2); // Check numbers for (int s = 0; s < blob_bottom_2->count(); ++s) { - EXPECT_FLOAT_EQ(this->blob_bottom_->cpu_diff()[s], blob_bottom_2->cpu_diff()[s]); + EXPECT_FLOAT_EQ(this->blob_bottom_->cpu_diff()[s], + blob_bottom_2->cpu_diff()[s]); } for (int s = 0; s < ip.blobs()[0]->count(); ++s) { - EXPECT_FLOAT_EQ(ip.blobs()[0]->cpu_diff()[s], ip2.blobs()[0]->cpu_diff()[s]); + EXPECT_FLOAT_EQ(ip.blobs()[0]->cpu_diff()[s], + ip2.blobs()[0]->cpu_diff()[s]); } for (int s = 0; s < ip.blobs()[1]->count(); ++s) { - EXPECT_FLOAT_EQ(ip.blobs()[1]->cpu_diff()[s], ip2.blobs()[1]->cpu_diff()[s]); + EXPECT_FLOAT_EQ(ip.blobs()[1]->cpu_diff()[s], + ip2.blobs()[1]->cpu_diff()[s]); } for (int s = 0; s < prelu.blobs()[0]->count(); ++s) { EXPECT_FLOAT_EQ(prelu.blobs()[0]->cpu_diff()[s], From 4db619aec9cd384b11a1c55fac257d14b704bb15 Mon Sep 17 00:00:00 2001 From: Cyprien Noel Date: Fri, 14 Apr 2017 12:30:50 -0700 Subject: [PATCH 055/144] Docker update to cuDNN 6 --- docker/cpu/Dockerfile | 3 ++- docker/gpu/Dockerfile | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/docker/cpu/Dockerfile b/docker/cpu/Dockerfile index af6c03c6589..67e2e61bd57 100644 --- a/docker/cpu/Dockerfile +++ b/docker/cpu/Dockerfile @@ -28,7 +28,8 @@ ENV CAFFE_ROOT=/opt/caffe WORKDIR $CAFFE_ROOT # FIXME: use ARG instead of ENV once DockerHub supports this -ENV CLONE_TAG=rc4 +# 
https://github.com/docker/hub-feedback/issues/460 +ENV CLONE_TAG=1.0 RUN git clone -b ${CLONE_TAG} --depth 1 https://github.com/BVLC/caffe.git . && \ pip install --upgrade pip && \ diff --git a/docker/gpu/Dockerfile b/docker/gpu/Dockerfile index 0785b10f1e7..dcdbdf326fb 100644 --- a/docker/gpu/Dockerfile +++ b/docker/gpu/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04 +FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04 LABEL maintainer caffe-maint@googlegroups.com RUN apt-get update && apt-get install -y --no-install-recommends \ @@ -28,7 +28,8 @@ ENV CAFFE_ROOT=/opt/caffe WORKDIR $CAFFE_ROOT # FIXME: use ARG instead of ENV once DockerHub supports this -ENV CLONE_TAG=rc4 +# https://github.com/docker/hub-feedback/issues/460 +ENV CLONE_TAG=1.0 RUN git clone -b ${CLONE_TAG} --depth 1 https://github.com/BVLC/caffe.git . && \ pip install --upgrade pip && \ From 44da39f662a24de746fa83b92bd670fe41b3a7da Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Fri, 14 Apr 2017 12:36:41 -0700 Subject: [PATCH 056/144] BVLC -> BAIR Berkeley AI Research (BAIR) is the the successor to the Berkeley Vision and Learning Center (BVLC). --- CONTRIBUTORS.md | 2 +- README.md | 6 +++--- docs/_layouts/default.html | 2 +- docs/development.md | 4 ++-- docs/index.md | 10 +++++----- docs/model_zoo.md | 18 +++++++++--------- docs/multigpu.md | 4 ++-- docs/performance_hardware.md | 2 +- docs/tutorial/interfaces.md | 4 ++-- examples/finetune_flickr_style/readme.md | 2 +- models/bvlc_alexnet/readme.md | 2 +- models/bvlc_googlenet/readme.md | 2 +- models/bvlc_reference_caffenet/readme.md | 2 +- models/bvlc_reference_rcnn_ilsvrc13/readme.md | 2 +- 14 files changed, 31 insertions(+), 31 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 8db66ea82c6..3fd767812e9 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -1,6 +1,6 @@ # Contributors -Caffe is developed by a core set of BVLC members and the open-source community. 
+Caffe is developed by a core set of BAIR members and the open-source community. We thank all of our [contributors](https://github.com/BVLC/caffe/graphs/contributors)! diff --git a/README.md b/README.md index 44b9e62c157..0ae3616b4a6 100644 --- a/README.md +++ b/README.md @@ -4,13 +4,13 @@ [![License](https://img.shields.io/badge/license-BSD-blue.svg)](LICENSE) Caffe is a deep learning framework made with expression, speed, and modularity in mind. -It is developed by the Berkeley Vision and Learning Center ([BVLC](http://bvlc.eecs.berkeley.edu)) and community contributors. +It is developed by Berkeley AI Research ([BAIR](http://bair.berkeley.edu))/The Berkeley Vision and Learning Center (BVLC) and community contributors. Check out the [project site](http://caffe.berkeleyvision.org) for all the details like - [DIY Deep Learning for Vision with Caffe](https://docs.google.com/presentation/d/1UeKXVgRvvxg9OUdh_UiC5G71UMscNPlvArsWER41PsU/edit#slide=id.p) - [Tutorial Documentation](http://caffe.berkeleyvision.org/tutorial/) -- [BVLC reference models](http://caffe.berkeleyvision.org/model_zoo.html) and the [community model zoo](https://github.com/BVLC/caffe/wiki/Model-Zoo) +- [BAIR reference models](http://caffe.berkeleyvision.org/model_zoo.html) and the [community model zoo](https://github.com/BVLC/caffe/wiki/Model-Zoo) - [Installation instructions](http://caffe.berkeleyvision.org/installation.html) and step-by-step examples. @@ -25,7 +25,7 @@ Happy brewing! ## License and Citation Caffe is released under the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE). -The BVLC reference models are released for unrestricted use. +The BAIR/BVLC reference models are released for unrestricted use. Please cite Caffe in your publications if it helps your research: diff --git a/docs/_layouts/default.html b/docs/_layouts/default.html index b8efe60bc3b..3799e95afde 100644 --- a/docs/_layouts/default.html +++ b/docs/_layouts/default.html @@ -36,7 +36,7 @@

Caffe

- Deep learning framework by the BVLC + Deep learning framework by BAIR

Created by diff --git a/docs/development.md b/docs/development.md index 107c2c3b281..ec05bbee102 100644 --- a/docs/development.md +++ b/docs/development.md @@ -4,7 +4,7 @@ title: Developing and Contributing # Development and Contributing Caffe is developed with active participation of the community.
-The [BVLC](http://bvlc.eecs.berkeley.edu/) brewers welcome all contributions! +The [BAIR](http://bair.berkeley.edu/)/BVLC brewers welcome all contributions! The exact details of contributions are recorded by versioning and cited in our [acknowledgements](http://caffe.berkeleyvision.org/#acknowledgements). This method is impartial and always up-to-date. @@ -37,7 +37,7 @@ We absolutely appreciate any contribution to this effort! The `master` branch receives all new development including community contributions. We try to keep it in a reliable state, but it is the bleeding edge, and things do get broken every now and then. -BVLC maintainers will periodically make releases by marking stable checkpoints as tags and maintenance branches. [Past releases](https://github.com/BVLC/caffe/releases) are catalogued online. +BAIR maintainers will periodically make releases by marking stable checkpoints as tags and maintenance branches. [Past releases](https://github.com/BVLC/caffe/releases) are catalogued online. #### Issues & Pull Request Protocol diff --git a/docs/index.md b/docs/index.md index 932b3b58d1d..302a7d56f88 100644 --- a/docs/index.md +++ b/docs/index.md @@ -5,7 +5,7 @@ title: Deep Learning Framework # Caffe Caffe is a deep learning framework made with expression, speed, and modularity in mind. -It is developed by the Berkeley Vision and Learning Center ([BVLC](http://bvlc.eecs.berkeley.edu)) and by community contributors. +It is developed by Berkeley AI Research ([BAIR](http://bair.berkeley.edu)) and by community contributors. [Yangqing Jia](http://daggerfs.com) created the project during his PhD at UC Berkeley. Caffe is released under the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE). @@ -45,7 +45,7 @@ A 4-page report for the ACM Multimedia Open Source competition (arXiv:1408.5093v - [Installation instructions](/installation.html)
Tested on Ubuntu, Red Hat, OS X. * [Model Zoo](/model_zoo.html)
-BVLC suggests a standard distribution format for Caffe models, and provides trained models. +BAIR suggests a standard distribution format for Caffe models, and provides trained models. * [Developing & Contributing](/development.html)
Guidelines for development and contributing to Caffe. * [API Documentation](/doxygen/annotated.html)
@@ -92,9 +92,9 @@ The core Caffe developers offer [consulting services](mailto:caffe-coldpress@goo ## Acknowledgements -The BVLC Caffe developers would like to thank NVIDIA for GPU donation, A9 and Amazon Web Services for a research grant in support of Caffe development and reproducible research in deep learning, and BVLC PI [Trevor Darrell](http://www.eecs.berkeley.edu/~trevor/) for guidance. +The BAIR Caffe developers would like to thank NVIDIA for GPU donation, A9 and Amazon Web Services for a research grant in support of Caffe development and reproducible research in deep learning, and BAIR PI [Trevor Darrell](http://www.eecs.berkeley.edu/~trevor/) for guidance. -The BVLC members who have contributed to Caffe are (alphabetical by first name): +The BAIR members who have contributed to Caffe are (alphabetical by first name): [Eric Tzeng](https://github.com/erictzeng), [Evan Shelhamer](http://imaginarynumber.net/), [Jeff Donahue](http://jeffdonahue.com/), [Jon Long](https://github.com/longjon), [Ross Girshick](http://www.cs.berkeley.edu/~rbg/), [Sergey Karayev](http://sergeykarayev.com/), [Sergio Guadarrama](http://www.eecs.berkeley.edu/~sguada/), and [Yangqing Jia](http://daggerfs.com/). The open-source community plays an important and growing role in Caffe's development. @@ -103,4 +103,4 @@ Check out the Github [project pulse](https://github.com/BVLC/caffe/pulse) for re We sincerely appreciate your interest and contributions! If you'd like to contribute, please read the [developing & contributing](development.html) guide. -Yangqing would like to give a personal thanks to the NVIDIA Academic program for providing GPUs, [Oriol Vinyals](http://www1.icsi.berkeley.edu/~vinyals/) for discussions along the journey, and BVLC PI [Trevor Darrell](http://www.eecs.berkeley.edu/~trevor/) for advice. 
+Yangqing would like to give a personal thanks to the NVIDIA Academic program for providing GPUs, [Oriol Vinyals](http://www1.icsi.berkeley.edu/~vinyals/) for discussions along the journey, and BAIR PI [Trevor Darrell](http://www.eecs.berkeley.edu/~trevor/) for advice. diff --git a/docs/model_zoo.md b/docs/model_zoo.md index 06dc0a49ec7..f9078718a8b 100644 --- a/docs/model_zoo.md +++ b/docs/model_zoo.md @@ -14,15 +14,15 @@ To help share these models, we introduce the model zoo framework: ## Where to get trained models -First of all, we bundle BVLC-trained models for unrestricted, out of the box use. +First of all, we bundle BAIR-trained models for unrestricted, out of the box use.
-See the [BVLC model license](#bvlc-model-license) for details. +See the [BAIR model license](#bair-model-license) for details. Each one of these can be downloaded by running `scripts/download_model_binary.py ` where `` is specified below: -- **BVLC Reference CaffeNet** in `models/bvlc_reference_caffenet`: AlexNet trained on ILSVRC 2012, with a minor variation from the version as described in [ImageNet classification with deep convolutional neural networks](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks) by Krizhevsky et al. in NIPS 2012. (Trained by Jeff Donahue @jeffdonahue) -- **BVLC AlexNet** in `models/bvlc_alexnet`: AlexNet trained on ILSVRC 2012, almost exactly as described in [ImageNet classification with deep convolutional neural networks](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks) by Krizhevsky et al. in NIPS 2012. (Trained by Evan Shelhamer @shelhamer) -- **BVLC Reference R-CNN ILSVRC-2013** in `models/bvlc_reference_rcnn_ilsvrc13`: pure Caffe implementation of [R-CNN](https://github.com/rbgirshick/rcnn) as described by Girshick et al. in CVPR 2014. (Trained by Ross Girshick @rbgirshick) -- **BVLC GoogLeNet** in `models/bvlc_googlenet`: GoogLeNet trained on ILSVRC 2012, almost exactly as described in [Going Deeper with Convolutions](http://arxiv.org/abs/1409.4842) by Szegedy et al. in ILSVRC 2014. (Trained by Sergio Guadarrama @sguada) +- **BAIR Reference CaffeNet** in `models/bvlc_reference_caffenet`: AlexNet trained on ILSVRC 2012, with a minor variation from the version as described in [ImageNet classification with deep convolutional neural networks](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks) by Krizhevsky et al. in NIPS 2012. 
(Trained by Jeff Donahue @jeffdonahue) +- **BAIR AlexNet** in `models/bvlc_alexnet`: AlexNet trained on ILSVRC 2012, almost exactly as described in [ImageNet classification with deep convolutional neural networks](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks) by Krizhevsky et al. in NIPS 2012. (Trained by Evan Shelhamer @shelhamer) +- **BAIR Reference R-CNN ILSVRC-2013** in `models/bvlc_reference_rcnn_ilsvrc13`: pure Caffe implementation of [R-CNN](https://github.com/rbgirshick/rcnn) as described by Girshick et al. in CVPR 2014. (Trained by Ross Girshick @rbgirshick) +- **BAIR GoogLeNet** in `models/bvlc_googlenet`: GoogLeNet trained on ILSVRC 2012, almost exactly as described in [Going Deeper with Convolutions](http://arxiv.org/abs/1409.4842) by Szegedy et al. in ILSVRC 2014. (Trained by Sergio Guadarrama @sguada) **Community models** made by Caffe users are posted to a publicly editable [wiki page](https://github.com/BVLC/caffe/wiki/Model-Zoo). These models are subject to conditions of their respective authors such as citation and license. @@ -55,14 +55,14 @@ Downloading model info is done just as easily with `scripts/download_model_from_ ### Hosting trained models It is up to the user where to host the `.caffemodel` file. -We host our BVLC-provided models on our own server. +We host our BAIR-provided models on our own server. Dropbox also works fine (tip: make sure that `?dl=1` is appended to the end of the URL). `scripts/download_model_binary.py ` downloads the `.caffemodel` from the URL specified in the `/readme.md` frontmatter and confirms SHA1. -## BVLC model license +## BAIR model license -The Caffe models bundled by the BVLC are released for unrestricted use. +The Caffe models bundled by the BAIR are released for unrestricted use. These models are trained on data from the [ImageNet project](http://www.image-net.org/) and training data includes internet photos that may be subject to copyright. 
diff --git a/docs/multigpu.md b/docs/multigpu.md index d91acef980d..e04ebb0b7c8 100644 --- a/docs/multigpu.md +++ b/docs/multigpu.md @@ -13,7 +13,7 @@ The GPUs to be used for training can be set with the "-gpu" flag on the command # Hardware Configuration Assumptions The current implementation uses a tree reduction strategy. e.g. if there are 4 GPUs in the system, 0:1, 2:3 will exchange gradients, then 0:2 (top of the tree) will exchange gradients, 0 will calculate -updated model, 0\-\>2, and then 0\-\>1, 2\-\>3. +updated model, 0\-\>2, and then 0\-\>1, 2\-\>3. For best performance, P2P DMA access between devices is needed. Without P2P access, for example crossing PCIe root complex, data is copied through host and effective exchange bandwidth is greatly reduced. @@ -23,4 +23,4 @@ Current implementation has a "soft" assumption that the devices being used are h # Scaling Performance -Performance is **heavily** dependent on the PCIe topology of the system, the configuration of the neural network you are training, and the speed of each of the layers. Systems like the DIGITS DevBox have an optimized PCIe topology (X99-E WS chipset). In general, scaling on 2 GPUs tends to be ~1.8X on average for networks like AlexNet, CaffeNet, VGG, GoogleNet. 4 GPUs begins to have falloff in scaling. Generally with "weak scaling" where the batchsize increases with the number of GPUs you will see 3.5x scaling or so. With "strong scaling", the system can become communication bound, especially with layer performance optimizations like those in [cuDNNv3](http://nvidia.com/cudnn), and you will likely see closer to mid 2.x scaling in performance. Networks that have heavy computation compared to the number of parameters tend to have the best scaling performance. \ No newline at end of file +Performance is **heavily** dependent on the PCIe topology of the system, the configuration of the neural network you are training, and the speed of each of the layers. 
Systems like the DIGITS DevBox have an optimized PCIe topology (X99-E WS chipset). In general, scaling on 2 GPUs tends to be ~1.8X on average for networks like AlexNet, CaffeNet, VGG, GoogleNet. 4 GPUs begins to have falloff in scaling. Generally with "weak scaling" where the batchsize increases with the number of GPUs you will see 3.5x scaling or so. With "strong scaling", the system can become communication bound, especially with layer performance optimizations like those in [cuDNNv3](http://nvidia.com/cudnn), and you will likely see closer to mid 2.x scaling in performance. Networks that have heavy computation compared to the number of parameters tend to have the best scaling performance. diff --git a/docs/performance_hardware.md b/docs/performance_hardware.md index cdd4b361dea..fbf256842f1 100644 --- a/docs/performance_hardware.md +++ b/docs/performance_hardware.md @@ -8,7 +8,7 @@ To measure performance on different NVIDIA GPUs we use CaffeNet, the Caffe refer For training, each time point is 20 iterations/minibatches of 256 images for 5,120 images total. For testing, a 50,000 image validation set is classified. -**Acknowledgements**: BVLC members are very grateful to NVIDIA for providing several GPUs to conduct this research. +**Acknowledgements**: BAIR members are very grateful to NVIDIA for providing several GPUs to conduct this research. ## NVIDIA K40 diff --git a/docs/tutorial/interfaces.md b/docs/tutorial/interfaces.md index d7ff378239d..b5a4f1ad069 100644 --- a/docs/tutorial/interfaces.md +++ b/docs/tutorial/interfaces.md @@ -91,7 +91,7 @@ In MatCaffe, you can * Run for a certain number of iterations and give back control to Matlab * Intermingle arbitrary Matlab code with gradient steps -An ILSVRC image classification demo is in caffe/matlab/demo/classification_demo.m (you need to download BVLC CaffeNet from [Model Zoo](http://caffe.berkeleyvision.org/model_zoo.html) to run it). 
+An ILSVRC image classification demo is in caffe/matlab/demo/classification_demo.m (you need to download BAIR CaffeNet from [Model Zoo](http://caffe.berkeleyvision.org/model_zoo.html) to run it). ### Build MatCaffe @@ -114,7 +114,7 @@ You can save your Matlab search PATH by running `savepath` so that you don't hav MatCaffe is very similar to PyCaffe in usage. -Examples below shows detailed usages and assumes you have downloaded BVLC CaffeNet from [Model Zoo](http://caffe.berkeleyvision.org/model_zoo.html) and started `matlab` from caffe root folder. +Examples below shows detailed usages and assumes you have downloaded BAIR CaffeNet from [Model Zoo](http://caffe.berkeleyvision.org/model_zoo.html) and started `matlab` from caffe root folder. model = './models/bvlc_reference_caffenet/deploy.prototxt'; weights = './models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel'; diff --git a/examples/finetune_flickr_style/readme.md b/examples/finetune_flickr_style/readme.md index 188dedf1b9a..dacfd01c8e1 100644 --- a/examples/finetune_flickr_style/readme.md +++ b/examples/finetune_flickr_style/readme.md @@ -9,7 +9,7 @@ priority: 5 # Fine-tuning CaffeNet for Style Recognition on "Flickr Style" Data Fine-tuning takes an already learned model, adapts the architecture, and resumes training from the already learned model weights. -Let's fine-tune the BVLC-distributed CaffeNet model on a different dataset, [Flickr Style](http://sergeykarayev.com/files/1311.3715v3.pdf), to predict image style instead of object category. +Let's fine-tune the BAIR-distributed CaffeNet model on a different dataset, [Flickr Style](http://sergeykarayev.com/files/1311.3715v3.pdf), to predict image style instead of object category. 
## Explanation diff --git a/models/bvlc_alexnet/readme.md b/models/bvlc_alexnet/readme.md index 008d690f7f4..a83e3d4e27c 100644 --- a/models/bvlc_alexnet/readme.md +++ b/models/bvlc_alexnet/readme.md @@ -1,5 +1,5 @@ --- -name: BVLC AlexNet Model +name: BAIR/BVLC AlexNet Model caffemodel: bvlc_alexnet.caffemodel caffemodel_url: http://dl.caffe.berkeleyvision.org/bvlc_alexnet.caffemodel license: unrestricted diff --git a/models/bvlc_googlenet/readme.md b/models/bvlc_googlenet/readme.md index 061b6d74530..ef04db62ab2 100644 --- a/models/bvlc_googlenet/readme.md +++ b/models/bvlc_googlenet/readme.md @@ -1,5 +1,5 @@ --- -name: BVLC GoogleNet Model +name: BAIR/BVLC GoogleNet Model caffemodel: bvlc_googlenet.caffemodel caffemodel_url: http://dl.caffe.berkeleyvision.org/bvlc_googlenet.caffemodel license: unrestricted diff --git a/models/bvlc_reference_caffenet/readme.md b/models/bvlc_reference_caffenet/readme.md index 671e47a5056..5352e536a07 100644 --- a/models/bvlc_reference_caffenet/readme.md +++ b/models/bvlc_reference_caffenet/readme.md @@ -1,5 +1,5 @@ --- -name: BVLC CaffeNet Model +name: BAIR/BVLC CaffeNet Model caffemodel: bvlc_reference_caffenet.caffemodel caffemodel_url: http://dl.caffe.berkeleyvision.org/bvlc_reference_caffenet.caffemodel license: unrestricted diff --git a/models/bvlc_reference_rcnn_ilsvrc13/readme.md b/models/bvlc_reference_rcnn_ilsvrc13/readme.md index 9a11a24d8f8..12543b2bd2c 100644 --- a/models/bvlc_reference_rcnn_ilsvrc13/readme.md +++ b/models/bvlc_reference_rcnn_ilsvrc13/readme.md @@ -1,5 +1,5 @@ --- -name: BVLC Reference RCNN ILSVRC13 Model +name: BAIR/BVLC Reference RCNN ILSVRC13 Model caffemodel: bvlc_reference_rcnn_ilsvrc13.caffemodel caffemodel_url: http://dl.caffe.berkeleyvision.org/bvlc_reference_rcnn_ilsvrc13.caffemodel license: unrestricted From 3562698afb4b1f12f51eca752740e279f85714c4 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Fri, 14 Apr 2017 12:45:21 -0700 Subject: [PATCH 057/144] drop performance + hardware page and 
switch to sheet simpler to read and update --- docs/index.md | 9 +++-- docs/performance_hardware.md | 73 ------------------------------------ 2 files changed, 5 insertions(+), 77 deletions(-) delete mode 100644 docs/performance_hardware.md diff --git a/docs/index.md b/docs/index.md index 302a7d56f88..bbfd91fc7b9 100644 --- a/docs/index.md +++ b/docs/index.md @@ -23,15 +23,14 @@ Thanks to these contributors the framework tracks the state-of-the-art in both c **Speed** makes Caffe perfect for research experiments and industry deployment. Caffe can process **over 60M images per day** with a single NVIDIA K40 GPU\*. -That's 1 ms/image for inference and 4 ms/image for learning. -We believe that Caffe is the fastest convnet implementation available. +That's 1 ms/image for inference and 4 ms/image for learning and more recent library versions and hardware are faster still. +We believe that Caffe is among the fastest convnet implementations available. **Community**: Caffe already powers academic research projects, startup prototypes, and even large-scale industrial applications in vision, speech, and multimedia. Join our community of brewers on the [caffe-users group](https://groups.google.com/forum/#!forum/caffe-users) and [Github](https://github.com/BVLC/caffe/).

-\* With the ILSVRC2012-winning [SuperVision](http://www.image-net.org/challenges/LSVRC/2012/supervision.pdf) model and caching IO. -Consult performance [details](/performance_hardware.html). +\* With the ILSVRC2012-winning [SuperVision](http://www.image-net.org/challenges/LSVRC/2012/supervision.pdf) model and prefetching IO.

## Documentation @@ -50,6 +49,8 @@ BAIR suggests a standard distribution format for Caffe models, and provides trai Guidelines for development and contributing to Caffe. * [API Documentation](/doxygen/annotated.html)
Developer documentation automagically generated from code comments. +* [Benchmarking](https://docs.google.com/spreadsheets/d/1Yp4rqHpT7mKxOPbpzYeUfEFLnELDAgxSSBQKp5uKDGQ/edit#gid=0)
+Comparison of inference and learning for different networks and GPUs. ### Examples diff --git a/docs/performance_hardware.md b/docs/performance_hardware.md deleted file mode 100644 index fbf256842f1..00000000000 --- a/docs/performance_hardware.md +++ /dev/null @@ -1,73 +0,0 @@ ---- -title: Performance and Hardware Configuration ---- - -# Performance and Hardware Configuration - -To measure performance on different NVIDIA GPUs we use CaffeNet, the Caffe reference ImageNet model. - -For training, each time point is 20 iterations/minibatches of 256 images for 5,120 images total. For testing, a 50,000 image validation set is classified. - -**Acknowledgements**: BAIR members are very grateful to NVIDIA for providing several GPUs to conduct this research. - -## NVIDIA K40 - -Performance is best with ECC off and boost clock enabled. While ECC makes a negligible difference in speed, disabling it frees ~1 GB of GPU memory. - -Best settings with ECC off and maximum clock speed in standard Caffe: - -* Training is 26.5 secs / 20 iterations (5,120 images) -* Testing is 100 secs / validation set (50,000 images) - -Best settings with Caffe + [cuDNN acceleration](http://nvidia.com/cudnn): - -* Training is 19.2 secs / 20 iterations (5,120 images) -* Testing is 60.7 secs / validation set (50,000 images) - -Other settings: - -* ECC on, max speed: training 26.7 secs / 20 iterations, test 101 secs / validation set -* ECC on, default speed: training 31 secs / 20 iterations, test 117 secs / validation set -* ECC off, default speed: training 31 secs / 20 iterations, test 118 secs / validation set - -### K40 configuration tips - -For maximum K40 performance, turn off ECC and boost the clock speed (at your own risk). - -To turn off ECC, do - - sudo nvidia-smi -i 0 --ecc-config=0 # repeat with -i x for each GPU ID - -then reboot. 
- -Set the "persistence" mode of the GPU settings by - - sudo nvidia-smi -pm 1 - -and then set the clock speed with - - sudo nvidia-smi -i 0 -ac 3004,875 # repeat with -i x for each GPU ID - -but note that this configuration resets across driver reloading / rebooting. Include these commands in a boot script to initialize these settings. For a simple fix, add these commands to `/etc/rc.local` (on Ubuntu). - -## NVIDIA Titan - -Training: 26.26 secs / 20 iterations (5,120 images). -Testing: 100 secs / validation set (50,000 images). - -cuDNN Training: 20.25 secs / 20 iterations (5,120 images). -cuDNN Testing: 66.3 secs / validation set (50,000 images). - - -## NVIDIA K20 - -Training: 36.0 secs / 20 iterations (5,120 images). -Testing: 133 secs / validation set (50,000 images). - -## NVIDIA GTX 770 - -Training: 33.0 secs / 20 iterations (5,120 images). -Testing: 129 secs / validation set (50,000 images). - -cuDNN Training: 24.3 secs / 20 iterations (5,120 images). -cuDNN Testing: 104 secs / validation set (50,000 images). From 0f5bfc34e0b37b9ab3437d6755eb04a8dc9e8656 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Fri, 14 Apr 2017 12:46:56 -0700 Subject: [PATCH 058/144] favor notebook examples as more clear and popular --- docs/index.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/index.md b/docs/index.md index bbfd91fc7b9..82eb059e325 100644 --- a/docs/index.md +++ b/docs/index.md @@ -52,13 +52,6 @@ Developer documentation automagically generated from code comments. * [Benchmarking](https://docs.google.com/spreadsheets/d/1Yp4rqHpT7mKxOPbpzYeUfEFLnELDAgxSSBQKp5uKDGQ/edit#gid=0)
Comparison of inference and learning for different networks and GPUs. -### Examples - -{% assign examples = site.pages | where:'category','example' | sort: 'priority' %} -{% for page in examples %} --
{{page.title}}
{{page.description}}
-{% endfor %} - ### Notebook Examples {% assign notebooks = site.pages | where:'category','notebook' | sort: 'priority' %} @@ -66,6 +59,13 @@ Comparison of inference and learning for different networks and GPUs. -
{{page.title}}
{{page.description}}
{% endfor %} +### Command Line Examples + +{% assign examples = site.pages | where:'category','example' | sort: 'priority' %} +{% for page in examples %} +-
{{page.title}}
{{page.description}}
+{% endfor %} + ## Citing Caffe Please cite Caffe in your publications if it helps your research: From 2158bbb2151049dec2486b720c0a351164a0eb6b Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Fri, 14 Apr 2017 12:50:19 -0700 Subject: [PATCH 059/144] model zoo: point out wiki link immediately, explain manual editing --- docs/model_zoo.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/model_zoo.md b/docs/model_zoo.md index f9078718a8b..3f77e82572c 100644 --- a/docs/model_zoo.md +++ b/docs/model_zoo.md @@ -3,7 +3,7 @@ title: Model Zoo --- # Caffe Model Zoo -Lots of researchers and engineers have made Caffe models for different tasks with all kinds of architectures and data. +Lots of researchers and engineers have made Caffe models for different tasks with all kinds of architectures and data: check out the [model zoo](https://github.com/BVLC/caffe/wiki/Model-Zoo)! These models are learned and applied for problems ranging from simple regression, to large-scale visual classification, to Siamese networks for image similarity, to speech and robotics applications. To help share these models, we introduce the model zoo framework: @@ -24,7 +24,7 @@ Each one of these can be downloaded by running `scripts/download_model_binary.py - **BAIR Reference R-CNN ILSVRC-2013** in `models/bvlc_reference_rcnn_ilsvrc13`: pure Caffe implementation of [R-CNN](https://github.com/rbgirshick/rcnn) as described by Girshick et al. in CVPR 2014. (Trained by Ross Girshick @rbgirshick) - **BAIR GoogLeNet** in `models/bvlc_googlenet`: GoogLeNet trained on ILSVRC 2012, almost exactly as described in [Going Deeper with Convolutions](http://arxiv.org/abs/1409.4842) by Szegedy et al. in ILSVRC 2014. (Trained by Sergio Guadarrama @sguada) -**Community models** made by Caffe users are posted to a publicly editable [wiki page](https://github.com/BVLC/caffe/wiki/Model-Zoo). 
+**Community models** made by Caffe users are posted to a publicly editable [model zoo wiki page](https://github.com/BVLC/caffe/wiki/Model-Zoo). These models are subject to conditions of their respective authors such as citation and license. Thank you for sharing your models! @@ -42,6 +42,8 @@ A caffe model is distributed as a directory containing: - License information. - [optional] Other helpful scripts. +This simple format can be handled through bundled scripts or manually if need be. + ### Hosting model info Github Gist is a good format for model info distribution because it can contain multiple files, is versionable, and has in-browser syntax highlighting and markdown rendering. From 414b74c06038c17924745b68954ef10827fe1edd Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Fri, 14 Apr 2017 13:19:53 -0700 Subject: [PATCH 060/144] add missing names to BAIR roster --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 82eb059e325..db8eaffbe34 100644 --- a/docs/index.md +++ b/docs/index.md @@ -96,7 +96,7 @@ The core Caffe developers offer [consulting services](mailto:caffe-coldpress@goo The BAIR Caffe developers would like to thank NVIDIA for GPU donation, A9 and Amazon Web Services for a research grant in support of Caffe development and reproducible research in deep learning, and BAIR PI [Trevor Darrell](http://www.eecs.berkeley.edu/~trevor/) for guidance. The BAIR members who have contributed to Caffe are (alphabetical by first name): -[Eric Tzeng](https://github.com/erictzeng), [Evan Shelhamer](http://imaginarynumber.net/), [Jeff Donahue](http://jeffdonahue.com/), [Jon Long](https://github.com/longjon), [Ross Girshick](http://www.cs.berkeley.edu/~rbg/), [Sergey Karayev](http://sergeykarayev.com/), [Sergio Guadarrama](http://www.eecs.berkeley.edu/~sguada/), and [Yangqing Jia](http://daggerfs.com/). 
+[Carl Doersch](http://www.carldoersch.com/), [Eric Tzeng](https://github.com/erictzeng), [Evan Shelhamer](http://imaginarynumber.net/), [Jeff Donahue](http://jeffdonahue.com/), [Jon Long](https://github.com/longjon), [Philipp Krähenbühl](http://www.philkr.net/), [Ronghang Hu](http://ronghanghu.com/), [Ross Girshick](http://www.cs.berkeley.edu/~rbg/), [Sergey Karayev](http://sergeykarayev.com/), [Sergio Guadarrama](http://www.eecs.berkeley.edu/~sguada/), [Takuya Narihira](https://github.com/tnarihi), and [Yangqing Jia](http://daggerfs.com/). The open-source community plays an important and growing role in Caffe's development. Check out the Github [project pulse](https://github.com/BVLC/caffe/pulse) for recent activity and the [contributors](https://github.com/BVLC/caffe/graphs/contributors) for the full list. From e90a6a6ca29423afb15f39adb1157bff9e6f8655 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Fri, 14 Apr 2017 13:24:30 -0700 Subject: [PATCH 061/144] retire caffe-dev and caffe-coldpress dev has diffused into the community from the original Caffe core --- docs/index.md | 5 ----- 1 file changed, 5 deletions(-) diff --git a/docs/index.md b/docs/index.md index db8eaffbe34..0e21ae821b0 100644 --- a/docs/index.md +++ b/docs/index.md @@ -86,11 +86,6 @@ Join the [caffe-users group](https://groups.google.com/forum/#!forum/caffe-users Framework development discussions and thorough bug reports are collected on [Issues](https://github.com/BVLC/caffe/issues). -Contact [caffe-dev](mailto:caffe-dev@googlegroups.com) if you have a confidential proposal for the framework *and the ability to act on it*. -Requests for features, explanations, or personal help will be ignored; post to [caffe-users](https://groups.google.com/forum/#!forum/caffe-users) instead. - -The core Caffe developers offer [consulting services](mailto:caffe-coldpress@googlegroups.com) for appropriate projects. 
- ## Acknowledgements The BAIR Caffe developers would like to thank NVIDIA for GPU donation, A9 and Amazon Web Services for a research grant in support of Caffe development and reproducible research in deep learning, and BAIR PI [Trevor Darrell](http://www.eecs.berkeley.edu/~trevor/) for guidance. From 8985818e4fbb5fc207e4f383c63c28d80fd286f2 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Fri, 14 Apr 2017 13:28:24 -0700 Subject: [PATCH 062/144] track publications by google scholar and not the wiki --- docs/index.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/index.md b/docs/index.md index 0e21ae821b0..3385747c565 100644 --- a/docs/index.md +++ b/docs/index.md @@ -77,8 +77,7 @@ Please cite Caffe in your publications if it helps your research: Year = {2014} } -If you do publish a paper where Caffe helped your research, we encourage you to update the [publications wiki](https://github.com/BVLC/caffe/wiki/Publications). -Citations are also tracked automatically by [Google Scholar](http://scholar.google.com/scholar?oi=bibs&hl=en&cites=17333247995453974016). +If you do publish a paper where Caffe helped your research, we encourage you to cite the framework for tracking by [Google Scholar](https://scholar.google.com/citations?view_op=view_citation&hl=en&citation_for_view=-ltRSM0AAAAJ:u5HHmVD_uO8C). 
## Contacting Us From 8b8f2dd40ba87543f066cb157c6d65dd8187253f Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Fri, 14 Apr 2017 15:26:30 -0700 Subject: [PATCH 063/144] link to new full-day crash course --- docs/index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/index.md b/docs/index.md index 3385747c565..b633f7cfddc 100644 --- a/docs/index.md +++ b/docs/index.md @@ -35,8 +35,8 @@ Join our community of brewers on the [caffe-users group](https://groups.google.c ## Documentation -- [DIY Deep Learning for Vision with Caffe](https://docs.google.com/presentation/d/1UeKXVgRvvxg9OUdh_UiC5G71UMscNPlvArsWER41PsU/edit#slide=id.p)
-Tutorial presentation. +- [DIY Deep Learning for Vision with Caffe](https://docs.google.com/presentation/d/1UeKXVgRvvxg9OUdh_UiC5G71UMscNPlvArsWER41PsU/edit#slide=id.p) and [Caffe in a Day](https://docs.google.com/presentation/d/1HxGdeq8MPktHaPb-rlmYYQ723iWzq9ur6Gjo71YiG0Y/edit#slide=id.gc2fcdcce7_216_0)
+Tutorial presentation of the framework and a full-day crash course. - [Tutorial Documentation](/tutorial)
Practical guide and framework reference. - [arXiv / ACM MM '14 paper](http://arxiv.org/abs/1408.5093)
From 49761d34d18b7063af995b13ecca0fee1bdaf02c Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Fri, 14 Apr 2017 15:32:50 -0700 Subject: [PATCH 064/144] Caffe 1.0 --- CMakeLists.txt | 4 ++-- Makefile | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c52ff466471..08f56a33a59 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,8 +10,8 @@ endif() project(Caffe C CXX) # ---[ Caffe version -set(CAFFE_TARGET_VERSION "1.0.0-rc5" CACHE STRING "Caffe logical version") -set(CAFFE_TARGET_SOVERSION "1.0.0-rc5" CACHE STRING "Caffe soname version") +set(CAFFE_TARGET_VERSION "1.0.0" CACHE STRING "Caffe logical version") +set(CAFFE_TARGET_SOVERSION "1.0.0" CACHE STRING "Caffe soname version") add_definitions(-DCAFFE_VERSION=${CAFFE_TARGET_VERSION}) # ---[ Using cmake scripts and modules diff --git a/Makefile b/Makefile index 77900b69b97..4d324160c08 100644 --- a/Makefile +++ b/Makefile @@ -34,7 +34,7 @@ LIB_BUILD_DIR := $(BUILD_DIR)/lib STATIC_NAME := $(LIB_BUILD_DIR)/lib$(LIBRARY_NAME).a DYNAMIC_VERSION_MAJOR := 1 DYNAMIC_VERSION_MINOR := 0 -DYNAMIC_VERSION_REVISION := 0-rc5 +DYNAMIC_VERSION_REVISION := 0 DYNAMIC_NAME_SHORT := lib$(LIBRARY_NAME).so #DYNAMIC_SONAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR) DYNAMIC_VERSIONED_NAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR).$(DYNAMIC_VERSION_REVISION) From fb0795cd325cee991073b3b02c280e04b62790ba Mon Sep 17 00:00:00 2001 From: Bruno Bowden Date: Mon, 17 Apr 2017 16:15:52 -0700 Subject: [PATCH 065/144] Shape mismatch CHECK logging improvements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - More logs when shapes mismatch - Manual reviewed all matches of “grep -r CHECK.*shape src” - Follow JUnit convention of: ``` CHECK(expected == actual) << “expected: “ << expected << “ vs. 
actual: ” << actual; ``` Layers: - Recurrent - BaseConvolutionLayer - Slice (improves logging for incorrect parameter) CHECK_EQ - Would prefer to use CHECK_EQ(expected, actual): https://github.com/google/glog/blob/master/src/glog/stl_logging.h.in#L36 - However this requires include of stl_logging.h which introduces a build warning. I believe this would work if caffe was updated to the latest version of glog but that's a bigger change. --- src/caffe/layers/base_conv_layer.cpp | 4 +++- src/caffe/layers/recurrent_layer.cpp | 5 +++-- src/caffe/layers/slice_layer.cpp | 4 +++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp index 35c90145e31..aff0a7548c7 100644 --- a/src/caffe/layers/base_conv_layer.cpp +++ b/src/caffe/layers/base_conv_layer.cpp @@ -193,7 +193,9 @@ void BaseConvolutionLayer::Reshape(const vector*>& bottom, // TODO: generalize to handle inputs of different shapes. for (int bottom_id = 1; bottom_id < bottom.size(); ++bottom_id) { CHECK(bottom[0]->shape() == bottom[bottom_id]->shape()) - << "All inputs must have the same shape."; + << "shape mismatch - bottom[0]: " << bottom[0]->shape_string() + << " vs. bottom[" << bottom_id << "]: " + << bottom[bottom_id]->shape_string(); } // Shape the tops. 
bottom_shape_ = &bottom[0]->shape(); diff --git a/src/caffe/layers/recurrent_layer.cpp b/src/caffe/layers/recurrent_layer.cpp index e0c82773392..9cd3206f66a 100644 --- a/src/caffe/layers/recurrent_layer.cpp +++ b/src/caffe/layers/recurrent_layer.cpp @@ -214,8 +214,9 @@ void RecurrentLayer::Reshape(const vector*>& bottom, const int bottom_offset = 2 + static_input_; for (int i = bottom_offset, j = 0; i < bottom.size(); ++i, ++j) { CHECK(recur_input_blobs_[j]->shape() == bottom[i]->shape()) - << "bottom[" << i << "] shape must match hidden state input shape: " - << recur_input_blobs_[j]->shape_string(); + << "shape mismatch - recur_input_blobs_[" << j << "]: " + << recur_input_blobs_[j]->shape_string() + << " vs. bottom[" << i << "]: " << bottom[i]->shape_string(); recur_input_blobs_[j]->ShareData(*bottom[i]); } } diff --git a/src/caffe/layers/slice_layer.cpp b/src/caffe/layers/slice_layer.cpp index 759beafe0d9..64de0964483 100644 --- a/src/caffe/layers/slice_layer.cpp +++ b/src/caffe/layers/slice_layer.cpp @@ -41,7 +41,9 @@ void SliceLayer::Reshape(const vector*>& bottom, int count = 0; if (slice_point_.size() != 0) { CHECK_EQ(slice_point_.size(), top.size() - 1); - CHECK_LE(top.size(), bottom_slice_axis); + CHECK_LE(top.size(), bottom_slice_axis) + << "slice axis: " << slice_axis_ + << ", bottom[0] shape: " << bottom[0]->shape_string(); int prev = 0; vector slices; for (int i = 0; i < slice_point_.size(); ++i) { From 33f86122970392fcda19ef80ed5cd349279b896d Mon Sep 17 00:00:00 2001 From: Eric Tzeng Date: Tue, 18 Apr 2017 18:22:38 -0700 Subject: [PATCH 066/144] Rewrite crop cuda kernel --- include/caffe/layers/crop_layer.hpp | 6 +- src/caffe/layers/crop_layer.cpp | 21 +++-- src/caffe/layers/crop_layer.cu | 122 +++++++++++----------------- 3 files changed, 69 insertions(+), 80 deletions(-) diff --git a/include/caffe/layers/crop_layer.hpp b/include/caffe/layers/crop_layer.hpp index c4fda1220c3..5219fa5cb5f 100644 --- a/include/caffe/layers/crop_layer.hpp +++ 
b/include/caffe/layers/crop_layer.hpp @@ -41,13 +41,15 @@ class CropLayer : public Layer { virtual void Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom); - vector offsets; + Blob offsets; + Blob src_strides_; + Blob dest_strides_; private: // Recursive copy function. void crop_copy(const vector*>& bottom, const vector*>& top, - const vector& offsets, + const int* offsets, vector indices, int cur_dim, const Dtype* src_data, diff --git a/src/caffe/layers/crop_layer.cpp b/src/caffe/layers/crop_layer.cpp index ef8c177c4dd..65ea8f8b7d0 100644 --- a/src/caffe/layers/crop_layer.cpp +++ b/src/caffe/layers/crop_layer.cpp @@ -40,8 +40,10 @@ void CropLayer::Reshape(const vector*>& bottom, const int start_axis = bottom[0]->CanonicalAxisIndex(param.axis()); // Initialize offsets to 0 and the new shape to the current shape of the data. - offsets = vector(input_dim, 0); vector new_shape(bottom[0]->shape()); + vector offsets_shape(1, input_dim); + offsets.Reshape(offsets_shape); + int* offset_data = offsets.mutable_cpu_data(); // Determine crop offsets and the new shape post-crop. 
for (int i = 0; i < input_dim; ++i) { @@ -63,15 +65,22 @@ void CropLayer::Reshape(const vector*>& bottom, << "size " << bottom[1]->shape(i) << " and offset " << crop_offset; } new_shape[i] = new_size; - offsets[i] = crop_offset; + offset_data[i] = crop_offset; } top[0]->Reshape(new_shape); + // Compute strides + src_strides_.Reshape(offsets_shape); + dest_strides_.Reshape(offsets_shape); + for (int i = 0; i < input_dim; ++i) { + src_strides_.mutable_cpu_data()[i] = bottom[0]->count(i + 1, input_dim); + dest_strides_.mutable_cpu_data()[i] = top[0]->count(i + 1, input_dim); + } } template void CropLayer::crop_copy(const vector*>& bottom, const vector*>& top, - const vector& offsets, + const int* offsets, vector indices, int cur_dim, const Dtype* src_data, @@ -115,7 +124,8 @@ void CropLayer::Forward_cpu(const vector*>& bottom, std::vector indices(top[0]->num_axes(), 0); const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); - crop_copy(bottom, top, offsets, indices, 0, bottom_data, top_data, true); + crop_copy(bottom, top, offsets.cpu_data(), indices, 0, bottom_data, top_data, + true); } template @@ -127,7 +137,8 @@ void CropLayer::Backward_cpu(const vector*>& top, if (propagate_down[0]) { caffe_set(bottom[0]->count(), static_cast(0), bottom_diff); std::vector indices(top[0]->num_axes(), 0); - crop_copy(bottom, top, offsets, indices, 0, top_diff, bottom_diff, false); + crop_copy(bottom, top, offsets.cpu_data(), indices, 0, top_diff, + bottom_diff, false); } } diff --git a/src/caffe/layers/crop_layer.cu b/src/caffe/layers/crop_layer.cu index 677077cdd8b..a400f333e14 100644 --- a/src/caffe/layers/crop_layer.cu +++ b/src/caffe/layers/crop_layer.cu @@ -4,90 +4,62 @@ namespace caffe { -// Copy (one line per thread) from one array to another, with arbitrary -// strides in the last two dimensions. 
+__device__ int compute_uncropped_index( + int index, + const int ndims, + const int* src_strides, + const int* dest_strides, + const int* offsets) { + int dest_index = index; + int src_index = 0; + for (int i = 0; i < ndims; ++i) { + int coord = dest_index / dest_strides[i]; + dest_index -= coord * dest_strides[i]; + src_index += src_strides[i] * (coord + offsets[i]); + } + return src_index; +} + template -__global__ void copy_kernel(const int n, const int height, const int width, - const int src_inner_stride, - const int dest_inner_stride, +__global__ void crop_kernel_forward(const int nthreads, + const int ndims, + const int* src_strides, + const int* dest_strides, + const int* offsets, const Dtype* src, Dtype* dest) { - CUDA_KERNEL_LOOP(index, n) { - int src_start = index * src_inner_stride; - int dest_start = index * dest_inner_stride; - for (int i = 0; i < width; ++i) { - dest[dest_start + i] = src[src_start + i]; - } + CUDA_KERNEL_LOOP(index, nthreads) { + int src_index = compute_uncropped_index( + index, ndims, src_strides, dest_strides, offsets); + dest[index] = src[src_index]; } } template -void CropLayer::crop_copy_gpu(const vector*>& bottom, - const vector*>& top, - const vector& offsets, - vector indices, - int cur_dim, - const Dtype* src_data, - Dtype* dest_data, - bool is_forward) { - if (cur_dim + 2 < top[0]->num_axes()) { - // We are not yet at the final dimension, call copy recursivley - for (int i = 0; i < top[0]->shape(cur_dim); ++i) { - indices[cur_dim] = i; - crop_copy_gpu(bottom, top, offsets, indices, cur_dim+1, - src_data, dest_data, is_forward); - } - } else { - // We are at the last two dimensions, which are stored continuously in - // memory. 
With (N,C,H,W) - // (0,1,2,3) cur_dim -> H - // cur_dim+1 -> W - const int lines = top[0]->shape(cur_dim); - const int height = top[0]->shape(cur_dim); - const int width = top[0]->shape(cur_dim+1); - std::vector ind_off(cur_dim+2, 0); - for (int j = 0; j < cur_dim; ++j) { - ind_off[j] = indices[j] + offsets[j]; - } - ind_off[cur_dim] = offsets[cur_dim]; - ind_off[cur_dim+1] = offsets[cur_dim+1]; - // Compute copy strides - const int src_inner_stride = bottom[0]->shape(cur_dim+1); - const int dest_inner_stride = top[0]->shape(cur_dim+1); - - if (is_forward) { - const Dtype* bottom_data = bottom[0]->gpu_data() + - bottom[0]->offset(ind_off); - Dtype* top_data = top[0]->mutable_gpu_data() + - top[0]->offset(indices); - // NOLINT_NEXT_LINE(whitespace/operators) - copy_kernel<<>>( - lines, height, width, - src_inner_stride, - dest_inner_stride, - bottom_data, top_data); - - } else { - const Dtype* top_diff = top[0]->gpu_diff() + - top[0]->offset(indices); - Dtype* bottom_diff = bottom[0]->mutable_gpu_diff() + - bottom[0]->offset(ind_off); - // NOLINT_NEXT_LINE(whitespace/operators) - copy_kernel<<>>( - lines, height, width, - dest_inner_stride, - src_inner_stride, - top_diff, bottom_diff); - } +__global__ void crop_kernel_backward(const int nthreads, + const int ndims, + const int* src_strides, + const int* dest_strides, + const int* offsets, + Dtype* src, const Dtype* dest) { + CUDA_KERNEL_LOOP(index, nthreads) { + int src_index = compute_uncropped_index( + index, ndims, src_strides, dest_strides, offsets); + src[src_index] = dest[index]; } } template void CropLayer::Forward_gpu(const vector*>& bottom, const vector*>& top) { - std::vector indices(top[0]->num_axes(), 0); const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); - crop_copy_gpu(bottom, top, offsets, indices, 0, bottom_data, top_data, true); + int n = top[0]->count(); + crop_kernel_forward<<>>(n, + bottom[0]->num_axes(), + src_strides_.gpu_data(), + 
dest_strides_.gpu_data(), + offsets.gpu_data(), + bottom_data, top_data); } template @@ -95,12 +67,16 @@ void CropLayer::Backward_gpu(const vector*>& top, const vector& propagate_down, const vector*>& bottom) { const Dtype* top_diff = top[0]->gpu_diff(); Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + int n = top[0]->count(); if (propagate_down[0]) { caffe_gpu_set(bottom[0]->count(), static_cast(0), bottom_diff); - std::vector indices(top[0]->num_axes(), 0); - crop_copy_gpu(bottom, top, offsets, indices, 0, top_diff, bottom_diff, - false); + crop_kernel_backward<<>>(n, + bottom[0]->num_axes(), + src_strides_.gpu_data(), + dest_strides_.gpu_data(), + offsets.gpu_data(), + bottom_diff, top_diff); } } From cd1696d00b995a1d8567cb6f3ad7f65ec4df4176 Mon Sep 17 00:00:00 2001 From: Eric Tzeng Date: Tue, 18 Apr 2017 18:48:26 -0700 Subject: [PATCH 067/144] Fix crop layer lint errors --- src/caffe/layers/crop_layer.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/caffe/layers/crop_layer.cu b/src/caffe/layers/crop_layer.cu index a400f333e14..4ece9cd1761 100644 --- a/src/caffe/layers/crop_layer.cu +++ b/src/caffe/layers/crop_layer.cu @@ -54,6 +54,7 @@ void CropLayer::Forward_gpu(const vector*>& bottom, const Dtype* bottom_data = bottom[0]->gpu_data(); Dtype* top_data = top[0]->mutable_gpu_data(); int n = top[0]->count(); + // NOLINT_NEXT_LINE(whitespace/operators) crop_kernel_forward<<>>(n, bottom[0]->num_axes(), src_strides_.gpu_data(), @@ -71,6 +72,7 @@ void CropLayer::Backward_gpu(const vector*>& top, if (propagate_down[0]) { caffe_gpu_set(bottom[0]->count(), static_cast(0), bottom_diff); + // NOLINT_NEXT_LINE(whitespace/operators) crop_kernel_backward<<>>(n, bottom[0]->num_axes(), src_strides_.gpu_data(), From ec35395e131a0d5e7c55cbd74dadbd46a49a645c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Malte=20St=C3=A6r=20Nissen?= Date: Thu, 4 May 2017 14:33:40 +0200 Subject: [PATCH 068/144] Handling destruction of empty Net objects --- matlab/+caffe/Net.m | 4 +++- 1 
file changed, 3 insertions(+), 1 deletion(-) diff --git a/matlab/+caffe/Net.m b/matlab/+caffe/Net.m index 349e060eb22..bb99ec89049 100644 --- a/matlab/+caffe/Net.m +++ b/matlab/+caffe/Net.m @@ -69,7 +69,9 @@ self.blob_names = self.attributes.blob_names; end function delete (self) - caffe_('delete_net', self.hNet_self); + if ~isempty(self.hNet_self) + caffe_('delete_net', self.hNet_self); + end end function layer = layers(self, layer_name) CHECK(ischar(layer_name), 'layer_name must be a string'); From 0cb449e8c766499b29e5314120068ee9c8ebd71e Mon Sep 17 00:00:00 2001 From: Dave Brown Date: Sun, 7 May 2017 18:25:27 -0700 Subject: [PATCH 069/144] Update euclidean_loss_layer.hpp --- include/caffe/layers/euclidean_loss_layer.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/caffe/layers/euclidean_loss_layer.hpp b/include/caffe/layers/euclidean_loss_layer.hpp index f564569e27a..24568c5496f 100644 --- a/include/caffe/layers/euclidean_loss_layer.hpp +++ b/include/caffe/layers/euclidean_loss_layer.hpp @@ -30,7 +30,7 @@ namespace caffe { * This can be used for least-squares regression tasks. An InnerProductLayer * input to a EuclideanLossLayer exactly formulates a linear least squares * regression problem. With non-zero weight decay the problem becomes one of - * ridge regression -- see src/caffe/test/test_sgd_solver.cpp for a concrete + * ridge regression -- see src/caffe/test/test_gradient_based_solver.cpp for a concrete * example wherein we check that the gradients computed for a Net with exactly * this structure match hand-computed gradient formulas for ridge regression. 
* From b7e2b99c7f0aeeb8e24046f8cbf5212065b9ccdf Mon Sep 17 00:00:00 2001 From: Luke Yeager Date: Fri, 12 May 2017 10:06:51 -0700 Subject: [PATCH 070/144] Downgrade boost requirement from 1.55 to 1.54 --- cmake/Dependencies.cmake | 2 +- scripts/travis/install-deps.sh | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 4a5bac471b4..c48255c89f2 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -5,7 +5,7 @@ set(Caffe_DEFINITIONS "") set(Caffe_COMPILE_OPTIONS "") # ---[ Boost -find_package(Boost 1.55 REQUIRED COMPONENTS system thread filesystem) +find_package(Boost 1.54 REQUIRED COMPONENTS system thread filesystem) list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${Boost_INCLUDE_DIRS}) list(APPEND Caffe_LINKER_LIBS PUBLIC ${Boost_LIBRARIES}) diff --git a/scripts/travis/install-deps.sh b/scripts/travis/install-deps.sh index dac5d2f9d37..2fa2a74a486 100755 --- a/scripts/travis/install-deps.sh +++ b/scripts/travis/install-deps.sh @@ -9,10 +9,10 @@ apt-get -y update apt-get install -y --no-install-recommends \ build-essential \ graphviz \ - libboost-filesystem1.55-dev \ - libboost-python1.55-dev \ - libboost-system1.55-dev \ - libboost-thread1.55-dev \ + libboost-filesystem-dev \ + libboost-python-dev \ + libboost-system-dev \ + libboost-thread-dev \ libgflags-dev \ libgoogle-glog-dev \ libhdf5-serial-dev \ From 30a2ab7e50430911f37ddf981e67e4f36f662f14 Mon Sep 17 00:00:00 2001 From: Zhou Mo Date: Mon, 15 May 2017 02:16:19 +0000 Subject: [PATCH 071/144] cmake: rename libproto.a -> libcaffeproto.a --- cmake/ConfigGen.cmake | 2 +- src/caffe/CMakeLists.txt | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cmake/ConfigGen.cmake b/cmake/ConfigGen.cmake index ad91f542104..09bb09b4ff2 100644 --- a/cmake/ConfigGen.cmake +++ b/cmake/ConfigGen.cmake @@ -33,7 +33,7 @@ function(caffe_generate_export_configs) configure_file("cmake/Templates/CaffeConfig.cmake.in" 
"${PROJECT_BINARY_DIR}/CaffeConfig.cmake" @ONLY) # Add targets to the build-tree export set - export(TARGETS caffe proto FILE "${PROJECT_BINARY_DIR}/CaffeTargets.cmake") + export(TARGETS caffe caffeproto FILE "${PROJECT_BINARY_DIR}/CaffeTargets.cmake") export(PACKAGE Caffe) # ---[ Configure install-tree CaffeConfig.cmake file ]--- diff --git a/src/caffe/CMakeLists.txt b/src/caffe/CMakeLists.txt index b9152e9216f..4a805568566 100644 --- a/src/caffe/CMakeLists.txt +++ b/src/caffe/CMakeLists.txt @@ -3,12 +3,12 @@ file(GLOB proto_files proto/*.proto) caffe_protobuf_generate_cpp_py(${proto_gen_folder} proto_srcs proto_hdrs proto_python ${proto_files}) # include python files either to force generation -add_library(proto STATIC ${proto_hdrs} ${proto_srcs} ${proto_python}) -caffe_default_properties(proto) -target_link_libraries(proto PUBLIC ${PROTOBUF_LIBRARIES}) -target_include_directories(proto PUBLIC ${PROTOBUF_INCLUDE_DIR}) +add_library(caffeproto STATIC ${proto_hdrs} ${proto_srcs} ${proto_python}) +caffe_default_properties(caffeproto) +target_link_libraries(caffeproto PUBLIC ${PROTOBUF_LIBRARIES}) +target_include_directories(caffeproto PUBLIC ${PROTOBUF_INCLUDE_DIR}) -list(INSERT Caffe_LINKER_LIBS 0 PUBLIC proto) # note, crucial to prepend! +list(INSERT Caffe_LINKER_LIBS 0 PUBLIC caffeproto) # note, crucial to prepend! 
# --[ Caffe library @@ -42,7 +42,7 @@ set_target_properties(caffe PROPERTIES # ---[ Install install(DIRECTORY ${Caffe_INCLUDE_DIR}/caffe DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) install(FILES ${proto_hdrs} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/caffe/proto) -install(TARGETS caffe proto EXPORT CaffeTargets DESTINATION ${CMAKE_INSTALL_LIBDIR}) +install(TARGETS caffe caffeproto EXPORT CaffeTargets DESTINATION ${CMAKE_INSTALL_LIBDIR}) file(WRITE ${PROJECT_BINARY_DIR}/__init__.py) list(APPEND proto_python ${PROJECT_BINARY_DIR}/__init__.py) From 83814da36d5a44039ddc35f58f9b341e9d1bd935 Mon Sep 17 00:00:00 2001 From: Zhou Mo Date: Mon, 15 May 2017 03:04:47 +0000 Subject: [PATCH 072/144] docs/debian guide: update compiler combination table --- docs/install_apt_debian.md | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/docs/install_apt_debian.md b/docs/install_apt_debian.md index 65fe70924e1..bd91124a898 100644 --- a/docs/install_apt_debian.md +++ b/docs/install_apt_debian.md @@ -96,18 +96,22 @@ Note, this requires a `deb-src` entry in your `/etc/apt/sources.list`. Some users may find their favorate compiler doesn't work with CUDA. ``` -CXX compiler | CUDA 7.5 | CUDA 8.0 | --------------+------------+------------+- -GCC-7 | ? | ? | -GCC-6 | ✘ | ✘ | -GCC-5 | ✔ [1] | ✔ | -CLANG-4.0 | ? | ? | -CLANG-3.9 | ✘ | ✘ | -CLANG-3.8 | ? | ✔ | +CXX compiler | CUDA 7.5 | CUDA 8.0 | CUDA 9.0 | +-------------+------------+------------+------------+ +GCC-8 | ? | ? | ? | +GCC-7 | ? | ? | ? | +GCC-6 | ✘ | ✘ | ✔ | +GCC-5 | ✔ [1] | ✔ | ✔ | +-------------+------------+------------+------------+ +CLANG-4.0 | ? | ? | ? | +CLANG-3.9 | ✘ | ✘ | ✔ | +CLANG-3.8 | ? | ✔ | ✔ | ``` `[1]` CUDA 7.5 's `host_config.h` must be patched before working with GCC-5. +`[2]` CUDA 9.0: https://devblogs.nvidia.com/parallelforall/cuda-9-features-revealed/ + BTW, please forget the GCC-4.X series, since its `libstdc++` ABI is not compatible with GCC-5's. 
You may encounter failure linking GCC-4.X object files against GCC-5 libraries. (See https://wiki.debian.org/GCC5 ) From 264cf199e4e8bc44bb97762b1018137704157c2c Mon Sep 17 00:00:00 2001 From: Cyprien Noel Date: Tue, 13 Jun 2017 11:59:26 -0700 Subject: [PATCH 073/144] List branches in readme --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 0ae3616b4a6..c40aee65c3c 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,14 @@ Check out the [project site](http://caffe.berkeleyvision.org) for all the detail and step-by-step examples. +## Custom distributions + +- [Intel optimized branch](https://github.com/BVLC/caffe/tree/intel) for CPU, in particular Xeon processors (HSW, BDW, Xeon Phi). +- [OpenCL Caffe](https://github.com/BVLC/caffe/tree/opencl) e.g. for AMD or Intel devices. +- [Windows Caffe](https://github.com/BVLC/caffe/tree/windows) + +## Community + [![Join the chat at https://gitter.im/BVLC/caffe](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/BVLC/caffe?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) Please join the [caffe-users group](https://groups.google.com/forum/#!forum/caffe-users) or [gitter chat](https://gitter.im/BVLC/caffe) to ask questions and talk about methods and models. From 4efdf7ee49cffefdd7ea099c00dc5ea327640f04 Mon Sep 17 00:00:00 2001 From: Cyprien Noel Date: Tue, 20 Jun 2017 14:20:42 -0700 Subject: [PATCH 074/144] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c40aee65c3c..5148c69d310 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ and step-by-step examples. ## Custom distributions -- [Intel optimized branch](https://github.com/BVLC/caffe/tree/intel) for CPU, in particular Xeon processors (HSW, BDW, Xeon Phi). 
+ - [Intel Caffe](https://github.com/BVLC/caffe/tree/intel) (Optimized for CPU and support for multi-node), in particular Xeon processors (HSW, BDW, Xeon Phi). - [OpenCL Caffe](https://github.com/BVLC/caffe/tree/opencl) e.g. for AMD or Intel devices. - [Windows Caffe](https://github.com/BVLC/caffe/tree/windows) From c209c92fcd5d7097a75dfa17a022ec6b69d028db Mon Sep 17 00:00:00 2001 From: Arne Suppe Date: Wed, 21 Jun 2017 12:19:30 +0800 Subject: [PATCH 075/144] Fixed bug where make distribute duplicates python files in distribute/python --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 4d324160c08..c6d5685b140 100644 --- a/Makefile +++ b/Makefile @@ -694,6 +694,6 @@ $(DISTRIBUTE_DIR): all py | $(DISTRIBUTE_SUBDIRS) install -m 644 $(DYNAMIC_NAME) $(DISTRIBUTE_DIR)/lib cd $(DISTRIBUTE_DIR)/lib; rm -f $(DYNAMIC_NAME_SHORT); ln -s $(DYNAMIC_VERSIONED_NAME_SHORT) $(DYNAMIC_NAME_SHORT) # add python - it's not the standard way, indeed... - cp -r python $(DISTRIBUTE_DIR)/python + cp -r python $(DISTRIBUTE_DIR)/ -include $(DEPS) From 4a2f2a0f2cbfca5fe8d9fec417a432d0aa345f37 Mon Sep 17 00:00:00 2001 From: Zhou Mo Date: Sat, 8 Jul 2017 08:22:07 +0000 Subject: [PATCH 076/144] docs: update apt installation guide for Debian and Ubuntu Caffe package is available for APT since Debian 9.0 and Ubuntu 17.04 . --- docs/install_apt.md | 28 ++++++++++++++++++++++++++++ docs/install_apt_debian.md | 30 ++++++++++++++---------------- 2 files changed, 42 insertions(+), 16 deletions(-) diff --git a/docs/install_apt.md b/docs/install_apt.md index ee2cd287701..43785f56d27 100644 --- a/docs/install_apt.md +++ b/docs/install_apt.md @@ -4,6 +4,34 @@ title: "Installation: Ubuntu" # Ubuntu Installation +### For Ubuntu (>= 17.04) + +**Installing pre-compiled Caffe** + +Everything including caffe itself is packaged in 17.04 and higher versions. 
+To install pre-compiled Caffe package, just do it by + + sudo apt install caffe-cpu + +for CPU-only version, or + + sudo apt install caffe-cuda + +for CUDA version. Note, the cuda version may break if your NVIDIA driver +and CUDA toolkit are not installed by APT. + +**Installing Caffe from source** + +We may install the dependencies by merely one line + + sudo apt build-dep caffe-cpu # dependencies for CPU-only version + sudo apt build-dep caffe-cuda # dependencies for CUDA version + +It requires a `deb-src` line in your `sources.list`. +Continue with [compilation](installation.html#compilation). + +### For Ubuntu (\< 17.04) + **General dependencies** sudo apt-get install libprotobuf-dev libleveldb-dev libsnappy-dev libopencv-dev libhdf5-serial-dev protobuf-compiler diff --git a/docs/install_apt_debian.md b/docs/install_apt_debian.md index bd91124a898..0a6a3b962e5 100644 --- a/docs/install_apt_debian.md +++ b/docs/install_apt_debian.md @@ -8,24 +8,28 @@ Caffe packages are available for several Debian versions, as shown in the following chart: ``` -Your Distro | CPU_ONLY | CUDA | Alias +Your Distro | CPU_ONLY | CUDA | Codename ----------------+------------+--------+------------------- -Debian/stable | ✘ | ✘ | Debian Jessie -Debian/testing | ✔ | ✔ | Debian Stretch/Sid -Debian/unstable | ✔ | ✔ | Debian Sid +Debian/oldstable| ✘ | ✘ | Jessie (8.0) +Debian/stable | ✔ | ✔ | Stretch (9.0) +Debian/testing | ✔ | ✔ | Buster +Debian/unstable | ✔ | ✔ | Buster ``` * `✘ ` You should take a look at [Ubuntu installation instruction](install_apt.html). * `✔ ` You can install caffe with a single command line following this guide. 
-Last update: 2017-02-01 +* [Package status of CPU-only version](https://tracker.debian.org/pkg/caffe) + +* [Package status of CUDA version](https://tracker.debian.org/pkg/caffe-contrib) + +Last update: 2017-07-08 ## Binary installation with APT -Apart from the installation methods based on source, Debian/unstable -and Debian/testing users can install pre-compiled Caffe packages from -the official archive. +Apart from the installation methods based on source, Debian users can install +pre-compiled Caffe packages from the official archive with APT. Make sure that your `/etc/apt/sources.list` contains `contrib` and `non-free` sections if you want to install the CUDA version, for instance: @@ -44,7 +48,8 @@ $ caffe # command line interface wo $ python3 -c 'import caffe; print(caffe.__path__)' # python3 interface working ``` -These Caffe packages should work for you out of box. +These Caffe packages should work for you out of box. However, the CUDA version +may break if your NVIDIA driver and CUDA toolkit are not installed with APT. #### Customizing caffe packages @@ -156,10 +161,3 @@ and hack the packaging scripts, then build your customized package. $ sudo apt install caffe-doc $ dpkg -L caffe-doc ``` - -* Where can I find the Debian package status? - -``` -https://tracker.debian.org/pkg/caffe (for the CPU_ONLY version) -https://tracker.debian.org/pkg/caffe-contrib (for the CUDA version) -``` From eedf7c188708e097e63984528c94d3c8616dd5ff Mon Sep 17 00:00:00 2001 From: Zhou Mo Date: Sat, 8 Jul 2017 08:24:57 +0000 Subject: [PATCH 077/144] docs: add Ubuntu package tracker link in Ubuntu guide --- docs/install_apt.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/install_apt.md b/docs/install_apt.md index 43785f56d27..b6cb1c2d6f7 100644 --- a/docs/install_apt.md +++ b/docs/install_apt.md @@ -20,6 +20,10 @@ for CPU-only version, or for CUDA version. Note, the cuda version may break if your NVIDIA driver and CUDA toolkit are not installed by APT. 
+[Package status of CPU-only version](https://launchpad.net/ubuntu/+source/caffe) + +[Package status of CUDA version](https://launchpad.net/ubuntu/+source/caffe-contrib) + **Installing Caffe from source** We may install the dependencies by merely one line From 3d7cfc40c170f93ac88909f40ca0208269ee26a9 Mon Sep 17 00:00:00 2001 From: Lydorn Date: Mon, 10 Jul 2017 15:43:47 +0200 Subject: [PATCH 078/144] Update lrn.md Fixed typo "locaitons " -> "location" in line 17 --- docs/tutorial/layers/lrn.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorial/layers/lrn.md b/docs/tutorial/layers/lrn.md index 2fbef734663..f5e4829279d 100644 --- a/docs/tutorial/layers/lrn.md +++ b/docs/tutorial/layers/lrn.md @@ -14,7 +14,7 @@ title: Local Response Normalization (LRN) - `local_size` [default 5]: the number of channels to sum over (for cross channel LRN) or the side length of the square region to sum over (for within channel LRN) - `alpha` [default 1]: the scaling parameter (see below) - `beta` [default 5]: the exponent (see below) - - `norm_region` [default `ACROSS_CHANNELS`]: whether to sum over adjacent channels (`ACROSS_CHANNELS`) or nearby spatial locaitons (`WITHIN_CHANNEL`) + - `norm_region` [default `ACROSS_CHANNELS`]: whether to sum over adjacent channels (`ACROSS_CHANNELS`) or nearby spatial locations (`WITHIN_CHANNEL`) The local response normalization layer performs a kind of "lateral inhibition" by normalizing over local input regions. In `ACROSS_CHANNELS` mode, the local regions extend across nearby channels, but have no spatial extent (i.e., they have shape `local_size x 1 x 1`). In `WITHIN_CHANNEL` mode, the local regions extend spatially, but are in separate channels (i.e., they have shape `1 x local_size x local_size`). Each input value is divided by $$(1 + (\alpha/n) \sum_i x_i^2)^\beta$$, where $$n$$ is the size of each local region, and the sum is taken over the region centered at that value (zero padding is added where necessary). 
From af9d6bc0446f479a7c7cff870de4da5df19fddd1 Mon Sep 17 00:00:00 2001 From: downes Date: Wed, 12 Jul 2017 14:50:40 -0700 Subject: [PATCH 079/144] update sklearn calls to use latest API Version 0.18 moved cross-validation to sklearn.model_selection - see http://scikit-learn.org/stable/whats_new.html#version-0-18 Version 0.17 deprecated class_weight="auto" in favor of class_weight="balanced" --- examples/brewing-logreg.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/brewing-logreg.ipynb b/examples/brewing-logreg.ipynb index c053b73b39f..4b4cd6a3f02 100644 --- a/examples/brewing-logreg.ipynb +++ b/examples/brewing-logreg.ipynb @@ -73,7 +73,7 @@ ")\n", "\n", "# Split into train and test\n", - "X, Xt, y, yt = sklearn.cross_validation.train_test_split(X, y)\n", + "X, Xt, y, yt = sklearn.model_selection.train_test_split(X, y)\n", "\n", "# Visualize sample of the data\n", "ind = np.random.permutation(X.shape[0])[:1000]\n", @@ -111,7 +111,7 @@ "%%timeit\n", "# Train and test the scikit-learn SGD logistic regression.\n", "clf = sklearn.linear_model.SGDClassifier(\n", - " loss='log', n_iter=1000, penalty='l2', alpha=5e-4, class_weight='auto')\n", + " loss='log', n_iter=1000, penalty='l2', alpha=5e-4, class_weight='balanced')\n", "\n", "clf.fit(X, y)\n", "yt_pred = clf.predict(Xt)\n", From 4b98f06c03c7cb84163ba7f681dbe9185fdcc5f9 Mon Sep 17 00:00:00 2001 From: downes Date: Wed, 12 Jul 2017 14:52:53 -0700 Subject: [PATCH 080/144] update deprecated pandas call pd.scatter_matrix -> pd.plotting.scatter_matrix --- examples/brewing-logreg.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/brewing-logreg.ipynb b/examples/brewing-logreg.ipynb index 4b4cd6a3f02..0f87185a35b 100644 --- a/examples/brewing-logreg.ipynb +++ b/examples/brewing-logreg.ipynb @@ -78,7 +78,7 @@ "# Visualize sample of the data\n", "ind = np.random.permutation(X.shape[0])[:1000]\n", "df = pd.DataFrame(X[ind])\n", - "_ = pd.scatter_matrix(df, 
figsize=(9, 9), diagonal='kde', marker='o', s=40, alpha=.4, c=y[ind])" + "_ = pd.plotting.scatter_matrix(df, figsize=(9, 9), diagonal='kde', marker='o', s=40, alpha=.4, c=y[ind])" ] }, { From 751a570a3674e2d77e157a7e365fe68f05505157 Mon Sep 17 00:00:00 2001 From: Jean-Louis Queguiner Date: Tue, 1 Aug 2017 11:16:29 +0200 Subject: [PATCH 081/144] [DOC][FIX] fix web demo install instruction link --- examples/web_demo/readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/web_demo/readme.md b/examples/web_demo/readme.md index fe74b9ef7d3..e50c4f101ee 100644 --- a/examples/web_demo/readme.md +++ b/examples/web_demo/readme.md @@ -11,7 +11,7 @@ priority: 10 ## Requirements The demo server requires Python with some dependencies. -To make sure you have the dependencies, please run `pip install -r examples/web_demo/requirements.txt`, and also make sure that you've compiled the Python Caffe interface and that it is on your `PYTHONPATH` (see [installation instructions](/installation.html)). +To make sure you have the dependencies, please run `pip install -r examples/web_demo/requirements.txt`, and also make sure that you've compiled the Python Caffe interface and that it is on your `PYTHONPATH` (see [installation instructions](http://caffe.berkeleyvision.org/installation.html)). Make sure that you have obtained the Reference CaffeNet Model and the ImageNet Auxiliary Data: From 315641b7ef8624b756ed042a7e9330ecde3782e7 Mon Sep 17 00:00:00 2001 From: Keith Mok Date: Fri, 4 Aug 2017 13:55:34 -0700 Subject: [PATCH 082/144] Fix hardcode xcode path User may not install xcoder into default directory especially if there are two different versions of xcoder installed. 
--- cmake/Modules/FindvecLib.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmake/Modules/FindvecLib.cmake b/cmake/Modules/FindvecLib.cmake index 8eaab59473c..4d44e613a00 100644 --- a/cmake/Modules/FindvecLib.cmake +++ b/cmake/Modules/FindvecLib.cmake @@ -12,11 +12,12 @@ endif() set(__veclib_include_suffix "Frameworks/vecLib.framework/Versions/Current/Headers") +exec_program(xcode-select ARGS -print-path OUTPUT_VARIABLE CMAKE_XCODE_DEVELOPER_DIR) find_path(vecLib_INCLUDE_DIR vecLib.h DOC "vecLib include directory" PATHS /System/Library/Frameworks/Accelerate.framework/Versions/Current/${__veclib_include_suffix} /System/Library/${__veclib_include_suffix} - /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/ + ${CMAKE_XCODE_DEVELOPER_DIR}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/ NO_DEFAULT_PATH) include(FindPackageHandleStandardArgs) From 1de4cebfb81d50267d0d8c2595372b14e1408248 Mon Sep 17 00:00:00 2001 From: Cyprien Noel Date: Wed, 16 Aug 2017 18:24:32 -0700 Subject: [PATCH 083/144] Update README.md Mention SKX support --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5148c69d310..fe259535865 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ and step-by-step examples. ## Custom distributions - - [Intel Caffe](https://github.com/BVLC/caffe/tree/intel) (Optimized for CPU and support for multi-node), in particular Xeon processors (HSW, BDW, Xeon Phi). + - [Intel Caffe](https://github.com/BVLC/caffe/tree/intel) (Optimized for CPU and support for multi-node), in particular Xeon processors (HSW, BDW, SKX, Xeon Phi). - [OpenCL Caffe](https://github.com/BVLC/caffe/tree/opencl) e.g. for AMD or Intel devices. 
- [Windows Caffe](https://github.com/BVLC/caffe/tree/windows) From 7b721207235732846666143070f4990707bcb66d Mon Sep 17 00:00:00 2001 From: Josh Bialkowski Date: Mon, 21 Aug 2017 08:31:25 -0700 Subject: [PATCH 084/144] Fix caffe rpath The logic for setting the library RPATH checks whether or not ${CMAKE_INSTALL_PREFIX}/lib is a system directory, and if not adds it to the library RPATH. However, caffe does not install to ${CMAKE_INSTALL_PREFIX}/lib, it installs to ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR} (from GNUInstallDirs). CMAKE_INSTALL_LIBDIR may be something like "lib/x86_64-linux-gnu" --- cmake/Misc.cmake | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cmake/Misc.cmake b/cmake/Misc.cmake index 9dd2609b36a..fcb246472f0 100644 --- a/cmake/Misc.cmake +++ b/cmake/Misc.cmake @@ -32,9 +32,10 @@ endif() set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE CACHE BOOLEAN "Use link paths for shared library rpath") set(CMAKE_MACOSX_RPATH TRUE) -list(FIND CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES ${CMAKE_INSTALL_PREFIX}/lib __is_systtem_dir) +list(FIND CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES + ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR} __is_systtem_dir) if(${__is_systtem_dir} STREQUAL -1) - set(CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib) + set(CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}) endif() # ---[ Funny target From 1f1326d046773d31a9f0916d8f5f8ccaa67bde46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=AD=A4=E9=97=B4?= Date: Tue, 22 Aug 2017 07:36:09 +0800 Subject: [PATCH 085/144] Update link to google style guide. --- docs/development.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/development.md b/docs/development.md index ec05bbee102..36cd399512e 100644 --- a/docs/development.md +++ b/docs/development.md @@ -116,5 +116,5 @@ To get a list of all options `googletest` provides, simply pass the `--help` fla - **Run `make lint` to check C++ code.** - Wrap lines at 80 chars. 
-- Follow [Google C++ style](http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml) and [Google python style](http://google-styleguide.googlecode.com/svn/trunk/pyguide.html) + [PEP 8](http://legacy.python.org/dev/peps/pep-0008/). +- Follow [Google C++ style](https://google.github.io/styleguide/cppguide.html) and [Google python style](https://google.github.io/styleguide/pyguide.html) + [PEP 8](http://legacy.python.org/dev/peps/pep-0008/). - Remember that “a foolish consistency is the hobgoblin of little minds,” so use your best judgement to write the clearest code for your particular case. From 3dad3323436e05a4c1890104b0f26f27f6d77d31 Mon Sep 17 00:00:00 2001 From: wasnot Date: Wed, 6 Sep 2017 12:32:39 +0900 Subject: [PATCH 086/144] modified division operator for compatibility of python 3 --- python/caffe/classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/caffe/classifier.py b/python/caffe/classifier.py index ea29fed86f9..983760a786d 100644 --- a/python/caffe/classifier.py +++ b/python/caffe/classifier.py @@ -92,7 +92,7 @@ def predict(self, inputs, oversample=True): # For oversampling, average predictions across crops. if oversample: - predictions = predictions.reshape((len(predictions) / 10, 10, -1)) + predictions = predictions.reshape((len(predictions) // 10, 10, -1)) predictions = predictions.mean(1) return predictions From 8bdc87f6bc6a7d05d2fdbee2cfc159003297476b Mon Sep 17 00:00:00 2001 From: Takuya Narihira Date: Wed, 11 Mar 2015 16:23:39 -0700 Subject: [PATCH 087/144] Expose GPU pointers to Python The pointers could be used by CUDA wrapper libraries in Python such as PyCUDA, gnumpy, Theano etc. 
--- python/caffe/_caffe.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp index d7f43fff62d..72659a4f44e 100644 --- a/python/caffe/_caffe.cpp +++ b/python/caffe/_caffe.cpp @@ -464,6 +464,14 @@ BOOST_PYTHON_MODULE(_caffe) { .add_property("count", static_cast::*)() const>( &Blob::count)) .def("reshape", bp::raw_function(&Blob_Reshape)) +#ifndef CPU_ONLY + .add_property("_gpu_data_ptr", + reinterpret_cast::*)()>( + &Blob::mutable_gpu_data)) + .add_property("_gpu_diff_ptr", + reinterpret_cast::*)()>( + &Blob::mutable_gpu_diff)) +#endif .add_property("data", bp::make_function(&Blob::mutable_cpu_data, NdarrayCallPolicies())) .add_property("diff", bp::make_function(&Blob::mutable_cpu_diff, From d2625f86ac9304467276cf69a7eda0c93f5e242e Mon Sep 17 00:00:00 2001 From: Bo Wang Date: Sat, 9 Sep 2017 21:57:20 -0700 Subject: [PATCH 088/144] Implement CuDNN-based deconvolution layer and test --- include/caffe/layers/cudnn_deconv_layer.hpp | 68 ++++ src/caffe/layer_factory.cpp | 41 +++ src/caffe/layers/cudnn_deconv_layer.cpp | 327 ++++++++++++++++++++ src/caffe/layers/cudnn_deconv_layer.cu | 138 +++++++++ src/caffe/layers/deconv_layer.cpp | 1 - src/caffe/test/test_deconvolution_layer.cpp | 265 ++++++++++++++++ 6 files changed, 839 insertions(+), 1 deletion(-) create mode 100644 include/caffe/layers/cudnn_deconv_layer.hpp create mode 100644 src/caffe/layers/cudnn_deconv_layer.cpp create mode 100644 src/caffe/layers/cudnn_deconv_layer.cu diff --git a/include/caffe/layers/cudnn_deconv_layer.hpp b/include/caffe/layers/cudnn_deconv_layer.hpp new file mode 100644 index 00000000000..34095e5c44d --- /dev/null +++ b/include/caffe/layers/cudnn_deconv_layer.hpp @@ -0,0 +1,68 @@ +#ifndef CAFFE_CUDNN_DECONV_LAYER_HPP_ +#define CAFFE_CUDNN_DECONV_LAYER_HPP_ + +#include + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +#include "caffe/layers/deconv_layer.hpp" + +namespace caffe { + +#ifdef 
USE_CUDNN +/* + * @brief cuDNN implementation of DeConvolutionLayer. + * Fallback to DeConvolutionLayer for CPU mode. + * + * cuDNN accelerates deconvolution through forward kernels for filtering and + * bias plus backward kernels for the gradient w.r.t. the filters, biases, and + * inputs. Caffe + cuDNN further speeds up the computation through forward + * parallelism across groups and backward parallelism across gradients. +*/ +template +class CuDNNDeconvolutionLayer : public DeconvolutionLayer { +public: + explicit CuDNNDeconvolutionLayer(const LayerParameter& param) + : DeconvolutionLayer(param), handles_setup_(false) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + virtual ~CuDNNDeconvolutionLayer(); + +protected: + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom); + + bool handles_setup_; + cudnnHandle_t* handle_; + cudaStream_t* stream_; + + // algorithms for forward and backwards convolutions + cudnnConvolutionFwdAlgo_t *fwd_algo_; + cudnnConvolutionBwdFilterAlgo_t *bwd_filter_algo_; + cudnnConvolutionBwdDataAlgo_t *bwd_data_algo_; + + vector bottom_descs_, top_descs_; + cudnnTensorDescriptor_t bias_desc_; + cudnnFilterDescriptor_t filter_desc_; + vector conv_descs_; + int bottom_offset_, top_offset_, bias_offset_; + + size_t *workspace_fwd_sizes_; + size_t *workspace_bwd_data_sizes_; + size_t *workspace_bwd_filter_sizes_; + size_t workspaceSizeInBytes; // size of underlying storage + void *workspaceData; // underlying storage + void **workspace; // aliases into workspaceData +}; +#endif + +} // namespace caffe + +#endif // CAFFE_CUDNN_DECONV_LAYER_HPP_ diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp index f14253a510e..9f9026b1dde 100644 --- a/src/caffe/layer_factory.cpp +++ b/src/caffe/layer_factory.cpp @@ 
-8,6 +8,7 @@ #include "caffe/layer.hpp" #include "caffe/layer_factory.hpp" #include "caffe/layers/conv_layer.hpp" +#include "caffe/layers/deconv_layer.hpp" #include "caffe/layers/lrn_layer.hpp" #include "caffe/layers/pooling_layer.hpp" #include "caffe/layers/relu_layer.hpp" @@ -18,6 +19,7 @@ #ifdef USE_CUDNN #include "caffe/layers/cudnn_conv_layer.hpp" +#include "caffe/layers/cudnn_deconv_layer.hpp" #include "caffe/layers/cudnn_lcn_layer.hpp" #include "caffe/layers/cudnn_lrn_layer.hpp" #include "caffe/layers/cudnn_pooling_layer.hpp" @@ -73,6 +75,45 @@ shared_ptr > GetConvolutionLayer( REGISTER_LAYER_CREATOR(Convolution, GetConvolutionLayer); +// Get deconvolution layer according to engine. +template +shared_ptr > GetDeconvolutionLayer(const LayerParameter& param) { + ConvolutionParameter conv_param = param.convolution_param(); + ConvolutionParameter_Engine engine = conv_param.engine(); +#ifdef USE_CUDNN + bool use_dilation = false; + for (int i = 0; i < conv_param.dilation_size(); ++i) { + if (conv_param.dilation(i) > 1) { + use_dilation = true; + } + } +#endif + if (engine == ConvolutionParameter_Engine_DEFAULT) { + engine = ConvolutionParameter_Engine_CAFFE; +#ifdef USE_CUDNN + if (!use_dilation) { + engine = ConvolutionParameter_Engine_CUDNN; + } +#endif + } + if (engine == ConvolutionParameter_Engine_CAFFE) { + return shared_ptr >(new DeconvolutionLayer(param)); +#ifdef USE_CUDNN + } else if (engine == ConvolutionParameter_Engine_CUDNN) { + if (use_dilation) { + LOG(FATAL) << "CuDNN doesn't support the dilated deconvolution at Layer " + << param.name(); + } + return shared_ptr >(new CuDNNDeconvolutionLayer(param)); +#endif + } else { + LOG(FATAL) << "Layer " << param.name() << " has unknown engine."; + throw; // Avoids missing return warning + } +} + +REGISTER_LAYER_CREATOR(Deconvolution, GetDeconvolutionLayer); + // Get pooling layer according to engine. 
template shared_ptr > GetPoolingLayer(const LayerParameter& param) { diff --git a/src/caffe/layers/cudnn_deconv_layer.cpp b/src/caffe/layers/cudnn_deconv_layer.cpp new file mode 100644 index 00000000000..260da5c1ee0 --- /dev/null +++ b/src/caffe/layers/cudnn_deconv_layer.cpp @@ -0,0 +1,327 @@ +#ifdef USE_CUDNN +#include +#include + +#include "caffe/layers/cudnn_deconv_layer.hpp" + +namespace caffe { + +// Set to three for the benefit of the backward pass, which +// can use separate streams for calculating the gradient w.r.t. +// bias, filter weights, and bottom data for each group independently +#define CUDNN_STREAMS_PER_GROUP 3 + +/** + * TODO(dox) explain cuDNN interface + */ +template +void CuDNNDeconvolutionLayer::LayerSetUp( + const vector*>& bottom, const vector*>& top) { + DeconvolutionLayer::LayerSetUp(bottom, top); + // Initialize CUDA streams and cuDNN. + stream_ = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP]; + handle_ = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP]; + + // Initialize algorithm arrays + fwd_algo_ = new cudnnConvolutionFwdAlgo_t[bottom.size()]; + bwd_filter_algo_= new cudnnConvolutionBwdFilterAlgo_t[bottom.size()]; + bwd_data_algo_ = new cudnnConvolutionBwdDataAlgo_t[bottom.size()]; + + // initialize size arrays + workspace_fwd_sizes_ = new size_t[bottom.size()]; + workspace_bwd_filter_sizes_ = new size_t[bottom.size()]; + workspace_bwd_data_sizes_ = new size_t[bottom.size()]; + + // workspace data + workspaceSizeInBytes = 0; + workspaceData = NULL; + workspace = new void*[this->group_ * CUDNN_STREAMS_PER_GROUP]; + + for (size_t i = 0; i < bottom.size(); ++i) { + // initialize all to default algorithms + fwd_algo_[i] = (cudnnConvolutionFwdAlgo_t)0; + bwd_filter_algo_[i] = (cudnnConvolutionBwdFilterAlgo_t)0; + bwd_data_algo_[i] = (cudnnConvolutionBwdDataAlgo_t)0; + // default algorithms don't require workspace + workspace_fwd_sizes_[i] = 0; + workspace_bwd_data_sizes_[i] = 0; + workspace_bwd_filter_sizes_[i] = 0; + 
} + + for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) { + CUDA_CHECK(cudaStreamCreate(&stream_[g])); + CUDNN_CHECK(cudnnCreate(&handle_[g])); + CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g])); + workspace[g] = NULL; + } + + // Set the indexing parameters. + bias_offset_ = (this->num_output_ / this->group_); + + // Create filter descriptor. + const int* kernel_shape_data = this->kernel_shape_.cpu_data(); + const int kernel_h = kernel_shape_data[0]; + const int kernel_w = kernel_shape_data[1]; + cudnn::createFilterDesc(&filter_desc_, + this->channels_ / this->group_, + this->num_output_ / this->group_, + kernel_h, + kernel_w); + + // Create tensor descriptor(s) for data and corresponding convolution(s). + for (int i = 0; i < bottom.size(); i++) { + cudnnTensorDescriptor_t bottom_desc; + cudnn::createTensor4dDesc(&bottom_desc); + bottom_descs_.push_back(bottom_desc); + cudnnTensorDescriptor_t top_desc; + cudnn::createTensor4dDesc(&top_desc); + top_descs_.push_back(top_desc); + cudnnConvolutionDescriptor_t conv_desc; + cudnn::createConvolutionDesc(&conv_desc); + conv_descs_.push_back(conv_desc); + } + + // Tensor descriptor for bias. + if (this->bias_term_) { + cudnn::createTensor4dDesc(&bias_desc_); + } + + handles_setup_ = true; +} + +template +void CuDNNDeconvolutionLayer::Reshape( + const vector*>& bottom, const vector*>& top) { + DeconvolutionLayer::Reshape(bottom, top); + CHECK_EQ(2, this->num_spatial_axes_) + << "CuDNNDeconvolutionLayer input must have 2 spatial axes " + << "(e.g., height and width). 
" + << "Use 'engine: CAFFE' for general ND convolution."; + bottom_offset_ = this->bottom_dim_ / this->group_; + top_offset_ = this->top_dim_ / this->group_; + const int height = bottom[0]->shape(this->channel_axis_ + 1); + const int width = bottom[0]->shape(this->channel_axis_ + 2); + const int height_out = top[0]->shape(this->channel_axis_ + 1); + const int width_out = top[0]->shape(this->channel_axis_ + 2); + const int* pad_data = this->pad_.cpu_data(); + const int pad_h = pad_data[0]; + const int pad_w = pad_data[1]; + const int* stride_data = this->stride_.cpu_data(); + const int stride_h = stride_data[0]; + const int stride_w = stride_data[1]; + + // Specify workspace limit for kernels directly until we have a + // planning strategy and a rewrite of Caffe's GPU memory mangagement + size_t workspace_limit_bytes = 8*1024*1024; + + for (int i = 0; i < bottom.size(); i++) { + cudnn::setTensor4dDesc(&bottom_descs_[i], + this->num_, + this->channels_ / this->group_, + height, + width, + this->channels_ * height * width, + height * width, + width, + 1); + cudnn::setTensor4dDesc(&top_descs_[i], + this->num_, + this->num_output_ / this->group_, + height_out, + width_out, + this->num_output_ * height_out * width_out, + height_out * width_out, + width_out, + 1); + cudnn::setConvolutionDesc(&conv_descs_[i], + top_descs_[i], + filter_desc_, + pad_h, + pad_w, + stride_h, + stride_w); + + // choose forward and backward algorithms + workspace(s) + CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm( + handle_[0], + top_descs_[i], + filter_desc_, + conv_descs_[i], + bottom_descs_[i], + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_limit_bytes, + &fwd_algo_[i])); + + // We have found that CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM is + // buggy. Thus, if this algo was chosen, choose winograd instead. If + // winograd is not supported or workspace is larger than threshold, choose + // implicit_gemm instead. 
+ if (fwd_algo_[i] == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) { + size_t winograd_workspace_size; + cudnnStatus_t status = cudnnGetConvolutionForwardWorkspaceSize( + handle_[0], + top_descs_[i], + filter_desc_, + conv_descs_[i], + bottom_descs_[i], + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, + &winograd_workspace_size); + if (status != CUDNN_STATUS_SUCCESS || + winograd_workspace_size >= workspace_limit_bytes) { + fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; + } else { + fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD; + } + } + + CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize( + handle_[0], + top_descs_[i], + filter_desc_, + conv_descs_[i], + bottom_descs_[i], + fwd_algo_[i], + &(workspace_fwd_sizes_[i]))); + + // choose backward algorithm for filter + CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm( + handle_[0], + top_descs_[i], + bottom_descs_[i], + conv_descs_[i], + filter_desc_, + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + workspace_limit_bytes, + &bwd_filter_algo_[i])); + + // get workspace for backwards filter algorithm + CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize( + handle_[0], + top_descs_[i], + bottom_descs_[i], + conv_descs_[i], + filter_desc_, + bwd_filter_algo_[i], + &workspace_bwd_filter_sizes_[i])); + + // choose backward algo for data + CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm( + handle_[0], + filter_desc_, + bottom_descs_[i], + conv_descs_[i], + top_descs_[i], + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + workspace_limit_bytes, + &bwd_data_algo_[i])); + + // get workspace size + CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize( + handle_[0], + filter_desc_, + bottom_descs_[i], + conv_descs_[i], + top_descs_[i], + bwd_data_algo_[i], + &workspace_bwd_data_sizes_[i])); + } + + // reduce over all workspace sizes to get a maximum to allocate / reallocate + size_t total_workspace_fwd = 0; + size_t total_workspace_bwd_data = 0; + size_t total_workspace_bwd_filter = 
0; + + for (size_t i = 0; i < bottom.size(); i++) { + total_workspace_fwd = std::max(total_workspace_fwd, + workspace_fwd_sizes_[i]); + total_workspace_bwd_data = std::max(total_workspace_bwd_data, + workspace_bwd_data_sizes_[i]); + total_workspace_bwd_filter = std::max(total_workspace_bwd_filter, + workspace_bwd_filter_sizes_[i]); + } + // get max over all operations + size_t max_workspace = std::max(total_workspace_fwd, + total_workspace_bwd_data); + max_workspace = std::max(max_workspace, total_workspace_bwd_filter); + // ensure all groups have enough workspace + size_t total_max_workspace = max_workspace * + (this->group_ * CUDNN_STREAMS_PER_GROUP); + + // this is the total amount of storage needed over all groups + streams + if (total_max_workspace > workspaceSizeInBytes) { + DLOG(INFO) << "Reallocating workspace storage: " << total_max_workspace; + workspaceSizeInBytes = total_max_workspace; + + // free the existing workspace and allocate a new (larger) one + cudaFree(this->workspaceData); + + cudaError_t err = cudaMalloc(&(this->workspaceData), workspaceSizeInBytes); + if (err != cudaSuccess) { + // force zero memory path + for (int i = 0; i < bottom.size(); i++) { + workspace_fwd_sizes_[i] = 0; + workspace_bwd_filter_sizes_[i] = 0; + workspace_bwd_data_sizes_[i] = 0; + fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING; + bwd_filter_algo_[i] = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; + bwd_data_algo_[i] = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; + } + + // NULL out all workspace pointers + for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) { + workspace[g] = NULL; + } + // NULL out underlying data + workspaceData = NULL; + workspaceSizeInBytes = 0; + } + + // if we succeed in the allocation, set pointer aliases for workspaces + for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) { + workspace[g] = reinterpret_cast(workspaceData) + g*max_workspace; + } + } + + // Tensor descriptor for bias. 
+ if (this->bias_term_) { + cudnn::setTensor4dDesc( + &bias_desc_, 1, this->num_output_ / this->group_, 1, 1); + } +} + +template +CuDNNDeconvolutionLayer::~CuDNNDeconvolutionLayer() { + // Check that handles have been setup before destroying. + if (!handles_setup_) { return; } + + for (int i = 0; i < bottom_descs_.size(); i++) { + cudnnDestroyTensorDescriptor(bottom_descs_[i]); + cudnnDestroyTensorDescriptor(top_descs_[i]); + cudnnDestroyConvolutionDescriptor(conv_descs_[i]); + } + if (this->bias_term_) { + cudnnDestroyTensorDescriptor(bias_desc_); + } + cudnnDestroyFilterDescriptor(filter_desc_); + + for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) { + cudaStreamDestroy(stream_[g]); + cudnnDestroy(handle_[g]); + } + + cudaFree(workspaceData); + delete [] workspace; + delete [] stream_; + delete [] handle_; + delete [] fwd_algo_; + delete [] bwd_filter_algo_; + delete [] bwd_data_algo_; + delete [] workspace_fwd_sizes_; + delete [] workspace_bwd_data_sizes_; + delete [] workspace_bwd_filter_sizes_; +} + +INSTANTIATE_CLASS(CuDNNDeconvolutionLayer); + +} // namespace caffe +#endif diff --git a/src/caffe/layers/cudnn_deconv_layer.cu b/src/caffe/layers/cudnn_deconv_layer.cu new file mode 100644 index 00000000000..eb1df32918f --- /dev/null +++ b/src/caffe/layers/cudnn_deconv_layer.cu @@ -0,0 +1,138 @@ +#ifdef USE_CUDNN +#include + +#include "caffe/layers/cudnn_deconv_layer.hpp" + +namespace caffe { + +__global__ void sync_deconv_groups() {} + +template +void CuDNNDeconvolutionLayer::Forward_gpu( + const vector*>& bottom, const vector*>& top) { + const Dtype* weight = this->blobs_[0]->gpu_data(); + for (int i = 0; i < bottom.size(); ++i) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + Dtype* top_data = top[i]->mutable_gpu_data(); + + // Forward through cuDNN in parallel over groups. + for (int g = 0; g < this->group_; g++) { + // Filters. 
+ CUDNN_CHECK(cudnnConvolutionBackwardData( + handle_[g], + cudnn::dataType::one, + filter_desc_, + weight + this->weight_offset_ * g, + bottom_descs_[i], + bottom_data + bottom_offset_ * g, + conv_descs_[i], + bwd_data_algo_[i], + workspace[g], + workspace_bwd_data_sizes_[i], + cudnn::dataType::zero, + top_descs_[i], + top_data + top_offset_ * g)); + + // Bias. + if (this->bias_term_) { + const Dtype* bias_data = this->blobs_[1]->gpu_data(); + CUDNN_CHECK(cudnnAddTensor(handle_[g], + cudnn::dataType::one, + bias_desc_, + bias_data + bias_offset_ * g, + cudnn::dataType::one, + top_descs_[i], + top_data + top_offset_ * g)); + } + } + + // Synchronize the work across groups, each of which went into its own + // stream, by launching an empty kernel into the default (null) stream. + // NOLINT_NEXT_LINE(whitespace/operators) + sync_deconv_groups<<<1, 1>>>(); + } +} + +template +void CuDNNDeconvolutionLayer::Backward_gpu( + const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + const Dtype* weight = NULL; + Dtype* weight_diff = NULL; + if (this->param_propagate_down_[0]) { + weight = this->blobs_[0]->gpu_data(); + weight_diff = this->blobs_[0]->mutable_gpu_diff(); + } + Dtype* bias_diff = NULL; + if (this->bias_term_ && this->param_propagate_down_[1]) { + bias_diff = this->blobs_[1]->mutable_gpu_diff(); + } + for (int i = 0; i < top.size(); ++i) { + const Dtype* top_diff = top[i]->gpu_diff(); + // Backward through cuDNN in parallel over groups and gradients. + for (int g = 0; g < this->group_; g++) { + // Gradient w.r.t. bias. + if (this->bias_term_ && this->param_propagate_down_[1]) { + CUDNN_CHECK(cudnnConvolutionBackwardBias(handle_[0 * this->group_ + g], + cudnn::dataType::one, + top_descs_[i], + top_diff + top_offset_ * g, + cudnn::dataType::one, + bias_desc_, + bias_diff + bias_offset_ * g)); + } + + // Gradient w.r.t. weights. 
+ if (this->param_propagate_down_[0]) { + const Dtype* bottom_data = bottom[i]->gpu_data(); + CUDNN_CHECK(cudnnConvolutionBackwardFilter( + handle_[1 * this->group_ + g], + cudnn::dataType::one, + top_descs_[i], + top_diff + top_offset_ * g, + bottom_descs_[i], + bottom_data + bottom_offset_ * g, + conv_descs_[i], + bwd_filter_algo_[i], + workspace[1 * this->group_ + g], + workspace_bwd_filter_sizes_[i], + cudnn::dataType::one, + filter_desc_, + weight_diff + this->weight_offset_ * g)); + } + + // Gradient w.r.t. bottom data. + if (propagate_down[i]) { + if (weight == NULL) { + weight = this->blobs_[0]->gpu_data(); + } + Dtype* bottom_diff = bottom[i]->mutable_gpu_diff(); + CUDNN_CHECK( + cudnnConvolutionForward(handle_[2 * this->group_ + g], + cudnn::dataType::one, + top_descs_[i], + top_diff + top_offset_ * g, + filter_desc_, + weight + this->weight_offset_ * g, + conv_descs_[i], + fwd_algo_[i], + workspace[2 * this->group_ + g], + workspace_fwd_sizes_[i], + cudnn::dataType::zero, + bottom_descs_[i], + bottom_diff + bottom_offset_ * g)); + } + } + + // Synchronize the work across groups, each of which went into its own + // stream, by launching an empty kernel into the default (null) stream. 
+ // NOLINT_NEXT_LINE(whitespace/operators) + sync_deconv_groups<<<1, 1>>>(); + } +} + +INSTANTIATE_LAYER_GPU_FUNCS(CuDNNDeconvolutionLayer); + +} // namespace caffe +#endif diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp index 20a460fbdea..b86472b34be 100644 --- a/src/caffe/layers/deconv_layer.cpp +++ b/src/caffe/layers/deconv_layer.cpp @@ -79,6 +79,5 @@ STUB_GPU(DeconvolutionLayer); #endif INSTANTIATE_CLASS(DeconvolutionLayer); -REGISTER_LAYER_CLASS(Deconvolution); } // namespace caffe diff --git a/src/caffe/test/test_deconvolution_layer.cpp b/src/caffe/test/test_deconvolution_layer.cpp index c4b09ad555a..0067907165f 100644 --- a/src/caffe/test/test_deconvolution_layer.cpp +++ b/src/caffe/test/test_deconvolution_layer.cpp @@ -6,6 +6,7 @@ #include "caffe/common.hpp" #include "caffe/filler.hpp" #include "caffe/layers/deconv_layer.hpp" +#include "caffe/layers/cudnn_deconv_layer.hpp" #include "caffe/test/test_caffe_main.hpp" #include "caffe/test/test_gradient_check_util.hpp" @@ -301,4 +302,268 @@ TYPED_TEST(DeconvolutionLayerTest, TestGradient3D) { this->blob_top_vec_); } +#ifdef USE_CUDNN + +// Since ConvolutionLayerTest checks the shared conv/deconv code in detail, +// we'll just do a simple forward test and a gradient check. 
+template +class CuDNNDeconvolutionLayerTest : public MultiDeviceTest { + typedef typename TypeParam::Dtype Dtype; + + protected: + CuDNNDeconvolutionLayerTest() + : blob_bottom_(new Blob(2, 3, 6, 4)), + blob_bottom_2_(new Blob(2, 3, 6, 4)), + blob_top_(new Blob()), + blob_top_2_(new Blob()) {} + virtual void SetUp() { + // fill the values + FillerParameter filler_param; + filler_param.set_value(1.); + GaussianFiller filler(filler_param); + filler.Fill(this->blob_bottom_); + filler.Fill(this->blob_bottom_2_); + blob_bottom_vec_.push_back(blob_bottom_); + blob_top_vec_.push_back(blob_top_); + } + + virtual ~CuDNNDeconvolutionLayerTest() { + delete blob_bottom_; + delete blob_bottom_2_; + delete blob_top_; + delete blob_top_2_; + } + + Blob* const blob_bottom_; + Blob* const blob_bottom_2_; + Blob* const blob_top_; + Blob* const blob_top_2_; + vector*> blob_bottom_vec_; + vector*> blob_top_vec_; +}; + +TYPED_TEST_CASE(CuDNNDeconvolutionLayerTest, TestDtypesAndDevices); + +TYPED_TEST(CuDNNDeconvolutionLayerTest, TestSetup) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + ConvolutionParameter* convolution_param = + layer_param.mutable_convolution_param(); + convolution_param->add_kernel_size(3); + convolution_param->add_stride(2); + convolution_param->set_num_output(4); + this->blob_bottom_vec_.push_back(this->blob_bottom_2_); + this->blob_top_vec_.push_back(this->blob_top_2_); + shared_ptr > layer( + new CuDNNDeconvolutionLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + EXPECT_EQ(this->blob_top_->num(), 2); + EXPECT_EQ(this->blob_top_->channels(), 4); + EXPECT_EQ(this->blob_top_->height(), 13); + EXPECT_EQ(this->blob_top_->width(), 9); + EXPECT_EQ(this->blob_top_2_->num(), 2); + EXPECT_EQ(this->blob_top_2_->channels(), 4); + EXPECT_EQ(this->blob_top_2_->height(), 13); + EXPECT_EQ(this->blob_top_2_->width(), 9); + // setting group should not change the shape + convolution_param->set_num_output(3); + 
convolution_param->set_group(3); + layer.reset(new CuDNNDeconvolutionLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + EXPECT_EQ(this->blob_top_->num(), 2); + EXPECT_EQ(this->blob_top_->channels(), 3); + EXPECT_EQ(this->blob_top_->height(), 13); + EXPECT_EQ(this->blob_top_->width(), 9); + EXPECT_EQ(this->blob_top_2_->num(), 2); + EXPECT_EQ(this->blob_top_2_->channels(), 3); + EXPECT_EQ(this->blob_top_2_->height(), 13); + EXPECT_EQ(this->blob_top_2_->width(), 9); +} + +TYPED_TEST(CuDNNDeconvolutionLayerTest, TestSimpleCuDNNDeconvolution) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_2_); + this->blob_top_vec_.push_back(this->blob_top_2_); + LayerParameter layer_param; + ConvolutionParameter* convolution_param = + layer_param.mutable_convolution_param(); + convolution_param->add_kernel_size(3); + convolution_param->add_stride(2); + convolution_param->set_num_output(4); + convolution_param->mutable_weight_filler()->set_type("constant"); + convolution_param->mutable_weight_filler()->set_value(1); + convolution_param->mutable_bias_filler()->set_type("constant"); + convolution_param->mutable_bias_filler()->set_value(0.1); + shared_ptr > layer( + new CuDNNDeconvolutionLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + // constant-fill the bottom blobs + FillerParameter filler_param; + filler_param.set_value(1.); + ConstantFiller filler(filler_param); + filler.Fill(this->blob_bottom_); + filler.Fill(this->blob_bottom_2_); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + // simply check that accumulation works with overlapping filters + const Dtype* top_data = this->blob_top_->cpu_data(); + for (int n = 0; n < this->blob_top_->num(); ++n) { + for (int c = 0; c < this->blob_top_->channels(); ++c) { + for (int h = 0; h < this->blob_top_->height(); ++h) { + for (int w = 0; w < this->blob_top_->width(); ++w) { + Dtype expected = 3.1; + bool 
h_overlap = h % 2 == 0 && h > 0 + && h < this->blob_top_->height() - 1; + bool w_overlap = w % 2 == 0 && w > 0 + && w < this->blob_top_->width() - 1; + if (h_overlap && w_overlap) { + expected += 9; + } else if (h_overlap || w_overlap) { + expected += 3; + } + EXPECT_NEAR(top_data[this->blob_top_->offset(n, c, h, w)], + expected, 1e-4); + } + } + } + } +} + +TYPED_TEST(CuDNNDeconvolutionLayerTest, TestGradient) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + ConvolutionParameter* convolution_param = + layer_param.mutable_convolution_param(); + this->blob_bottom_vec_.push_back(this->blob_bottom_2_); + this->blob_top_vec_.push_back(this->blob_top_2_); + convolution_param->add_kernel_size(2); + convolution_param->add_stride(1); + convolution_param->set_num_output(1); + convolution_param->mutable_weight_filler()->set_type("gaussian"); + convolution_param->mutable_bias_filler()->set_type("gaussian"); + CuDNNDeconvolutionLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_, + this->blob_top_vec_); +} + +TYPED_TEST(CuDNNDeconvolutionLayerTest, TestNDAgainst2D) { + typedef typename TypeParam::Dtype Dtype; + const int kernel_h = 11; + const int kernel_w = 13; + vector bottom_shape(4); + bottom_shape[0] = 15; + bottom_shape[1] = 12; + bottom_shape[2] = kernel_h * 2; + bottom_shape[3] = kernel_w * 2; + FillerParameter filler_param; + GaussianFiller filler(filler_param); + for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) { + this->blob_bottom_vec_[i]->Reshape(bottom_shape); + filler.Fill(this->blob_bottom_vec_[i]); + } + LayerParameter layer_param; + ConvolutionParameter* convolution_param = + layer_param.mutable_convolution_param(); + convolution_param->set_num_output(18); + convolution_param->set_bias_term(false); + convolution_param->set_group(6); + convolution_param->set_kernel_h(kernel_h); + convolution_param->set_kernel_w(kernel_w); + 
convolution_param->mutable_weight_filler()->set_type("gaussian"); + Blob weights; + Blob top_diff; + // Shape and fill weights and top_diff. + bool copy_diff; + bool reshape; + { + CuDNNDeconvolutionLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + top_diff.ReshapeLike(*this->blob_top_); + filler.Fill(&top_diff); + ASSERT_EQ(1, layer.blobs().size()); + copy_diff = false; reshape = true; + weights.CopyFrom(*layer.blobs()[0], copy_diff, reshape); + } + vector propagate_down(1, true); + Blob result_2d; + Blob backward_result_2d; + Blob backward_weight_result_2d; + // Test with 2D im2col + { + caffe_set(this->blob_top_->count(), Dtype(0), + this->blob_top_->mutable_cpu_data()); + caffe_set(this->blob_bottom_->count(), Dtype(0), + this->blob_bottom_->mutable_cpu_diff()); + caffe_set(weights.count(), Dtype(0), weights.mutable_cpu_diff()); + // Do SetUp and Forward; save Forward result in result_2d. + convolution_param->set_force_nd_im2col(false); + CuDNNDeconvolutionLayer layer_2d(layer_param); + layer_2d.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + ASSERT_EQ(1, layer_2d.blobs().size()); + copy_diff = false; reshape = false; + layer_2d.blobs()[0]->CopyFrom(weights, copy_diff, reshape); + layer_2d.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + copy_diff = false; reshape = true; + result_2d.CopyFrom(*this->blob_top_, copy_diff, reshape); + // Copy pre-generated top diff into actual top diff; + // do Backward and save result in backward_result_2d. 
+ ASSERT_EQ(this->blob_top_->shape(), top_diff.shape()); + caffe_copy(top_diff.count(), top_diff.cpu_data(), + this->blob_top_->mutable_cpu_diff()); + layer_2d.Backward(this->blob_top_vec_, propagate_down, + this->blob_bottom_vec_); + copy_diff = true; reshape = true; + backward_result_2d.CopyFrom(*this->blob_bottom_, copy_diff, reshape); + backward_weight_result_2d.CopyFrom(weights, copy_diff, reshape); + } + Blob result_nd; + Blob backward_result_nd; + Blob backward_weight_result_nd; + // Test with ND im2col + { + caffe_set(this->blob_top_->count(), Dtype(0), + this->blob_top_->mutable_cpu_data()); + caffe_set(this->blob_bottom_->count(), Dtype(0), + this->blob_bottom_->mutable_cpu_diff()); + caffe_set(weights.count(), Dtype(0), weights.mutable_cpu_diff()); + // Do SetUp and Forward; save Forward result in result_nd. + convolution_param->set_force_nd_im2col(true); + CuDNNDeconvolutionLayer layer_nd(layer_param); + layer_nd.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + ASSERT_EQ(1, layer_nd.blobs().size()); + copy_diff = false; reshape = false; + layer_nd.blobs()[0]->CopyFrom(weights, copy_diff, reshape); + layer_nd.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + copy_diff = false; reshape = true; + result_nd.CopyFrom(*this->blob_top_, copy_diff, reshape); + // Copy pre-generated top diff into actual top diff; + // do Backward and save result in backward_result_nd. 
+ ASSERT_EQ(this->blob_top_->shape(), top_diff.shape()); + caffe_copy(top_diff.count(), top_diff.cpu_data(), + this->blob_top_->mutable_cpu_diff()); + layer_nd.Backward(this->blob_top_vec_, propagate_down, + this->blob_bottom_vec_); + copy_diff = true; reshape = true; + backward_result_nd.CopyFrom(*this->blob_bottom_, copy_diff, reshape); + backward_weight_result_nd.CopyFrom(weights, copy_diff, reshape); + } + ASSERT_EQ(result_nd.count(), result_2d.count()); + for (int i = 0; i < result_2d.count(); ++i) { + EXPECT_NEAR(result_2d.cpu_data()[i], result_nd.cpu_data()[i], 1e-4); + } + ASSERT_EQ(backward_result_nd.count(), backward_result_2d.count()); + for (int i = 0; i < backward_result_2d.count(); ++i) { + EXPECT_EQ(backward_result_2d.cpu_diff()[i], + backward_result_nd.cpu_diff()[i]); + } + ASSERT_EQ(backward_weight_result_nd.count(), + backward_weight_result_2d.count()); + for (int i = 0; i < backward_weight_result_2d.count(); ++i) { + EXPECT_EQ(backward_weight_result_2d.cpu_diff()[i], + backward_weight_result_nd.cpu_diff()[i]); + } +} + +#endif + } // namespace caffe From fb3146363963fa494d1e7488890cac3d2a141c8f Mon Sep 17 00:00:00 2001 From: Bo Wang Date: Thu, 14 Sep 2017 15:03:21 -0700 Subject: [PATCH 089/144] Fix format --- include/caffe/layers/cudnn_deconv_layer.hpp | 6 +++--- src/caffe/test/test_deconvolution_layer.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/caffe/layers/cudnn_deconv_layer.hpp b/include/caffe/layers/cudnn_deconv_layer.hpp index 34095e5c44d..12799e5b8ef 100644 --- a/include/caffe/layers/cudnn_deconv_layer.hpp +++ b/include/caffe/layers/cudnn_deconv_layer.hpp @@ -23,7 +23,7 @@ namespace caffe { */ template class CuDNNDeconvolutionLayer : public DeconvolutionLayer { -public: + public: explicit CuDNNDeconvolutionLayer(const LayerParameter& param) : DeconvolutionLayer(param), handles_setup_(false) {} virtual void LayerSetUp(const vector*>& bottom, @@ -32,7 +32,7 @@ class CuDNNDeconvolutionLayer : public 
DeconvolutionLayer { const vector*>& top); virtual ~CuDNNDeconvolutionLayer(); -protected: + protected: virtual void Forward_gpu(const vector*>& bottom, const vector*>& top); virtual void Backward_gpu(const vector*>& top, @@ -65,4 +65,4 @@ class CuDNNDeconvolutionLayer : public DeconvolutionLayer { } // namespace caffe -#endif // CAFFE_CUDNN_DECONV_LAYER_HPP_ +#endif // CAFFE_CUDNN_DECONV_LAYER_HPP_ diff --git a/src/caffe/test/test_deconvolution_layer.cpp b/src/caffe/test/test_deconvolution_layer.cpp index 0067907165f..1a022d6f7bc 100644 --- a/src/caffe/test/test_deconvolution_layer.cpp +++ b/src/caffe/test/test_deconvolution_layer.cpp @@ -5,8 +5,8 @@ #include "caffe/blob.hpp" #include "caffe/common.hpp" #include "caffe/filler.hpp" -#include "caffe/layers/deconv_layer.hpp" #include "caffe/layers/cudnn_deconv_layer.hpp" +#include "caffe/layers/deconv_layer.hpp" #include "caffe/test/test_caffe_main.hpp" #include "caffe/test/test_gradient_check_util.hpp" From 4705a377ea613de6bc3a9e26d83d58c902ea96ea Mon Sep 17 00:00:00 2001 From: "Jonathan R. Williford" Date: Fri, 15 Sep 2017 11:38:16 +0200 Subject: [PATCH 090/144] Packages needed by Ubuntu 16.04 also This line is needed for Ubuntu 16.04: sudo apt-get install libgflags-dev libgoogle-glog-dev liblmdb-dev For reference: * https://github.com/BVLC/caffe/wiki/Ubuntu-16.04-or-15.10-Installation-Guide * https://youtu.be/DnIs4DRjNL4 --- docs/install_apt.md | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/docs/install_apt.md b/docs/install_apt.md index b6cb1c2d6f7..e361a92da3c 100644 --- a/docs/install_apt.md +++ b/docs/install_apt.md @@ -40,6 +40,7 @@ Continue with [compilation](installation.html#compilation). 
sudo apt-get install libprotobuf-dev libleveldb-dev libsnappy-dev libopencv-dev libhdf5-serial-dev protobuf-compiler sudo apt-get install --no-install-recommends libboost-all-dev + sudo apt-get install libgflags-dev libgoogle-glog-dev liblmdb-dev **CUDA**: Install by `apt-get` or the NVIDIA `.run` package. The NVIDIA package tends to follow more recent library and driver versions, but the installation is more manual. @@ -54,12 +55,6 @@ This can be skipped for CPU-only installation. CUDA 8 is required on Ubuntu 16.04. -**Remaining dependencies, 14.04** - -Everything is packaged in 14.04. - - sudo apt-get install libgflags-dev libgoogle-glog-dev liblmdb-dev - **Remaining dependencies, 12.04** These dependencies need manual installation in 12.04. From 888597e679f1846d549f1119fe88bc196edbd0dd Mon Sep 17 00:00:00 2001 From: Noiredd Date: Mon, 2 Oct 2017 14:39:31 +0200 Subject: [PATCH 091/144] Fixed bilinear filler, added tests --- include/caffe/filler.hpp | 6 ++--- src/caffe/test/test_filler.cpp | 43 +++++++++++++++++++++++++++++++++- 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp index dad9ad46b3b..bb92ded780f 100644 --- a/include/caffe/filler.hpp +++ b/include/caffe/filler.hpp @@ -250,10 +250,10 @@ class BilinearFiller : public Filler { CHECK_EQ(blob->width(), blob->height()) << "Filter must be square"; Dtype* data = blob->mutable_cpu_data(); int f = ceil(blob->width() / 2.); - float c = (2 * f - 1 - f % 2) / (2. * f); + Dtype c = (blob->width() - 1) / (2. 
* f); for (int i = 0; i < blob->count(); ++i) { - float x = i % blob->width(); - float y = (i / blob->width()) % blob->height(); + Dtype x = i % blob->width(); + Dtype y = (i / blob->width()) % blob->height(); data[i] = (1 - fabs(x / f - c)) * (1 - fabs(y / f - c)); } CHECK_EQ(this->filler_param_.sparse(), -1) diff --git a/src/caffe/test/test_filler.cpp b/src/caffe/test/test_filler.cpp index 26e9b217e35..f84d707baa0 100644 --- a/src/caffe/test/test_filler.cpp +++ b/src/caffe/test/test_filler.cpp @@ -29,7 +29,7 @@ TYPED_TEST(ConstantFillerTest, TestFill) { const int count = this->blob_->count(); const TypeParam* data = this->blob_->cpu_data(); for (int i = 0; i < count; ++i) { - EXPECT_GE(data[i], this->filler_param_.value()); + EXPECT_EQ(data[i], this->filler_param_.value()); } } @@ -238,4 +238,45 @@ TYPED_TEST(MSRAFillerTest, TestFillAverage) { this->test_params(FillerParameter_VarianceNorm_AVERAGE, n); } +template +class BilinearFillerTest : public ::testing::Test { + protected: + BilinearFillerTest() : filler_param_() {} + virtual void test_params(const int n) { + this->blob_ = new Blob(1000, 2, n, n); + this->filler_.reset(new BilinearFiller(this->filler_param_)); + this->filler_->Fill(blob_); + EXPECT_TRUE(this->blob_); + const int outer_num = this->blob_->count(0, 2); + const int inner_num = this->blob_->count(2, 4); + const Dtype* data = this->blob_->cpu_data(); + int f = ceil(this->blob_->width() / 2.); + Dtype c = (this->blob_->width() - 1) / (2. 
* f); + for (int i = 0; i < outer_num; ++i) { + for (int j = 0; j < inner_num; ++j) { + Dtype x = j % this->blob_->width(); + Dtype y = (j / this->blob_->width()) % this->blob_->height(); + Dtype expected_value = (1 - fabs(x / f - c)) * (1 - fabs(y / f - c)); + const Dtype actual_value = data[i * inner_num + j]; + EXPECT_NEAR(expected_value, actual_value, 0.01); + } + } + } + virtual ~BilinearFillerTest() { delete blob_; } + Blob* blob_; + FillerParameter filler_param_; + shared_ptr > filler_; +}; + +TYPED_TEST_CASE(BilinearFillerTest, TestDtypes); + +TYPED_TEST(BilinearFillerTest, TestFillOdd) { + const int n = 7; + this->test_params(n); +} +TYPED_TEST(BilinearFillerTest, TestFillEven) { + const int n = 6; + this->test_params(n); +} + } // namespace caffe From fac74347106ea16140ace828e8d278716e4d4742 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Tue, 3 Oct 2017 14:20:13 -0700 Subject: [PATCH 092/144] [docs] fix link to `AbsVal` layer --- docs/tutorial/layers.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorial/layers.md b/docs/tutorial/layers.md index 2faacc5836d..78a46f3a7ee 100644 --- a/docs/tutorial/layers.md +++ b/docs/tutorial/layers.md @@ -87,7 +87,7 @@ Layers: * [ELU](layers/elu.html) - exponential linear rectification. * [Sigmoid](layers/sigmoid.html) * [TanH](layers/tanh.html) -* [Absolute Value](layers/abs.html) +* [Absolute Value](layers/absval.html) * [Power](layers/power.html) - f(x) = (shift + scale * x) ^ power. * [Exp](layers/exp.html) - f(x) = base ^ (shift + scale * x). * [Log](layers/log.html) - f(x) = log(x). 
From e0feb7d72ce54ee4f9e84792418b4c59c4013d5f Mon Sep 17 00:00:00 2001 From: Finnian Anderson Date: Tue, 10 Oct 2017 12:15:13 +0100 Subject: [PATCH 093/144] Fix default mode warning in io.resize_image Signed-off-by: Finnian Anderson --- python/caffe/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/caffe/io.py b/python/caffe/io.py index 966c164cffd..d61f765b894 100644 --- a/python/caffe/io.py +++ b/python/caffe/io.py @@ -323,7 +323,7 @@ def resize_image(im, new_dims, interp_order=1): # skimage is fast but only understands {1,3} channel images # in [0, 1]. im_std = (im - im_min) / (im_max - im_min) - resized_std = resize(im_std, new_dims, order=interp_order) + resized_std = resize(im_std, new_dims, order=interp_order, mode='constant') resized_im = resized_std * (im_max - im_min) + im_min else: # the image is a constant -- avoid divide by 0 From 62e0c8559045cb2b5a12e0d6c41acd25d4122630 Mon Sep 17 00:00:00 2001 From: Shai Date: Thu, 10 Aug 2017 10:07:19 +0300 Subject: [PATCH 094/144] upgrading Accuracy layer: (1) efficient CPU implementation O(L) for top_k, no need for fancy priority_queue etc. (2) GPU implementation --- include/caffe/layers/accuracy_layer.hpp | 4 + src/caffe/layers/accuracy_layer.cpp | 33 ++- src/caffe/layers/accuracy_layer.cu | 147 ++++++++++ src/caffe/test/test_accuracy_layer.cpp | 360 +++++++++++++----------- 4 files changed, 364 insertions(+), 180 deletions(-) create mode 100644 src/caffe/layers/accuracy_layer.cu diff --git a/include/caffe/layers/accuracy_layer.hpp b/include/caffe/layers/accuracy_layer.hpp index a9ad3225149..dd2247b9e4d 100644 --- a/include/caffe/layers/accuracy_layer.hpp +++ b/include/caffe/layers/accuracy_layer.hpp @@ -68,6 +68,8 @@ class AccuracyLayer : public Layer { */ virtual void Forward_cpu(const vector*>& bottom, const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); /// @brief Not implemented -- AccuracyLayer cannot be used as a loss. 
@@ -77,6 +79,8 @@ class AccuracyLayer : public Layer { if (propagate_down[i]) { NOT_IMPLEMENTED; } } } + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); int label_axis_, outer_num_, inner_num_; diff --git a/src/caffe/layers/accuracy_layer.cpp b/src/caffe/layers/accuracy_layer.cpp index 4eddbb5c850..392829e6db8 100644 --- a/src/caffe/layers/accuracy_layer.cpp +++ b/src/caffe/layers/accuracy_layer.cpp @@ -52,8 +52,6 @@ void AccuracyLayer::Forward_cpu(const vector*>& bottom, const Dtype* bottom_label = bottom[1]->cpu_data(); const int dim = bottom[0]->count() / outer_num_; const int num_labels = bottom[0]->shape(label_axis_); - vector maxval(top_k_+1); - vector max_id(top_k_+1); if (top.size() > 1) { caffe_set(nums_buffer_.count(), Dtype(0), nums_buffer_.mutable_cpu_data()); caffe_set(top[1]->count(), Dtype(0), top[1]->mutable_cpu_data()); @@ -66,25 +64,22 @@ void AccuracyLayer::Forward_cpu(const vector*>& bottom, if (has_ignore_label_ && label_value == ignore_label_) { continue; } - if (top.size() > 1) ++nums_buffer_.mutable_cpu_data()[label_value]; DCHECK_GE(label_value, 0); DCHECK_LT(label_value, num_labels); + if (top.size() > 1) ++nums_buffer_.mutable_cpu_data()[label_value]; + const Dtype prob_of_true_class = bottom_data[i * dim + + label_value * inner_num_ + + j]; + int num_better_predictions = -1; // true_class also counts as "better" // Top-k accuracy - std::vector > bottom_data_vector; - for (int k = 0; k < num_labels; ++k) { - bottom_data_vector.push_back(std::make_pair( - bottom_data[i * dim + k * inner_num_ + j], k)); + for (int k = 0; k < num_labels && num_better_predictions < top_k_; ++k) { + num_better_predictions += + (bottom_data[i * dim + k * inner_num_ + j] >= prob_of_true_class); } - std::partial_sort( - bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_, - bottom_data_vector.end(), std::greater >()); - // check if true label is in top k predictions - for (int k = 0; k < top_k_; 
k++) { - if (bottom_data_vector[k].second == label_value) { - ++accuracy; - if (top.size() > 1) ++top[1]->mutable_cpu_data()[label_value]; - break; - } + // check if there are less than top_k_ predictions + if (num_better_predictions < top_k_) { + ++accuracy; + if (top.size() > 1) ++top[1]->mutable_cpu_data()[label_value]; } ++count; } @@ -102,6 +97,10 @@ void AccuracyLayer::Forward_cpu(const vector*>& bottom, // Accuracy layer should not be used as a loss function. } +#ifdef CPU_ONLY +STUB_GPU(AccuracyLayer); +#endif + INSTANTIATE_CLASS(AccuracyLayer); REGISTER_LAYER_CLASS(Accuracy); diff --git a/src/caffe/layers/accuracy_layer.cu b/src/caffe/layers/accuracy_layer.cu new file mode 100644 index 00000000000..a8cff936ccb --- /dev/null +++ b/src/caffe/layers/accuracy_layer.cu @@ -0,0 +1,147 @@ +#include + +#include "caffe/layers/accuracy_layer.hpp" +#include "caffe/util/math_functions.hpp" + + +namespace caffe { + +template +__global__ void AccuracyForwardGPU(const int nthreads, + const Dtype* bottom_data, const Dtype* label, Dtype* acc, + const int num, const int dim, const int spatial_dim, + const int num_labels, const int top_k, + const bool has_ignore_label_, const int ignore_label_, + Dtype* counts) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int n = index / spatial_dim; + const int s = index % spatial_dim; + const int label_value = static_cast(label[n * spatial_dim + s]); + const Dtype prob_of_true_class = bottom_data[n * dim + + label_value * spatial_dim + + s]; + int num_better_predictions = -1; // true_class also counts as "better" + if (has_ignore_label_ && label_value == ignore_label_) { + acc[index] = 0; + counts[index] = 0; + } else { + for (int k = 0; k < num_labels & num_better_predictions < top_k; k++) { + num_better_predictions += + (bottom_data[n * dim + k * spatial_dim + s] >= prob_of_true_class); + } + acc[index] = (num_better_predictions < top_k); + counts[index] = 1; + } + } +} + +template +__global__ void AccuracyForwardWithPerClassGPU(const 
int nthreads, + const Dtype* bottom_data, const Dtype* label, + Dtype* acc, Dtype* counts, + const int num, const int dim, const int spatial_dim, + const int num_labels, const int top_k, + const bool has_ignore_label_, const int ignore_label_) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int n = index / spatial_dim; + const int s = index % spatial_dim; + const int label_value = static_cast(label[n * spatial_dim + s]); + const Dtype prob_of_true_class = bottom_data[n * dim + + label_value * spatial_dim + + s]; + if (has_ignore_label_ && label_value == ignore_label_) { + // nothing to be done. + } else { + int num_better_predictions = -1; // true_class also counts as "better" + for (int k = 0; k < num_labels & num_better_predictions < top_k; k++) { + num_better_predictions += + (bottom_data[n * dim + k * spatial_dim + s] >= prob_of_true_class); + } + acc[label_value*nthreads + index] += (num_better_predictions < top_k); + counts[label_value*nthreads + index] = 1; + } + } +} + +template +void AccuracyLayer::Forward_gpu( + const vector*>& bottom, const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* bottom_label = bottom[1]->gpu_data(); + const int dim = bottom[0]->count() / outer_num_; + const int num_labels = bottom[0]->shape(label_axis_); + const int nthreads = outer_num_ * inner_num_; + // Since this memory is not used for anything, + // we use it here to avoid having to allocate new GPU + // memory to accumulate intermediate results in the kernel. + Dtype* acc_data = bottom[0]->mutable_gpu_diff(); + if (top.size() == 1) { + // simple case - report only global accuracy. + + // Similarly, this memory is never used elsewhere, and thus we can use it + // to avoid having to allocate additional GPU memory. 
+ Dtype* counts = bottom[1]->mutable_gpu_diff(); + // NOLINT_NEXT_LINE(whitespace/operators) + AccuracyForwardGPU<<>>(nthreads, bottom_data, bottom_label, + acc_data, outer_num_, dim, inner_num_, num_labels, top_k_, + has_ignore_label_, ignore_label_, counts); + Dtype acc; + caffe_gpu_asum(nthreads, acc_data, &acc); + Dtype valid_count; + caffe_gpu_asum(nthreads, counts, &valid_count); + if (valid_count > 0) { + top[0]->mutable_cpu_data()[0] = acc / valid_count; + } else { + top[0]->mutable_cpu_data()[0] = 0; + } + } else { + // need to report per-class accuracy as well + + // allocate space for more detailed "counts" + nums_buffer_.ReshapeLike(*bottom[0]); + Dtype* counts = nums_buffer_.mutable_gpu_data(); + + caffe_gpu_set(bottom[0]->count(), Dtype(0), acc_data); + caffe_gpu_set(nums_buffer_.count(), Dtype(0), counts); + + // NOLINT_NEXT_LINE(whitespace/operators) + AccuracyForwardWithPerClassGPU<<>>(nthreads, bottom_data, bottom_label, + acc_data, counts, outer_num_, dim, inner_num_, num_labels, top_k_, + has_ignore_label_, ignore_label_); + + // get the overall accuracy + Dtype acc; + caffe_gpu_asum(bottom[0]->count(), acc_data, &acc); + Dtype valid_count; + caffe_gpu_asum(nums_buffer_.count(), counts, &valid_count); + if (valid_count > 0) { + top[0]->mutable_cpu_data()[0] = acc / valid_count; + } else { + top[0]->mutable_cpu_data()[0] = 0; + } + + // get per-class accuracy + Dtype* per_class_acc = top[1]->mutable_cpu_data(); + for (int l = 0; l < num_labels; l++) { + caffe_gpu_asum(nthreads, acc_data + l*nthreads, per_class_acc+l); + caffe_gpu_asum(nthreads, counts + l*nthreads, &valid_count); + if (valid_count > 0) { + per_class_acc[l] /= valid_count; + } else { + per_class_acc[l] = 0; + } + } + } +} + + +template +void AccuracyLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom) { + if (propagate_down[1]) { NOT_IMPLEMENTED; } +} + +INSTANTIATE_LAYER_GPU_FUNCS(AccuracyLayer); +} // namespace caffe diff --git 
a/src/caffe/test/test_accuracy_layer.cpp b/src/caffe/test/test_accuracy_layer.cpp index 6fe808bd5c5..e5cc9d5e85f 100644 --- a/src/caffe/test/test_accuracy_layer.cpp +++ b/src/caffe/test/test_accuracy_layer.cpp @@ -13,8 +13,10 @@ namespace caffe { -template -class AccuracyLayerTest : public CPUDeviceTest { +template +class AccuracyLayerTest : public MultiDeviceTest { + typedef typename TypeParam::Dtype Dtype; + protected: AccuracyLayerTest() : blob_bottom_data_(new Blob()), @@ -69,11 +71,12 @@ class AccuracyLayerTest : public CPUDeviceTest { int top_k_; }; -TYPED_TEST_CASE(AccuracyLayerTest, TestDtypes); +TYPED_TEST_CASE(AccuracyLayerTest, TestDtypesAndDevices); TYPED_TEST(AccuracyLayerTest, TestSetup) { + typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; - AccuracyLayer layer(layer_param); + AccuracyLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); EXPECT_EQ(this->blob_top_->num(), 1); EXPECT_EQ(this->blob_top_->channels(), 1); @@ -82,11 +85,12 @@ TYPED_TEST(AccuracyLayerTest, TestSetup) { } TYPED_TEST(AccuracyLayerTest, TestSetupTopK) { + typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; AccuracyParameter* accuracy_param = layer_param.mutable_accuracy_param(); accuracy_param->set_top_k(5); - AccuracyLayer layer(layer_param); + AccuracyLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); EXPECT_EQ(this->blob_top_->num(), 1); EXPECT_EQ(this->blob_top_->channels(), 1); @@ -95,8 +99,9 @@ TYPED_TEST(AccuracyLayerTest, TestSetupTopK) { } TYPED_TEST(AccuracyLayerTest, TestSetupOutputPerClass) { + typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; - AccuracyLayer layer(layer_param); + AccuracyLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_per_class_vec_); EXPECT_EQ(this->blob_top_->num(), 1); EXPECT_EQ(this->blob_top_->channels(), 1); @@ -108,33 +113,39 @@ TYPED_TEST(AccuracyLayerTest, TestSetupOutputPerClass) { 
EXPECT_EQ(this->blob_top_per_class_->width(), 1); } -TYPED_TEST(AccuracyLayerTest, TestForwardCPU) { +TYPED_TEST(AccuracyLayerTest, TestForward) { + typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; - AccuracyLayer layer(layer_param); + AccuracyLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - - TypeParam max_value; - int max_id; - int num_correct_labels = 0; - for (int i = 0; i < 100; ++i) { - max_value = -FLT_MAX; - max_id = 0; - for (int j = 0; j < 10; ++j) { - if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) { - max_value = this->blob_bottom_data_->data_at(i, j, 0, 0); - max_id = j; + + // repeat the forward + for (int iter = 0; iter < 3; iter++) { + layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + + Dtype max_value; + int max_id; + int num_correct_labels = 0; + for (int i = 0; i < 100; ++i) { + max_value = -FLT_MAX; + max_id = 0; + for (int j = 0; j < 10; ++j) { + if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) { + max_value = this->blob_bottom_data_->data_at(i, j, 0, 0); + max_id = j; + } + } + if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) { + ++num_correct_labels; } } - if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) { - ++num_correct_labels; - } + EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0), + num_correct_labels / Dtype(100.0), 1e-4); } - EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0), - num_correct_labels / 100.0, 1e-4); } TYPED_TEST(AccuracyLayerTest, TestForwardWithSpatialAxes) { + typedef typename TypeParam::Dtype Dtype; this->blob_bottom_data_->Reshape(2, 10, 4, 5); vector label_shape(3); label_shape[0] = 2; label_shape[1] = 4; label_shape[2] = 5; @@ -142,195 +153,218 @@ TYPED_TEST(AccuracyLayerTest, TestForwardWithSpatialAxes) { this->FillBottoms(); LayerParameter layer_param; layer_param.mutable_accuracy_param()->set_axis(1); - AccuracyLayer layer(layer_param); 
+ AccuracyLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - - TypeParam max_value; - const int num_labels = this->blob_bottom_label_->count(); - int max_id; - int num_correct_labels = 0; - vector label_offset(3); - for (int n = 0; n < this->blob_bottom_data_->num(); ++n) { - for (int h = 0; h < this->blob_bottom_data_->height(); ++h) { - for (int w = 0; w < this->blob_bottom_data_->width(); ++w) { - max_value = -FLT_MAX; - max_id = 0; - for (int c = 0; c < this->blob_bottom_data_->channels(); ++c) { - const TypeParam pred_value = - this->blob_bottom_data_->data_at(n, c, h, w); - if (pred_value > max_value) { - max_value = pred_value; - max_id = c; + + // repeat the forward + for (int iter = 0; iter < 3; iter++) { + layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + + Dtype max_value; + const int num_labels = this->blob_bottom_label_->count(); + int max_id; + int num_correct_labels = 0; + vector label_offset(3); + for (int n = 0; n < this->blob_bottom_data_->num(); ++n) { + for (int h = 0; h < this->blob_bottom_data_->height(); ++h) { + for (int w = 0; w < this->blob_bottom_data_->width(); ++w) { + max_value = -FLT_MAX; + max_id = 0; + for (int c = 0; c < this->blob_bottom_data_->channels(); ++c) { + const Dtype pred_value = + this->blob_bottom_data_->data_at(n, c, h, w); + if (pred_value > max_value) { + max_value = pred_value; + max_id = c; + } + } + label_offset[0] = n; label_offset[1] = h; label_offset[2] = w; + const int correct_label = + static_cast(this->blob_bottom_label_->data_at(label_offset)); + if (max_id == correct_label) { + ++num_correct_labels; } - } - label_offset[0] = n; label_offset[1] = h; label_offset[2] = w; - const int correct_label = - static_cast(this->blob_bottom_label_->data_at(label_offset)); - if (max_id == correct_label) { - ++num_correct_labels; } } } + EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0), + num_correct_labels / 
Dtype(num_labels), 1e-4); } - EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0), - num_correct_labels / TypeParam(num_labels), 1e-4); } TYPED_TEST(AccuracyLayerTest, TestForwardIgnoreLabel) { + typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; - const TypeParam kIgnoreLabelValue = -1; + const Dtype kIgnoreLabelValue = -1; layer_param.mutable_accuracy_param()->set_ignore_label(kIgnoreLabelValue); - AccuracyLayer layer(layer_param); + AccuracyLayer layer(layer_param); // Manually set some labels to the ignore label value (-1). this->blob_bottom_label_->mutable_cpu_data()[2] = kIgnoreLabelValue; this->blob_bottom_label_->mutable_cpu_data()[5] = kIgnoreLabelValue; this->blob_bottom_label_->mutable_cpu_data()[32] = kIgnoreLabelValue; layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - - TypeParam max_value; - int max_id; - int num_correct_labels = 0; - int count = 0; - for (int i = 0; i < 100; ++i) { - if (kIgnoreLabelValue == this->blob_bottom_label_->data_at(i, 0, 0, 0)) { - continue; - } - ++count; - max_value = -FLT_MAX; - max_id = 0; - for (int j = 0; j < 10; ++j) { - if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) { - max_value = this->blob_bottom_data_->data_at(i, j, 0, 0); - max_id = j; + + // repeat the forward + for (int iter = 0; iter < 3; iter++) { + layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + + Dtype max_value; + int max_id; + int num_correct_labels = 0; + int count = 0; + for (int i = 0; i < 100; ++i) { + if (kIgnoreLabelValue == this->blob_bottom_label_->data_at(i, 0, 0, 0)) { + continue; + } + ++count; + max_value = -FLT_MAX; + max_id = 0; + for (int j = 0; j < 10; ++j) { + if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) { + max_value = this->blob_bottom_data_->data_at(i, j, 0, 0); + max_id = j; + } + } + if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) { + ++num_correct_labels; } } - if (max_id == 
this->blob_bottom_label_->data_at(i, 0, 0, 0)) { - ++num_correct_labels; - } + EXPECT_EQ(count, 97); // We set 3 out of 100 labels to kIgnoreLabelValue. + EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0), + num_correct_labels / Dtype(count), 1e-4); } - EXPECT_EQ(count, 97); // We set 3 out of 100 labels to kIgnoreLabelValue. - EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0), - num_correct_labels / TypeParam(count), 1e-4); } -TYPED_TEST(AccuracyLayerTest, TestForwardCPUTopK) { +TYPED_TEST(AccuracyLayerTest, TestForwardTopK) { + typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; AccuracyParameter* accuracy_param = layer_param.mutable_accuracy_param(); accuracy_param->set_top_k(this->top_k_); - AccuracyLayer layer(layer_param); + AccuracyLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); - - TypeParam current_value; - int current_rank; - int num_correct_labels = 0; - for (int i = 0; i < 100; ++i) { - for (int j = 0; j < 10; ++j) { - current_value = this->blob_bottom_data_->data_at(i, j, 0, 0); - current_rank = 0; - for (int k = 0; k < 10; ++k) { - if (this->blob_bottom_data_->data_at(i, k, 0, 0) > current_value) { - ++current_rank; + + // repeat the forward + for (int iter = 0; iter < 3; iter++) { + layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + + Dtype current_value; + int current_rank; + int num_correct_labels = 0; + for (int i = 0; i < 100; ++i) { + for (int j = 0; j < 10; ++j) { + current_value = this->blob_bottom_data_->data_at(i, j, 0, 0); + current_rank = 0; + for (int k = 0; k < 10; ++k) { + if (this->blob_bottom_data_->data_at(i, k, 0, 0) > current_value) { + ++current_rank; + } + } + if (current_rank < this->top_k_ && + j == this->blob_bottom_label_->data_at(i, 0, 0, 0)) { + ++num_correct_labels; } - } - if (current_rank < this->top_k_ && - j == this->blob_bottom_label_->data_at(i, 0, 0, 0)) { - ++num_correct_labels; } } - } - 
EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0), - num_correct_labels / 100.0, 1e-4); + EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0), + num_correct_labels / Dtype(100.0), 1e-4); + } } -TYPED_TEST(AccuracyLayerTest, TestForwardCPUPerClass) { +TYPED_TEST(AccuracyLayerTest, TestForwardPerClass) { + typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; - AccuracyLayer layer(layer_param); + AccuracyLayer layer(layer_param); layer.SetUp(this->blob_bottom_vec_, this->blob_top_per_class_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_per_class_vec_); - - TypeParam max_value; - int max_id; - int num_correct_labels = 0; - const int num_class = this->blob_top_per_class_->num(); - vector correct_per_class(num_class, 0); - vector num_per_class(num_class, 0); - for (int i = 0; i < 100; ++i) { - max_value = -FLT_MAX; - max_id = 0; - for (int j = 0; j < 10; ++j) { - if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) { - max_value = this->blob_bottom_data_->data_at(i, j, 0, 0); - max_id = j; + // repeat the forward + for (int iter = 0; iter < 3; iter++) { + layer.Forward(this->blob_bottom_vec_, this->blob_top_per_class_vec_); + + Dtype max_value; + int max_id; + int num_correct_labels = 0; + const int num_class = this->blob_top_per_class_->num(); + vector correct_per_class(num_class, 0); + vector num_per_class(num_class, 0); + for (int i = 0; i < 100; ++i) { + max_value = -FLT_MAX; + max_id = 0; + for (int j = 0; j < 10; ++j) { + if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) { + max_value = this->blob_bottom_data_->data_at(i, j, 0, 0); + max_id = j; + } + } + ++num_per_class[this->blob_bottom_label_->data_at(i, 0, 0, 0)]; + if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) { + ++num_correct_labels; + ++correct_per_class[max_id]; } } - ++num_per_class[this->blob_bottom_label_->data_at(i, 0, 0, 0)]; - if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) { - ++num_correct_labels; - 
++correct_per_class[max_id]; + EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0), + num_correct_labels / 100.0, 1e-4); + for (int i = 0; i < num_class; ++i) { + Dtype accuracy_per_class = (num_per_class[i] > 0 ? + static_cast(correct_per_class[i]) / num_per_class[i] : 0); + EXPECT_NEAR(this->blob_top_per_class_->data_at(i, 0, 0, 0), + accuracy_per_class, 1e-4); } } - EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0), - num_correct_labels / 100.0, 1e-4); - for (int i = 0; i < num_class; ++i) { - TypeParam accuracy_per_class = (num_per_class[i] > 0 ? - static_cast(correct_per_class[i]) / num_per_class[i] : 0); - EXPECT_NEAR(this->blob_top_per_class_->data_at(i, 0, 0, 0), - accuracy_per_class, 1e-4); - } } -TYPED_TEST(AccuracyLayerTest, TestForwardCPUPerClassWithIgnoreLabel) { +TYPED_TEST(AccuracyLayerTest, TestForwardPerClassWithIgnoreLabel) { + typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; - const TypeParam kIgnoreLabelValue = -1; + const Dtype kIgnoreLabelValue = -1; layer_param.mutable_accuracy_param()->set_ignore_label(kIgnoreLabelValue); - AccuracyLayer layer(layer_param); + AccuracyLayer layer(layer_param); // Manually set some labels to the ignore label value (-1). 
this->blob_bottom_label_->mutable_cpu_data()[2] = kIgnoreLabelValue; this->blob_bottom_label_->mutable_cpu_data()[5] = kIgnoreLabelValue; this->blob_bottom_label_->mutable_cpu_data()[32] = kIgnoreLabelValue; layer.SetUp(this->blob_bottom_vec_, this->blob_top_per_class_vec_); - layer.Forward(this->blob_bottom_vec_, this->blob_top_per_class_vec_); - - TypeParam max_value; - int max_id; - int num_correct_labels = 0; - const int num_class = this->blob_top_per_class_->num(); - vector correct_per_class(num_class, 0); - vector num_per_class(num_class, 0); - int count = 0; - for (int i = 0; i < 100; ++i) { - if (kIgnoreLabelValue == this->blob_bottom_label_->data_at(i, 0, 0, 0)) { - continue; - } - ++count; - max_value = -FLT_MAX; - max_id = 0; - for (int j = 0; j < 10; ++j) { - if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) { - max_value = this->blob_bottom_data_->data_at(i, j, 0, 0); - max_id = j; + + // repeat the forward + for (int iter = 0; iter < 3; iter++) { + layer.Forward(this->blob_bottom_vec_, this->blob_top_per_class_vec_); + + Dtype max_value; + int max_id; + int num_correct_labels = 0; + const int num_class = this->blob_top_per_class_->num(); + vector correct_per_class(num_class, 0); + vector num_per_class(num_class, 0); + int count = 0; + for (int i = 0; i < 100; ++i) { + if (kIgnoreLabelValue == this->blob_bottom_label_->data_at(i, 0, 0, 0)) { + continue; + } + ++count; + max_value = -FLT_MAX; + max_id = 0; + for (int j = 0; j < 10; ++j) { + if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) { + max_value = this->blob_bottom_data_->data_at(i, j, 0, 0); + max_id = j; + } + } + ++num_per_class[this->blob_bottom_label_->data_at(i, 0, 0, 0)]; + if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) { + ++num_correct_labels; + ++correct_per_class[max_id]; } } - ++num_per_class[this->blob_bottom_label_->data_at(i, 0, 0, 0)]; - if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) { - ++num_correct_labels; - 
++correct_per_class[max_id]; + EXPECT_EQ(count, 97); + EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0), + num_correct_labels / Dtype(count), 1e-4); + for (int i = 0; i < 10; ++i) { + Dtype accuracy_per_class = (num_per_class[i] > 0 ? + static_cast(correct_per_class[i]) / num_per_class[i] : 0); + EXPECT_NEAR(this->blob_top_per_class_->data_at(i, 0, 0, 0), + accuracy_per_class, 1e-4); } } - EXPECT_EQ(count, 97); - EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0), - num_correct_labels / TypeParam(count), 1e-4); - for (int i = 0; i < 10; ++i) { - TypeParam accuracy_per_class = (num_per_class[i] > 0 ? - static_cast(correct_per_class[i]) / num_per_class[i] : 0); - EXPECT_NEAR(this->blob_top_per_class_->data_at(i, 0, 0, 0), - accuracy_per_class, 1e-4); - } } } // namespace caffe From cd2a0140d60742d4f66cc1668a2554bdc7a1ca3c Mon Sep 17 00:00:00 2001 From: Icyblade Dai Date: Wed, 11 Oct 2017 15:29:21 +0800 Subject: [PATCH 095/144] add supports for cuDNN v7 --- docs/installation.md | 4 ++-- include/caffe/util/cudnn.hpp | 6 ++++++ scripts/travis/install-deps.sh | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 42f1d0ce09b..6104cc247a8 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -42,14 +42,14 @@ Optional dependencies: * [OpenCV](http://opencv.org/) >= 2.4 including 3.0 * IO libraries: `lmdb`, `leveldb` (note: leveldb requires `snappy`) -* cuDNN for GPU acceleration (v6) +* cuDNN for GPU acceleration (v7) Pycaffe and Matcaffe interfaces have their own natural needs. * For Python Caffe: `Python 2.7` or `Python 3.3+`, `numpy (>= 1.7)`, boost-provided `boost.python` * For MATLAB Caffe: MATLAB with the `mex` compiler. -**cuDNN Caffe**: for fastest operation Caffe is accelerated by drop-in integration of [NVIDIA cuDNN](https://developer.nvidia.com/cudnn). To speed up your Caffe models, install cuDNN then uncomment the `USE_CUDNN := 1` flag in `Makefile.config` when installing Caffe. 
Acceleration is automatic. The current version is cuDNN v6; older versions are supported in older Caffe. +**cuDNN Caffe**: for fastest operation Caffe is accelerated by drop-in integration of [NVIDIA cuDNN](https://developer.nvidia.com/cudnn). To speed up your Caffe models, install cuDNN then uncomment the `USE_CUDNN := 1` flag in `Makefile.config` when installing Caffe. Acceleration is automatic. The current version is cuDNN v7; older versions are supported in older Caffe. **CPU-only Caffe**: for cold-brewed CPU-only Caffe uncomment the `CPU_ONLY := 1` flag in `Makefile.config` to configure and build Caffe without CUDA. This is helpful for cloud or cluster deployment. diff --git a/include/caffe/util/cudnn.hpp b/include/caffe/util/cudnn.hpp index 498cfe385de..cd3f93f6e28 100644 --- a/include/caffe/util/cudnn.hpp +++ b/include/caffe/util/cudnn.hpp @@ -44,6 +44,12 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) { #if CUDNN_VERSION_MIN(6, 0, 0) case CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING: return "CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING"; +#endif +#if CUDNN_VERSION_MIN(7, 0, 0) + case CUDNN_STATUS_RUNTIME_IN_PROGRESS: + return "CUDNN_STATUS_RUNTIME_IN_PROGRESS"; + case CUDNN_STATUS_RUNTIME_FP_OVERFLOW: + return "CUDNN_STATUS_RUNTIME_FP_OVERFLOW"; #endif } return "Unknown cudnn status"; diff --git a/scripts/travis/install-deps.sh b/scripts/travis/install-deps.sh index 2fa2a74a486..abf9cf1ca70 100755 --- a/scripts/travis/install-deps.sh +++ b/scripts/travis/install-deps.sh @@ -106,7 +106,7 @@ if $WITH_CUDA ; then ln -s /usr/local/cuda-$CUDA_VERSION /usr/local/cuda if $WITH_CUDNN ; then - apt-get install -y --no-install-recommends libcudnn6-dev + apt-get install -y --no-install-recommends libcudnn7-dev fi fi From 243cd8948520e83740be328466352b10e6983aec Mon Sep 17 00:00:00 2001 From: Noiredd Date: Wed, 11 Oct 2017 11:04:18 +0200 Subject: [PATCH 096/144] Add absolute tolerance to test_net.py to prevent random Travis fails --- 
python/caffe/test/test_net.py | 50 +++++++++++++++++------------------ 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/python/caffe/test/test_net.py b/python/caffe/test/test_net.py index afd27690981..ee1d38c39db 100644 --- a/python/caffe/test/test_net.py +++ b/python/caffe/test/test_net.py @@ -72,41 +72,41 @@ def test_forward_backward(self): self.net.backward() def test_forward_start_end(self): - conv_blob=self.net.blobs['conv']; - ip_blob=self.net.blobs['ip_blob']; - sample_data=np.random.uniform(size=conv_blob.data.shape); - sample_data=sample_data.astype(np.float32); - conv_blob.data[:]=sample_data; - forward_blob=self.net.forward(start='ip',end='ip'); - self.assertIn('ip_blob',forward_blob); - - manual_forward=[]; + conv_blob=self.net.blobs['conv'] + ip_blob=self.net.blobs['ip_blob'] + sample_data=np.random.uniform(size=conv_blob.data.shape) + sample_data=sample_data.astype(np.float32) + conv_blob.data[:]=sample_data + forward_blob=self.net.forward(start='ip',end='ip') + self.assertIn('ip_blob',forward_blob) + + manual_forward=[] for i in range(0,conv_blob.data.shape[0]): dot=np.dot(self.net.params['ip'][0].data, - conv_blob.data[i].reshape(-1)); - manual_forward.append(dot+self.net.params['ip'][1].data); - manual_forward=np.array(manual_forward); + conv_blob.data[i].reshape(-1)) + manual_forward.append(dot+self.net.params['ip'][1].data) + manual_forward=np.array(manual_forward) - np.testing.assert_allclose(ip_blob.data,manual_forward,rtol=1e-3); + np.testing.assert_allclose(ip_blob.data,manual_forward,rtol=1e-3,atol=1e-5) def test_backward_start_end(self): - conv_blob=self.net.blobs['conv']; - ip_blob=self.net.blobs['ip_blob']; + conv_blob=self.net.blobs['conv'] + ip_blob=self.net.blobs['ip_blob'] sample_data=np.random.uniform(size=ip_blob.data.shape) - sample_data=sample_data.astype(np.float32); - ip_blob.diff[:]=sample_data; - backward_blob=self.net.backward(start='ip',end='ip'); - self.assertIn('conv',backward_blob); + 
sample_data=sample_data.astype(np.float32) + ip_blob.diff[:]=sample_data + backward_blob=self.net.backward(start='ip',end='ip') + self.assertIn('conv',backward_blob) - manual_backward=[]; + manual_backward=[] for i in range(0,conv_blob.data.shape[0]): dot=np.dot(self.net.params['ip'][0].data.transpose(), - sample_data[i].reshape(-1)); - manual_backward.append(dot); - manual_backward=np.array(manual_backward); - manual_backward=manual_backward.reshape(conv_blob.data.shape); + sample_data[i].reshape(-1)) + manual_backward.append(dot) + manual_backward=np.array(manual_backward) + manual_backward=manual_backward.reshape(conv_blob.data.shape) - np.testing.assert_allclose(conv_blob.diff,manual_backward,rtol=1e-3); + np.testing.assert_allclose(conv_blob.diff,manual_backward,rtol=1e-3,atol=1e-5) def test_clear_param_diffs(self): # Run a forward/backward step to have non-zero diffs From 79ddda7e931f90f1b648e5372ccbaf1a35e88fb5 Mon Sep 17 00:00:00 2001 From: Mikhail Antonenka Date: Tue, 17 Oct 2017 18:18:32 +0300 Subject: [PATCH 097/144] infogain loss: fix bottom blobs description --- include/caffe/layers/infogain_loss_layer.hpp | 21 ++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/include/caffe/layers/infogain_loss_layer.hpp b/include/caffe/layers/infogain_loss_layer.hpp index edecde829ad..3b3caa27c38 100644 --- a/include/caffe/layers/infogain_loss_layer.hpp +++ b/include/caffe/layers/infogain_loss_layer.hpp @@ -13,20 +13,21 @@ namespace caffe { /** - * @brief A generalization of MultinomialLogisticLossLayer that takes an + * @brief A generalization of SoftmaxWithLossLayer that takes an * "information gain" (infogain) matrix specifying the "value" of all label * pairs. * - * Equivalent to the MultinomialLogisticLossLayer if the infogain matrix is the + * Equivalent to the SoftmaxWithLossLayer if the infogain matrix is the * identity. 
* * @param bottom input Blob vector (length 2-3) * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$ \hat{p} @f$, a Blob with values in - * @f$ [0, 1] @f$ indicating the predicted probability of each of the - * @f$ K = CHW @f$ classes. Each prediction vector @f$ \hat{p}_n @f$ - * should sum to 1 as in a probability distribution: @f$ - * \forall n \sum\limits_{k=1}^K \hat{p}_{nk} = 1 @f$. + * the predictions @f$ x @f$, a Blob with values in + * @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of + * the @f$ K = CHW @f$ classes. This layer maps these scores to a + * probability distribution over classes using the softmax function + * @f$ \hat{p}_{nk} = \exp(x_{nk}) / + * \left[\sum_{k'} \exp(x_{nk'})\right] @f$ (see SoftmaxLayer). * -# @f$ (N \times 1 \times 1 \times 1) @f$ * the labels @f$ l @f$, an integer-valued Blob with values * @f$ l_n \in [0, 1, 2, ..., K - 1] @f$ @@ -35,7 +36,7 @@ namespace caffe { * (\b optional) the infogain matrix @f$ H @f$. This must be provided as * the third bottom blob input if not provided as the infogain_mat in the * InfogainLossParameter. If @f$ H = I @f$, this layer is equivalent to the - * MultinomialLogisticLossLayer. + * SoftmaxWithLossLayer. 
* @param top output Blob vector (length 1) * -# @f$ (1 \times 1 \times 1 \times 1) @f$ * the computed infogain multinomial logistic loss: @f$ E = @@ -98,8 +99,8 @@ class InfogainLossLayer : public LossLayer { * infogain matrix, if provided as bottom[2]) * @param bottom input Blob vector (length 2-3) * -# @f$ (N \times C \times H \times W) @f$ - * the predictions @f$ \hat{p} @f$; Backward computes diff - * @f$ \frac{\partial E}{\partial \hat{p}} @f$ + * the predictions @f$ x @f$; Backward computes diff + * @f$ \frac{\partial E}{\partial x} @f$ * -# @f$ (N \times 1 \times 1 \times 1) @f$ * the labels -- ignored as we can't compute their error gradients * -# @f$ (1 \times 1 \times K \times K) @f$ From bfc638d502b8b991e6ef5aaaa1946049312e6336 Mon Sep 17 00:00:00 2001 From: Geunsik Lim Date: Thu, 29 Jun 2017 13:12:53 +0900 Subject: [PATCH 098/144] Fix: mean shape in compatible with input shape This commit is to fix issue #5718. * reference: 1. https://groups.google.com/forum/#!topic/caffe-users/nBpWJCcJoCU 2. 
https://stackoverflow.com/questions/28692209/using-gpu-despite-setting-cpu-only-yielding-unexpected-keyword-argument Signed-off-by: Geunsik Lim --- python/caffe/io.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/caffe/io.py b/python/caffe/io.py index 966c164cffd..1efb7409603 100644 --- a/python/caffe/io.py +++ b/python/caffe/io.py @@ -256,7 +256,12 @@ def set_mean(self, in_, mean): if len(ms) != 3: raise ValueError('Mean shape invalid') if ms != self.inputs[in_][1:]: - raise ValueError('Mean shape incompatible with input shape.') + in_shape = self.inputs[in_][1:] + m_min, m_max = mean.min(), mean.max() + normal_mean = (mean - m_min) / (m_max - m_min) + mean = resize_image(normal_mean.transpose((1,2,0)), + in_shape[1:]).transpose((2,0,1)) * \ + (m_max - m_min) + m_min self.mean[in_] = mean def set_input_scale(self, in_, scale): From fbdc1e103bf6f55c9dabe94a2530fa62e65f506b Mon Sep 17 00:00:00 2001 From: sclarkson Date: Sat, 11 Nov 2017 18:20:11 -0800 Subject: [PATCH 099/144] Fix Makefile parallel builds missing protobuf header --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c6d5685b140..c85c695acff 100644 --- a/Makefile +++ b/Makefile @@ -577,7 +577,7 @@ $(STATIC_NAME): $(OBJS) | $(LIB_BUILD_DIR) @ echo AR -o $@ $(Q)ar rcs $@ $(OBJS) -$(BUILD_DIR)/%.o: %.cpp | $(ALL_BUILD_DIRS) +$(BUILD_DIR)/%.o: %.cpp $(PROTO_GEN_HEADER) | $(ALL_BUILD_DIRS) @ echo CXX $< $(Q)$(CXX) $< $(CXXFLAGS) -c -o $@ 2> $@.$(WARNS_EXT) \ || (cat $@.$(WARNS_EXT); exit 1) From 95d000153dc668bf86f24eb46e8982701fa7c59a Mon Sep 17 00:00:00 2001 From: Noiredd Date: Tue, 28 Nov 2017 11:00:51 +0100 Subject: [PATCH 100/144] Makefile example comments for CUDA 9.0 compatibility --- Makefile.config.example | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.config.example b/Makefile.config.example index d552b38a97c..79905935f15 100644 --- a/Makefile.config.example +++ b/Makefile.config.example @@ -33,6 +33,7 
@@ CUDA_DIR := /usr/local/cuda # CUDA architecture setting: going with all of them. # For CUDA < 6.0, comment the *_50 through *_61 lines for compatibility. # For CUDA < 8.0, comment the *_60 and *_61 lines for compatibility. +# For CUDA >= 9.0, comment the *_20 and *_21 lines for compatibility. CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \ -gencode arch=compute_20,code=sm_21 \ -gencode arch=compute_30,code=sm_30 \ From 3be5297ba3c512abca0dad4541ecb1f78d640542 Mon Sep 17 00:00:00 2001 From: Noiredd Date: Wed, 29 Nov 2017 11:48:29 +0100 Subject: [PATCH 101/144] Added count==0 safeguard to CPU accuracy calculation --- src/caffe/layers/accuracy_layer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/caffe/layers/accuracy_layer.cpp b/src/caffe/layers/accuracy_layer.cpp index 392829e6db8..b6d95b54a5d 100644 --- a/src/caffe/layers/accuracy_layer.cpp +++ b/src/caffe/layers/accuracy_layer.cpp @@ -86,7 +86,7 @@ void AccuracyLayer::Forward_cpu(const vector*>& bottom, } // LOG(INFO) << "Accuracy: " << accuracy; - top[0]->mutable_cpu_data()[0] = accuracy / count; + top[0]->mutable_cpu_data()[0] = (count == 0) ? 
0 : (accuracy / count); if (top.size() > 1) { for (int i = 0; i < top[1]->count(); ++i) { top[1]->mutable_cpu_data()[i] = From c23b3563f0fa2999578c1a8b3f32dc9cdec5a037 Mon Sep 17 00:00:00 2001 From: YaYaB Date: Tue, 12 Dec 2017 16:16:59 +0000 Subject: [PATCH 102/144] Add check values of gamma and stepsize to avoid unexplained core dump --- src/caffe/solvers/sgd_solver.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index ad6abe54a0a..1d52beb0636 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -30,12 +30,16 @@ Dtype SGDSolver::GetLearningRate() { if (lr_policy == "fixed") { rate = this->param_.base_lr(); } else if (lr_policy == "step") { + CHECK_GT(this->param_.stepsize(), 0); this->current_step_ = this->iter_ / this->param_.stepsize(); + CHECK_GE(this->param_.gamma(), 0); rate = this->param_.base_lr() * pow(this->param_.gamma(), this->current_step_); } else if (lr_policy == "exp") { + CHECK_GE(this->param_.gamma(), 0); rate = this->param_.base_lr() * pow(this->param_.gamma(), this->iter_); } else if (lr_policy == "inv") { + CHECK_GE(this->param_.gamma(), 0); rate = this->param_.base_lr() * pow(Dtype(1) + this->param_.gamma() * this->iter_, - this->param_.power()); @@ -46,6 +50,7 @@ Dtype SGDSolver::GetLearningRate() { LOG(INFO) << "MultiStep Status: Iteration " << this->iter_ << ", step = " << this->current_step_; } + CHECK_GE(this->param_.gamma(), 0); rate = this->param_.base_lr() * pow(this->param_.gamma(), this->current_step_); } else if (lr_policy == "poly") { @@ -53,6 +58,8 @@ Dtype SGDSolver::GetLearningRate() { (Dtype(this->iter_) / Dtype(this->param_.max_iter())), this->param_.power()); } else if (lr_policy == "sigmoid") { + CHECK_GE(this->param_.gamma(), 0); + CHECK_GT(this->param_.stepsize(), 0); rate = this->param_.base_lr() * (Dtype(1.) / (Dtype(1.) 
+ exp(-this->param_.gamma() * (Dtype(this->iter_) - Dtype(this->param_.stepsize()))))); From c98de53b7817c732b482c2fa810f09c260c58857 Mon Sep 17 00:00:00 2001 From: Pavel Grunt Date: Wed, 20 Dec 2017 13:07:47 +0100 Subject: [PATCH 103/144] Cuda.cmake: Fix a typo in a comment --- cmake/Cuda.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake index b2b19e8b669..54e26fd5c8e 100644 --- a/cmake/Cuda.cmake +++ b/cmake/Cuda.cmake @@ -232,7 +232,7 @@ endfunction() ################################################################################################ find_package(CUDA 5.5 QUIET) -find_cuda_helper_libs(curand) # cmake 2.8.7 compartibility which doesn't search for curand +find_cuda_helper_libs(curand) # cmake 2.8.7 compatibility which doesn't search for curand if(NOT CUDA_FOUND) return() From 7d2ecf93a12ac823850904e48fb7d3c916b73934 Mon Sep 17 00:00:00 2001 From: Toby Thain Date: Mon, 15 Jan 2018 13:16:05 -0500 Subject: [PATCH 104/144] Simplify pip invocation. --- docs/installation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/installation.md b/docs/installation.md index 6104cc247a8..416b9d64ada 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -80,7 +80,7 @@ The main requirements are `numpy` and `boost.python` (provided by boost). `panda You can install the dependencies with - for req in $(cat requirements.txt); do pip install $req; done + pip install -r requirements.txt but we suggest first installing the [Anaconda](https://store.continuum.io/cshop/anaconda/) Python distribution, which provides most of the necessary packages, as well as the `hdf5` library dependency. 
From ecdc289cc2dfbaa0e2b70ca545cb41a1fb55e79c Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Sat, 27 Jan 2018 16:37:06 -0800 Subject: [PATCH 105/144] docs: switch to official AWS AMI --- docs/installation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/installation.md b/docs/installation.md index 6104cc247a8..2c4b30d392d 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -17,7 +17,7 @@ The official Makefile and `Makefile.config` build are complemented by a [communi - [RHEL / CentOS / Fedora installation](install_yum.html) - [Windows](https://github.com/BVLC/caffe/tree/windows) *see the Windows branch led by Guillaume Dumont* - [OpenCL](https://github.com/BVLC/caffe/tree/opencl) *see the OpenCL branch led by Fabian Tschopp* -- [AWS AMI](https://github.com/bitfusionio/amis/tree/master/awsmrkt-bfboost-ubuntu14-cuda75-caffe) *pre-configured for AWS* +- [AWS AMI](https://aws.amazon.com/marketplace/pp/B01M0AXXQB) *official deep learning amazon machine image from AWS* **Overview**: From 25c217c1a49239bbcaf45aa6c60ab9f0354819bb Mon Sep 17 00:00:00 2001 From: linziyi Date: Fri, 19 Jan 2018 03:07:45 +0800 Subject: [PATCH 106/144] clear scratch use of loss bottom diffs --- src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu | 3 +++ src/caffe/layers/softmax_loss_layer.cu | 2 ++ 2 files changed, 5 insertions(+) diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu index b9877e6a3f6..14e247f196d 100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu @@ -69,6 +69,9 @@ void SigmoidCrossEntropyLossLayer::Forward_gpu( caffe_gpu_asum(count, loss_data, &loss); normalizer_ = get_normalizer(normalization_, valid_count); top[0]->mutable_cpu_data()[0] = loss / normalizer_; + + caffe_gpu_set(bottom[0]->count(), Dtype(0), bottom[0]->mutable_gpu_diff()); + caffe_gpu_set(bottom[1]->count(), Dtype(0), 
bottom[1]->mutable_gpu_diff()); } template diff --git a/src/caffe/layers/softmax_loss_layer.cu b/src/caffe/layers/softmax_loss_layer.cu index 660e1b39fe0..6c8db142f31 100644 --- a/src/caffe/layers/softmax_loss_layer.cu +++ b/src/caffe/layers/softmax_loss_layer.cu @@ -61,6 +61,8 @@ void SoftmaxWithLossLayer::Forward_gpu( if (top.size() == 2) { top[1]->ShareData(prob_); } + + caffe_gpu_set(bottom[0]->count(), Dtype(0), bottom[0]->mutable_gpu_diff()); } template From 12a70383088b30fbdb5b148540d743dd6d34f47c Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Sun, 28 Jan 2018 16:42:07 -0800 Subject: [PATCH 107/144] clear scratch use of accuracy bottom diff --- src/caffe/layers/accuracy_layer.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/src/caffe/layers/accuracy_layer.cu b/src/caffe/layers/accuracy_layer.cu index a8cff936ccb..f6de50a7a7e 100644 --- a/src/caffe/layers/accuracy_layer.cu +++ b/src/caffe/layers/accuracy_layer.cu @@ -134,6 +134,7 @@ void AccuracyLayer::Forward_gpu( } } } + caffe_gpu_set(bottom[0]->count(), Dtype(0), bottom[0]->mutable_gpu_diff()); } From 41165906f9f1392563c5a23de7fedbb1b70293e2 Mon Sep 17 00:00:00 2001 From: Evan Shelhamer Date: Sun, 28 Jan 2018 16:58:39 -0800 Subject: [PATCH 108/144] explain use of scratch diffs in comments a few layers make use of otherwise unused diffs to accumulate results, but unless the diffs are cleared in forward this contaminates the gradients when these layers share a bottom and their backward is skipped. 
--- src/caffe/layers/accuracy_layer.cu | 6 +++--- src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu | 6 +++--- src/caffe/layers/softmax_loss_layer.cu | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/caffe/layers/accuracy_layer.cu b/src/caffe/layers/accuracy_layer.cu index f6de50a7a7e..904aab422af 100644 --- a/src/caffe/layers/accuracy_layer.cu +++ b/src/caffe/layers/accuracy_layer.cu @@ -71,9 +71,8 @@ void AccuracyLayer::Forward_gpu( const int dim = bottom[0]->count() / outer_num_; const int num_labels = bottom[0]->shape(label_axis_); const int nthreads = outer_num_ * inner_num_; - // Since this memory is not used for anything, - // we use it here to avoid having to allocate new GPU - // memory to accumulate intermediate results in the kernel. + // Since this memory is not used for anything, we use it here to avoid having + // to allocate new GPU memory to accumulate intermediate results. Dtype* acc_data = bottom[0]->mutable_gpu_diff(); if (top.size() == 1) { // simple case - report only global accuracy. @@ -134,6 +133,7 @@ void AccuracyLayer::Forward_gpu( } } } + // Clear scratch memory to prevent interfering with backward (see #6202). caffe_gpu_set(bottom[0]->count(), Dtype(0), bottom[0]->mutable_gpu_diff()); } diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu index 14e247f196d..7497e4aa47d 100644 --- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu +++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu @@ -48,9 +48,8 @@ void SigmoidCrossEntropyLossLayer::Forward_gpu( // Stable version of loss computation from input data const Dtype* input_data = bottom[0]->gpu_data(); const Dtype* target = bottom[1]->gpu_data(); - // Since this memory is not used for anything until it is overwritten - // on the backward pass, we use it here to avoid having to allocate new GPU - // memory to accumulate intermediate results in the kernel. 
+ // Since this memory is not used for anything, we use it here to avoid having + // to allocate new GPU memory to accumulate intermediate results. Dtype* loss_data = bottom[0]->mutable_gpu_diff(); Dtype* count_data = bottom[1]->mutable_gpu_diff(); Dtype valid_count; @@ -70,6 +69,7 @@ void SigmoidCrossEntropyLossLayer::Forward_gpu( normalizer_ = get_normalizer(normalization_, valid_count); top[0]->mutable_cpu_data()[0] = loss / normalizer_; + // Clear scratch memory to prevent interfering with backward (see #6202). caffe_gpu_set(bottom[0]->count(), Dtype(0), bottom[0]->mutable_gpu_diff()); caffe_gpu_set(bottom[1]->count(), Dtype(0), bottom[1]->mutable_gpu_diff()); } diff --git a/src/caffe/layers/softmax_loss_layer.cu b/src/caffe/layers/softmax_loss_layer.cu index 6c8db142f31..b3c8ffa6b6c 100644 --- a/src/caffe/layers/softmax_loss_layer.cu +++ b/src/caffe/layers/softmax_loss_layer.cu @@ -36,9 +36,8 @@ void SoftmaxWithLossLayer::Forward_gpu( const Dtype* label = bottom[1]->gpu_data(); const int dim = prob_.count() / outer_num_; const int nthreads = outer_num_ * inner_num_; - // Since this memory is not used for anything until it is overwritten - // on the backward pass, we use it here to avoid having to allocate new GPU - // memory to accumulate intermediate results in the kernel. + // Since this memory is not used for anything, we use it here to avoid having + // to allocate new GPU memory to accumulate intermediate results. Dtype* loss_data = bottom[0]->mutable_gpu_diff(); // Similarly, this memory is never used elsewhere, and thus we can use it // to avoid having to allocate additional GPU memory. @@ -62,6 +61,7 @@ void SoftmaxWithLossLayer::Forward_gpu( top[1]->ShareData(prob_); } + // Clear scratch memory to prevent interfering with backward (see #6202). 
caffe_gpu_set(bottom[0]->count(), Dtype(0), bottom[0]->mutable_gpu_diff()); } From 88c96189bcbf3853b93e2b65c7b5e4948f9d5f67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw=20Dolata?= Date: Thu, 1 Feb 2018 14:43:44 +0100 Subject: [PATCH 109/144] corrected description of set_transpose in io.py --- python/caffe/io.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/caffe/io.py b/python/caffe/io.py index ed4b3bef68b..d0103d60892 100644 --- a/python/caffe/io.py +++ b/python/caffe/io.py @@ -186,13 +186,14 @@ def deprocess(self, in_, data): def set_transpose(self, in_, order): """ - Set the input channel order for e.g. RGB to BGR conversion - as needed for the reference ImageNet model. + Set the order of dimensions, e.g. to convert OpenCV's HxWxC images + into CxHxW. Parameters ---------- - in_ : which input to assign this channel order + in_ : which input to assign this dimension order order : the order to transpose the dimensions + for example (2,0,1) changes HxWxC into CxHxW and (1,2,0) reverts """ self.__check_input(in_) if len(order) != len(self.inputs[in_]) - 1: From 0050c207d5eb9b514d470ccb2125605f7adb0755 Mon Sep 17 00:00:00 2001 From: Matt Sanford Date: Fri, 2 Feb 2018 13:23:23 -0800 Subject: [PATCH 110/144] Update Classifier and Detector to avoid deprecation warning --- python/caffe/classifier.py | 2 +- python/caffe/detector.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/caffe/classifier.py b/python/caffe/classifier.py index 983760a786d..64d804be554 100644 --- a/python/caffe/classifier.py +++ b/python/caffe/classifier.py @@ -23,7 +23,7 @@ class Classifier(caffe.Net): def __init__(self, model_file, pretrained_file, image_dims=None, mean=None, input_scale=None, raw_scale=None, channel_swap=None): - caffe.Net.__init__(self, model_file, pretrained_file, caffe.TEST) + caffe.Net.__init__(self, model_file, caffe.TEST, weights=pretrained_file) # configure pre-processing in_ = self.inputs[0] diff 
--git a/python/caffe/detector.py b/python/caffe/detector.py index ef1f91730bf..ceee5d36f4c 100644 --- a/python/caffe/detector.py +++ b/python/caffe/detector.py @@ -35,7 +35,7 @@ class Detector(caffe.Net): def __init__(self, model_file, pretrained_file, mean=None, input_scale=None, raw_scale=None, channel_swap=None, context_pad=None): - caffe.Net.__init__(self, model_file, pretrained_file, caffe.TEST) + caffe.Net.__init__(self, model_file, caffe.TEST, weights=pretrained_file) # configure pre-processing in_ = self.inputs[0] From 0582f84017ba55ab50d805e05744d5fcd57502de Mon Sep 17 00:00:00 2001 From: Yegor Bedarev Date: Thu, 8 Feb 2018 18:05:21 +0700 Subject: [PATCH 111/144] Fix incorrect namespace for pycaffe submodule caffe_pb2 generated by protobuf --- Makefile | 2 +- cmake/ProtoBuf.cmake | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index c85c695acff..c823f66ef10 100644 --- a/Makefile +++ b/Makefile @@ -641,7 +641,7 @@ $(PROTO_BUILD_DIR)/%.pb.cc $(PROTO_BUILD_DIR)/%.pb.h : \ $(PY_PROTO_BUILD_DIR)/%_pb2.py : $(PROTO_SRC_DIR)/%.proto \ $(PY_PROTO_INIT) | $(PY_PROTO_BUILD_DIR) @ echo PROTOC \(python\) $< - $(Q)protoc --proto_path=$(PROTO_SRC_DIR) --python_out=$(PY_PROTO_BUILD_DIR) $< + $(Q)protoc --proto_path=src --python_out=python $< $(PY_PROTO_INIT): | $(PY_PROTO_BUILD_DIR) touch $(PY_PROTO_INIT) diff --git a/cmake/ProtoBuf.cmake b/cmake/ProtoBuf.cmake index 8005b448707..72ea3230c50 100644 --- a/cmake/ProtoBuf.cmake +++ b/cmake/ProtoBuf.cmake @@ -78,7 +78,7 @@ function(caffe_protobuf_generate_cpp_py output_dir srcs_var hdrs_var python_var) "${output_dir}/${fil_we}_pb2.py" COMMAND ${CMAKE_COMMAND} -E make_directory "${output_dir}" COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --cpp_out ${output_dir} ${_protoc_include} ${abs_fil} - COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${output_dir} ${_protoc_include} ${abs_fil} + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${PROJECT_BINARY_DIR}/include --proto_path 
${PROJECT_SOURCE_DIR}/src ${_protoc_include} ${abs_fil} DEPENDS ${abs_fil} COMMENT "Running C++/Python protocol buffer compiler on ${fil}" VERBATIM ) endforeach() From c32629435a1e5dacc8a90a309d8b255d7b629379 Mon Sep 17 00:00:00 2001 From: iovodov Date: Mon, 18 Dec 2017 22:39:03 +0300 Subject: [PATCH 112/144] Weight parameter in solver is used in caffe.exe Loading weights is moved from caffe.exe to solver class, so new "weights" solver parameter is used not only from command line but when caffe is used as library (including python) corrected formatting fixed line length more formatting corrected --- src/caffe/proto/caffe.proto | 12 +++++++++++- src/caffe/solver.cpp | 21 +++++++++++++++++++++ src/caffe/test/test_upgrade_proto.cpp | 4 ++++ tools/caffe.cpp | 23 +++++++----------------- 4 files changed, 43 insertions(+), 17 deletions(-) diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index c96966b589d..4567861fb40 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -98,7 +98,7 @@ message NetParameter { // NOTE // Update the next available ID when you add a new SolverParameter field. // -// SolverParameter next available ID: 42 (last added: layer_wise_reduce) +// SolverParameter next available ID: 43 (last added: weights) message SolverParameter { ////////////////////////////////////////////////////////////////////////////// // Specifying the train and test networks @@ -241,6 +241,16 @@ message SolverParameter { // Overlap compute and communication for data parallel training optional bool layer_wise_reduce = 41 [default = true]; + + // Path to caffemodel file(s) with pretrained weights to initialize finetuning. + // The same as command line --weights parameter for caffe train command. + // If command line --weights parameter is specified, it has higher priority + // and overwrites this one(s). + // If --snapshot command line parameter is specified, this one(s) are ignored.
+ // If several model files are expected, they can be listed in one + // weights parameter separated by ',' (like in a command string) or + // in repeated weights parameters separately. + repeated string weights = 42; } // A message that stores the solver snapshots diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index 044269371ad..d229acff485 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -3,6 +3,7 @@ #include #include +#include "boost/algorithm/string.hpp" #include "caffe/solver.hpp" #include "caffe/util/format.hpp" #include "caffe/util/hdf5.hpp" @@ -59,6 +60,20 @@ void Solver::Init(const SolverParameter& param) { current_step_ = 0; } +// Load weights from the caffemodel(s) specified in "weights" solver parameter +// into the train and test nets. +template +void LoadNetWeights(shared_ptr > net, + const std::string& model_list) { + std::vector model_names; + boost::split(model_names, model_list, boost::is_any_of(",")); + for (int i = 0; i < model_names.size(); ++i) { + boost::trim(model_names[i]); + LOG(INFO) << "Finetuning from " << model_names[i]; + net->CopyTrainedLayersFrom(model_names[i]); + } +} + template void Solver::InitTrainNet() { const int num_train_nets = param_.has_net() + param_.has_net_param() + @@ -98,6 +113,9 @@ void Solver::InitTrainNet() { net_state.MergeFrom(param_.train_state()); net_param.mutable_state()->CopyFrom(net_state); net_.reset(new Net(net_param)); + for (int w_idx = 0; w_idx < param_.weights_size(); ++w_idx) { + LoadNetWeights(net_, param_.weights(w_idx)); + } } template @@ -173,6 +191,9 @@ void Solver::InitTestNets() { << "Creating test net (#" << i << ") specified by " << sources[i]; test_nets_[i].reset(new Net(net_params[i])); test_nets_[i]->set_debug_info(param_.debug_info()); + for (int w_idx = 0; w_idx < param_.weights_size(); ++w_idx) { + LoadNetWeights(test_nets_[i], param_.weights(w_idx)); + } } } diff --git a/src/caffe/test/test_upgrade_proto.cpp b/src/caffe/test/test_upgrade_proto.cpp index 
9dcc2aa55ec..769112ebfe7 100644 --- a/src/caffe/test/test_upgrade_proto.cpp +++ b/src/caffe/test/test_upgrade_proto.cpp @@ -2952,6 +2952,8 @@ TEST_F(SolverTypeUpgradeTest, TestSimple) { for (int i = 0; i < 6; ++i) { const string& input_proto = "net: 'examples/mnist/lenet_train_test.prototxt' " + "weights: 'examples/mnist/lenet_train_test1.caffemodel' " + "weights: 'examples/mnist/lenet_train_test2.caffemodel' " "test_iter: 100 " "test_interval: 500 " "base_lr: 0.01 " @@ -2968,6 +2970,8 @@ TEST_F(SolverTypeUpgradeTest, TestSimple) { "solver_type: " + std::string(old_type_vec[i]) + " "; const string& expected_output_proto = "net: 'examples/mnist/lenet_train_test.prototxt' " + "weights: 'examples/mnist/lenet_train_test1.caffemodel' " + "weights: 'examples/mnist/lenet_train_test2.caffemodel' " "test_iter: 100 " "test_interval: 500 " "base_lr: 0.01 " diff --git a/tools/caffe.cpp b/tools/caffe.cpp index 3587d8aa1be..389cfb8a99e 100644 --- a/tools/caffe.cpp +++ b/tools/caffe.cpp @@ -146,20 +146,6 @@ int device_query() { } RegisterBrewFunction(device_query); -// Load the weights from the specified caffemodel(s) into the train and -// test nets. -void CopyLayers(caffe::Solver* solver, const std::string& model_list) { - std::vector model_names; - boost::split(model_names, model_list, boost::is_any_of(",") ); - for (int i = 0; i < model_names.size(); ++i) { - LOG(INFO) << "Finetuning from " << model_names[i]; - solver->net()->CopyTrainedLayersFrom(model_names[i]); - for (int j = 0; j < solver->test_nets().size(); ++j) { - solver->test_nets()[j]->CopyTrainedLayersFrom(model_names[i]); - } - } -} - // Translate the signal effect the user specified on the command-line to the // corresponding enumeration. 
caffe::SolverAction::Enum GetRequestedAction( @@ -233,6 +219,13 @@ int train() { GetRequestedAction(FLAGS_sigint_effect), GetRequestedAction(FLAGS_sighup_effect)); + if (FLAGS_snapshot.size()) { + solver_param.clear_weights(); + } else if (FLAGS_weights.size()) { + solver_param.clear_weights(); + solver_param.add_weights(FLAGS_weights); + } + shared_ptr > solver(caffe::SolverRegistry::CreateSolver(solver_param)); @@ -241,8 +234,6 @@ int train() { if (FLAGS_snapshot.size()) { LOG(INFO) << "Resuming from " << FLAGS_snapshot; solver->Restore(FLAGS_snapshot.c_str()); - } else if (FLAGS_weights.size()) { - CopyLayers(solver.get(), FLAGS_weights); } LOG(INFO) << "Starting Optimization"; From 6fa4c62dcca954b7f8ae26e7f7314e235dd6a3b4 Mon Sep 17 00:00:00 2001 From: iovodov Date: Sat, 13 Jan 2018 17:19:45 +0300 Subject: [PATCH 113/144] Automatic replacement of snapshot_prefix parameter if it is empty or points to a directory. See issue #6110 proposed improvement No.2 --- src/caffe/proto/caffe.proto | 6 +++++- src/caffe/util/upgrade_proto.cpp | 21 +++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 4567861fb40..22764abc33f 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -186,7 +186,11 @@ message SolverParameter { optional float clip_gradients = 35 [default = -1]; optional int32 snapshot = 14 [default = 0]; // The snapshot interval - optional string snapshot_prefix = 15; // The prefix for the snapshot. + // The prefix for the snapshot. + // If not set then is replaced by prototxt file path without extension. + // If is set to directory then is augmented by prototxt file name + // without extension. + optional string snapshot_prefix = 15; // whether to snapshot diff in the results or not. Snapshotting diff will help // debugging but the final protocol buffer size will be much larger.
optional bool snapshot_diff = 16 [default = false]; diff --git a/src/caffe/util/upgrade_proto.cpp b/src/caffe/util/upgrade_proto.cpp index 94771c8c050..ad40b73d295 100644 --- a/src/caffe/util/upgrade_proto.cpp +++ b/src/caffe/util/upgrade_proto.cpp @@ -2,6 +2,8 @@ #include #include +#include + #include #include @@ -1095,12 +1097,31 @@ bool UpgradeSolverAsNeeded(const string& param_file, SolverParameter* param) { return success; } +// Replaces snapshot_prefix of SolverParameter if it is not specified +// or is set to directory +void UpgradeSnapshotPrefixProperty(const string& param_file, + SolverParameter* param) { + using boost::filesystem::path; + using boost::filesystem::is_directory; + if (!param->has_snapshot_prefix()) { + param->set_snapshot_prefix(path(param_file).replace_extension().string()); + LOG(INFO) << "snapshot_prefix was not specified and is set to " + + param->snapshot_prefix(); + } else if (is_directory(param->snapshot_prefix())) { + param->set_snapshot_prefix((path(param->snapshot_prefix()) / + path(param_file).stem()).string()); + LOG(INFO) << "snapshot_prefix was a directory and is replaced to " + + param->snapshot_prefix(); + } +} + // Read parameters from a file into a SolverParameter proto message. 
void ReadSolverParamsFromTextFileOrDie(const string& param_file, SolverParameter* param) { CHECK(ReadProtoFromTextFile(param_file, param)) << "Failed to parse SolverParameter file: " << param_file; UpgradeSolverAsNeeded(param_file, param); + UpgradeSnapshotPrefixProperty(param_file, param); } } // namespace caffe From 37e4289024d80632e2c721e865c14be63aab9d8e Mon Sep 17 00:00:00 2001 From: jasjuang Date: Tue, 13 Feb 2018 16:57:18 -0800 Subject: [PATCH 114/144] fix cuda 9.1 compilation --- cmake/Cuda.cmake | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake index 54e26fd5c8e..9325674a6a0 100644 --- a/cmake/Cuda.cmake +++ b/cmake/Cuda.cmake @@ -109,6 +109,12 @@ function(caffe_select_nvcc_arch_flags out_variable) set(__nvcc_flags "") set(__nvcc_archs_readable "") + string(COMPARE GREATER_EQUAL "${CUDA_VERSION}" "9.1" iscudanewerthan91) + if(iscudanewerthan91) + string(REPLACE "21(20)" "" __cuda_arch_bin "${__cuda_arch_bin}") + string(REPLACE "20" "" __cuda_arch_bin "${__cuda_arch_bin}") + endif() + # Tell NVCC to add binaries for the specified GPUs foreach(__arch ${__cuda_arch_bin}) if(__arch MATCHES "([0-9]+)\\(([0-9]+)\\)") From b3deb95e75f6890c08e7aa60193ea304614b24ea Mon Sep 17 00:00:00 2001 From: yanchen036 Date: Mon, 2 Jan 2017 13:33:49 +0800 Subject: [PATCH 115/144] bug fix: ext should not include the '.' --- src/caffe/util/io.cpp | 2 +- tools/convert_imageset.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/caffe/util/io.cpp b/src/caffe/util/io.cpp index 835d2d4e4ff..5295d9dddb9 100644 --- a/src/caffe/util/io.cpp +++ b/src/caffe/util/io.cpp @@ -106,7 +106,7 @@ cv::Mat ReadImageToCVMat(const string& filename) { static bool matchExt(const std::string & fn, std::string en) { size_t p = fn.rfind('.'); - std::string ext = p != fn.npos ? fn.substr(p) : fn; + std::string ext = p != fn.npos ? 
fn.substr(p+1) : fn; std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); std::transform(en.begin(), en.end(), en.begin(), ::tolower); if ( ext == en ) diff --git a/tools/convert_imageset.cpp b/tools/convert_imageset.cpp index 90cdb15d427..9c5d09f9bef 100644 --- a/tools/convert_imageset.cpp +++ b/tools/convert_imageset.cpp @@ -115,7 +115,7 @@ int main(int argc, char** argv) { size_t p = fn.rfind('.'); if ( p == fn.npos ) LOG(WARNING) << "Failed to guess the encoding of '" << fn << "'"; - enc = fn.substr(p); + enc = fn.substr(p+1); std::transform(enc.begin(), enc.end(), enc.begin(), ::tolower); } status = ReadImageToDatum(root_folder + lines[line_id].first, From ced55b009ae4fd6c0685543a013b1439da5879ba Mon Sep 17 00:00:00 2001 From: knsong Date: Sat, 17 Feb 2018 15:56:32 +0800 Subject: [PATCH 116/144] Fix compatibility for ND convolution --- include/caffe/filler.hpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp index bb92ded780f..e3e86a52633 100644 --- a/include/caffe/filler.hpp +++ b/include/caffe/filler.hpp @@ -108,9 +108,9 @@ class PositiveUnitballFiller : public Filler { caffe_rng_uniform(blob->count(), 0, 1, blob->mutable_cpu_data()); // We expect the filler to not be called very frequently, so we will // just use a simple implementation - int dim = blob->count() / blob->num(); + int dim = blob->count() / blob->shape(0); CHECK(dim); - for (int i = 0; i < blob->num(); ++i) { + for (int i = 0; i < blob->shape(0); ++i) { Dtype sum = 0; for (int j = 0; j < dim; ++j) { sum += data[i * dim + j]; @@ -147,8 +147,9 @@ class XavierFiller : public Filler { : Filler(param) {} virtual void Fill(Blob* blob) { CHECK(blob->count()); - int fan_in = blob->count() / blob->num(); - int fan_out = blob->count() / blob->channels(); + int fan_in = blob->count() / blob->shape(0); + // Compatible for ND Convolution + int fan_out = blob->count() / blob->shape(1); Dtype n = fan_in; // default 
to fan_in if (this->filler_param_.variance_norm() == FillerParameter_VarianceNorm_AVERAGE) { @@ -189,8 +190,9 @@ class MSRAFiller : public Filler { : Filler(param) {} virtual void Fill(Blob* blob) { CHECK(blob->count()); - int fan_in = blob->count() / blob->num(); - int fan_out = blob->count() / blob->channels(); + int fan_in = blob->count() / blob->shape(0); + // Compatible for ND Convolution + int fan_out = blob->count() / blob->shape(1); Dtype n = fan_in; // default to fan_in if (this->filler_param_.variance_norm() == FillerParameter_VarianceNorm_AVERAGE) { From 8e09610f49e8af8a161b3996e6406d0fd101e0e3 Mon Sep 17 00:00:00 2001 From: Noiredd Date: Fri, 23 Feb 2018 14:45:59 +0100 Subject: [PATCH 117/144] Remove legacy tools --- tools/device_query.cpp | 7 ------- tools/finetune_net.cpp | 7 ------- tools/net_speed_benchmark.cpp | 7 ------- tools/test_net.cpp | 7 ------- tools/train_net.cpp | 7 ------- 5 files changed, 35 deletions(-) delete mode 100644 tools/device_query.cpp delete mode 100644 tools/finetune_net.cpp delete mode 100644 tools/net_speed_benchmark.cpp delete mode 100644 tools/test_net.cpp delete mode 100644 tools/train_net.cpp diff --git a/tools/device_query.cpp b/tools/device_query.cpp deleted file mode 100644 index 03799e52b53..00000000000 --- a/tools/device_query.cpp +++ /dev/null @@ -1,7 +0,0 @@ -#include "caffe/common.hpp" - -int main(int argc, char** argv) { - LOG(FATAL) << "Deprecated. Use caffe device_query " - "[--device_id=0] instead."; - return 0; -} diff --git a/tools/finetune_net.cpp b/tools/finetune_net.cpp deleted file mode 100644 index 81c0c354dbf..00000000000 --- a/tools/finetune_net.cpp +++ /dev/null @@ -1,7 +0,0 @@ -#include "caffe/caffe.hpp" - -int main(int argc, char** argv) { - LOG(FATAL) << "Deprecated. Use caffe train --solver=... " - "[--weights=...] 
instead."; - return 0; -} diff --git a/tools/net_speed_benchmark.cpp b/tools/net_speed_benchmark.cpp deleted file mode 100644 index cd16e8d0984..00000000000 --- a/tools/net_speed_benchmark.cpp +++ /dev/null @@ -1,7 +0,0 @@ -#include "caffe/caffe.hpp" - -int main(int argc, char** argv) { - LOG(FATAL) << "Deprecated. Use caffe time --model=... " - "[--iterations=50] [--gpu] [--device_id=0]"; - return 0; -} diff --git a/tools/test_net.cpp b/tools/test_net.cpp deleted file mode 100644 index 92e14eeebaf..00000000000 --- a/tools/test_net.cpp +++ /dev/null @@ -1,7 +0,0 @@ -#include "caffe/caffe.hpp" - -int main(int argc, char** argv) { - LOG(FATAL) << "Deprecated. Use caffe test --model=... " - "--weights=... instead."; - return 0; -} diff --git a/tools/train_net.cpp b/tools/train_net.cpp deleted file mode 100644 index 622bca311c8..00000000000 --- a/tools/train_net.cpp +++ /dev/null @@ -1,7 +0,0 @@ -#include "caffe/caffe.hpp" - -int main(int argc, char** argv) { - LOG(FATAL) << "Deprecated. Use caffe train --solver=... " - "[--snapshot=...] instead."; - return 0; -} From 412f18dcf204fc054b09dbfdda73b99d903149f4 Mon Sep 17 00:00:00 2001 From: Noiredd Date: Tue, 6 Mar 2018 13:39:49 +0100 Subject: [PATCH 118/144] 1D blob handling in MSRA/Xavier fillers --- include/caffe/filler.hpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp index e3e86a52633..a44773619dc 100644 --- a/include/caffe/filler.hpp +++ b/include/caffe/filler.hpp @@ -148,8 +148,10 @@ class XavierFiller : public Filler { virtual void Fill(Blob* blob) { CHECK(blob->count()); int fan_in = blob->count() / blob->shape(0); - // Compatible for ND Convolution - int fan_out = blob->count() / blob->shape(1); + // Compatibility with ND blobs + int fan_out = blob->num_axes() > 1 ? 
+ blob->count() / blob->shape(1) : + blob->count(); Dtype n = fan_in; // default to fan_in if (this->filler_param_.variance_norm() == FillerParameter_VarianceNorm_AVERAGE) { @@ -191,8 +193,10 @@ class MSRAFiller : public Filler { virtual void Fill(Blob* blob) { CHECK(blob->count()); int fan_in = blob->count() / blob->shape(0); - // Compatible for ND Convolution - int fan_out = blob->count() / blob->shape(1); + // Compatibility with ND blobs + int fan_out = blob->num_axes() > 1 ? + blob->count() / blob->shape(1) : + blob->count(); Dtype n = fan_in; // default to fan_in if (this->filler_param_.variance_norm() == FillerParameter_VarianceNorm_AVERAGE) { From 7b3ac40be38dd2ace67ed0ff481ee113a0f07211 Mon Sep 17 00:00:00 2001 From: nic25 Date: Wed, 7 Mar 2018 03:40:52 -0800 Subject: [PATCH 119/144] Add lr_mult label to the network graph in draw_net.py (#6273) draw_net.py refactoring and optional LR visualization * refactoring `get_layer_label` rewrote the function body to make it more streamlined. does not affect inputs and outputs * optionally visualize LR when drawing the network adds an option to `python/draw_net.py` that allows to visualize information about the learning rate multiplier (if relevant) when drawing the network's graph. --- python/caffe/draw.py | 144 ++++++++++++++++++++++++++++++++----------- python/draw_net.py | 6 +- 2 files changed, 112 insertions(+), 38 deletions(-) diff --git a/python/caffe/draw.py b/python/caffe/draw.py index 8411a41d1d4..0061f490bd9 100644 --- a/python/caffe/draw.py +++ b/python/caffe/draw.py @@ -59,18 +59,60 @@ def get_edge_label(layer): return edge_label -def get_layer_label(layer, rankdir): +def get_layer_lr_mult(layer): + """Get the learning rate multipliers. + + Get the learning rate multipliers for the given layer. Assumes a + Convolution/Deconvolution/InnerProduct layer. + + Parameters + ---------- + layer : caffe_pb2.LayerParameter + A Convolution, Deconvolution, or InnerProduct layer. 
+ + Returns + ------- + learning_rates : tuple of floats + the learning rate multipliers for the weights and biases. + """ + if layer.type not in ['Convolution', 'Deconvolution', 'InnerProduct']: + raise ValueError("%s layers do not have a " + "learning rate multiplier" % layer.type) + + if not hasattr(layer, 'param'): + return (1.0, 1.0) + + params = getattr(layer, 'param') + + if len(params) == 0: + return (1.0, 1.0) + + if len(params) == 1: + lrm0 = getattr(params[0],'lr_mult', 1.0) + return (lrm0, 1.0) + + if len(params) == 2: + lrm0, lrm1 = [getattr(p,'lr_mult', 1.0) for p in params] + return (lrm0, lrm1) + + raise ValueError("Could not parse the learning rate multiplier") + + +def get_layer_label(layer, rankdir, display_lrm=False): """Define node label based on layer type. Parameters ---------- - layer : ? + layer : caffe_pb2.LayerParameter rankdir : {'LR', 'TB', 'BT'} Direction of graph layout. + display_lrm : boolean, optional + If True include the learning rate multipliers in the label (default is + False). 
Returns ------- - string : + node_label : string A label for the current layer """ @@ -81,36 +123,54 @@ def get_layer_label(layer, rankdir): else: # If graph orientation is horizontal, vertical space is free and # horizontal space is not; separate words with newlines - separator = '\\n' - - if layer.type == 'Convolution' or layer.type == 'Deconvolution': - # Outer double quotes needed or else colon characters don't parse - # properly - node_label = '"%s%s(%s)%skernel size: %d%sstride: %d%spad: %d"' %\ - (layer.name, - separator, - layer.type, - separator, - layer.convolution_param.kernel_size[0] if len(layer.convolution_param.kernel_size) else 1, - separator, - layer.convolution_param.stride[0] if len(layer.convolution_param.stride) else 1, - separator, - layer.convolution_param.pad[0] if len(layer.convolution_param.pad) else 0) - elif layer.type == 'Pooling': + separator = r'\n' + + # Initializes a list of descriptors that will be concatenated into the + # `node_label` + descriptors_list = [] + # Add the layer's name + descriptors_list.append(layer.name) + # Add layer's type + if layer.type == 'Pooling': pooling_types_dict = get_pooling_types_dict() - node_label = '"%s%s(%s %s)%skernel size: %d%sstride: %d%spad: %d"' %\ - (layer.name, - separator, - pooling_types_dict[layer.pooling_param.pool], - layer.type, - separator, - layer.pooling_param.kernel_size, - separator, - layer.pooling_param.stride, - separator, - layer.pooling_param.pad) + layer_type = '(%s %s)' % (layer.type, + pooling_types_dict[layer.pooling_param.pool]) else: - node_label = '"%s%s(%s)"' % (layer.name, separator, layer.type) + layer_type = '(%s)' % layer.type + descriptors_list.append(layer_type) + + # Describe parameters for spatial operation layers + if layer.type in ['Convolution', 'Deconvolution', 'Pooling']: + if layer.type == 'Pooling': + kernel_size = layer.pooling_param.kernel_size + stride = layer.pooling_param.stride + padding = layer.pooling_param.pad + else: + kernel_size = 
layer.convolution_param.kernel_size[0] if \ + len(layer.convolution_param.kernel_size) else 1 + stride = layer.convolution_param.stride[0] if \ + len(layer.convolution_param.stride) else 1 + padding = layer.convolution_param.pad[0] if \ + len(layer.convolution_param.pad) else 0 + spatial_descriptor = separator.join([ + "kernel size: %d" % kernel_size, + "stride: %d" % stride, + "pad: %d" % padding, + ]) + descriptors_list.append(spatial_descriptor) + + # Add LR multiplier for learning layers + if display_lrm and layer.type in ['Convolution', 'Deconvolution', 'InnerProduct']: + lrm0, lrm1 = get_layer_lr_mult(layer) + if any([lrm0, lrm1]): + lr_mult = "lr mult: %.1f, %.1f" % (lrm0, lrm1) + descriptors_list.append(lr_mult) + + # Concatenate the descriptors into one label + node_label = separator.join(descriptors_list) + # Outer double quotes needed or else colon characters don't parse + # properly + node_label = '"%s"' % node_label return node_label @@ -127,7 +187,7 @@ def choose_color_by_layertype(layertype): return color -def get_pydot_graph(caffe_net, rankdir, label_edges=True, phase=None): +def get_pydot_graph(caffe_net, rankdir, label_edges=True, phase=None, display_lrm=False): """Create a data structure which represents the `caffe_net`. Parameters @@ -140,6 +200,9 @@ def get_pydot_graph(caffe_net, rankdir, label_edges=True, phase=None): phase : {caffe_pb2.Phase.TRAIN, caffe_pb2.Phase.TEST, None} optional Include layers from this network phase. If None, include all layers. (the default is None) + display_lrm : boolean, optional + If True display the learning rate multipliers when relevant (default is + False). 
Returns ------- @@ -164,7 +227,7 @@ def get_pydot_graph(caffe_net, rankdir, label_edges=True, phase=None): included = included and not layer_phase.phase == phase if not included: continue - node_label = get_layer_label(layer, rankdir) + node_label = get_layer_label(layer, rankdir, display_lrm=display_lrm) node_name = "%s_%s" % (layer.name, layer.type) if (len(layer.bottom) == 1 and len(layer.top) == 1 and layer.bottom[0] == layer.top[0]): @@ -202,7 +265,7 @@ def get_pydot_graph(caffe_net, rankdir, label_edges=True, phase=None): return pydot_graph -def draw_net(caffe_net, rankdir, ext='png', phase=None): +def draw_net(caffe_net, rankdir, ext='png', phase=None, display_lrm=False): """Draws a caffe net and returns the image string encoded using the given extension. @@ -214,16 +277,20 @@ def draw_net(caffe_net, rankdir, ext='png', phase=None): phase : {caffe_pb2.Phase.TRAIN, caffe_pb2.Phase.TEST, None} optional Include layers from this network phase. If None, include all layers. (the default is None) + display_lrm : boolean, optional + If True display the learning rate multipliers for the learning layers + (default is False). Returns ------- string : Postscript representation of the graph. """ - return get_pydot_graph(caffe_net, rankdir, phase=phase).create(format=ext) + return get_pydot_graph(caffe_net, rankdir, phase=phase, + display_lrm=display_lrm).create(format=ext) -def draw_net_to_file(caffe_net, filename, rankdir='LR', phase=None): +def draw_net_to_file(caffe_net, filename, rankdir='LR', phase=None, display_lrm=False): """Draws a caffe net, and saves it to file using the format given as the file extension. Use '.raw' to output raw text that you can manually feed to graphviz to draw graphs. @@ -238,7 +305,10 @@ def draw_net_to_file(caffe_net, filename, rankdir='LR', phase=None): phase : {caffe_pb2.Phase.TRAIN, caffe_pb2.Phase.TEST, None} optional Include layers from this network phase. If None, include all layers. 
(the default is None) + display_lrm : boolean, optional + If True display the learning rate multipliers for the learning layers + (default is False). """ ext = filename[filename.rfind('.')+1:] with open(filename, 'wb') as fid: - fid.write(draw_net(caffe_net, rankdir, ext, phase)) + fid.write(draw_net(caffe_net, rankdir, ext, phase, display_lrm)) diff --git a/python/draw_net.py b/python/draw_net.py index dfe70d26a71..23cae30aef2 100755 --- a/python/draw_net.py +++ b/python/draw_net.py @@ -33,6 +33,10 @@ def parse_args(): 'TEST, or ALL. If ALL, then all layers are drawn ' 'regardless of phase.'), default="ALL") + parser.add_argument('--display_lrm', action='store_true', + help=('Use this flag to visualize the learning rate ' + 'multiplier, when non-zero, for the learning ' + 'layers (Convolution, Deconvolution, InnerProduct).')) args = parser.parse_args() return args @@ -51,7 +55,7 @@ def main(): elif args.phase != "ALL": raise ValueError("Unknown phase: " + args.phase) caffe.draw.draw_net_to_file(net, args.output_image_file, args.rankdir, - phase) + phase, args.display_lrm) if __name__ == '__main__': From d7da0924486f762d810e78ad55be5206c3e5198e Mon Sep 17 00:00:00 2001 From: Noiredd Date: Wed, 7 Mar 2018 13:57:56 +0100 Subject: [PATCH 120/144] PoolingLayer customizable output shape rounding mode --- include/caffe/layers/pooling_layer.hpp | 1 + src/caffe/layers/pooling_layer.cpp | 21 +++++++++++++++++---- src/caffe/proto/caffe.proto | 6 ++++++ 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/include/caffe/layers/pooling_layer.hpp b/include/caffe/layers/pooling_layer.hpp index f4d6803ba8e..38a432832cf 100644 --- a/include/caffe/layers/pooling_layer.hpp +++ b/include/caffe/layers/pooling_layer.hpp @@ -51,6 +51,7 @@ class PoolingLayer : public Layer { int height_, width_; int pooled_height_, pooled_width_; bool global_pooling_; + PoolingParameter_RoundMode round_mode_; Blob rand_idx_; Blob max_idx_; }; diff --git a/src/caffe/layers/pooling_layer.cpp 
b/src/caffe/layers/pooling_layer.cpp index 90897db0f45..f2a0885771f 100644 --- a/src/caffe/layers/pooling_layer.cpp +++ b/src/caffe/layers/pooling_layer.cpp @@ -35,6 +35,7 @@ void PoolingLayer::LayerSetUp(const vector*>& bottom, || (!pool_param.has_stride_h() && !pool_param.has_stride_w())) << "Stride is stride OR stride_h and stride_w are required."; global_pooling_ = pool_param.global_pooling(); + round_mode_ = pool_param.round_mode(); if (global_pooling_) { kernel_h_ = bottom[0]->height(); kernel_w_ = bottom[0]->width(); @@ -87,10 +88,22 @@ void PoolingLayer::Reshape(const vector*>& bottom, kernel_h_ = bottom[0]->height(); kernel_w_ = bottom[0]->width(); } - pooled_height_ = static_cast(ceil(static_cast( - height_ + 2 * pad_h_ - kernel_h_) / stride_h_)) + 1; - pooled_width_ = static_cast(ceil(static_cast( - width_ + 2 * pad_w_ - kernel_w_) / stride_w_)) + 1; + switch (round_mode_) { + case PoolingParameter_RoundMode_CEIL: + pooled_height_ = static_cast(ceil(static_cast( + height_ + 2 * pad_h_ - kernel_h_) / stride_h_)) + 1; + pooled_width_ = static_cast(ceil(static_cast( + width_ + 2 * pad_w_ - kernel_w_) / stride_w_)) + 1; + break; + case PoolingParameter_RoundMode_FLOOR: + pooled_height_ = static_cast(floor(static_cast( + height_ + 2 * pad_h_ - kernel_h_) / stride_h_)) + 1; + pooled_width_ = static_cast(floor(static_cast( + width_ + 2 * pad_w_ - kernel_w_) / stride_w_)) + 1; + break; + default: + LOG(FATAL) << "Unknown rounding mode."; + } if (pad_h_ || pad_w_) { // If we have padding, ensure that the last pooling starts strictly // inside the image (instead of at the padding); otherwise clip the last. 
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 22764abc33f..cfef3c00262 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -935,6 +935,12 @@ message PoolingParameter { // If global_pooling then it will pool over the size of the bottom by doing // kernel_h = bottom->height and kernel_w = bottom->width optional bool global_pooling = 12 [default = false]; + // How to calculate the output size - using ceil (default) or floor rounding. + enum RoundMode { + CEIL = 0; + FLOOR = 1; + } + optional RoundMode round_mode = 13 [default = CEIL]; } message PowerParameter { From 527a1c1c532cf4ef6b4a8b16ff17c7edd3467234 Mon Sep 17 00:00:00 2001 From: Noiredd Date: Thu, 8 Mar 2018 14:02:10 +0100 Subject: [PATCH 121/144] Filler testing overhaul --- src/caffe/test/test_filler.cpp | 403 +++++++++++++++++++++++++++------ 1 file changed, 331 insertions(+), 72 deletions(-) diff --git a/src/caffe/test/test_filler.cpp b/src/caffe/test/test_filler.cpp index f84d707baa0..3ecec37aa03 100644 --- a/src/caffe/test/test_filler.cpp +++ b/src/caffe/test/test_filler.cpp @@ -1,3 +1,5 @@ +#include + #include "gtest/gtest.h" #include "caffe/filler.hpp" @@ -10,11 +12,20 @@ template class ConstantFillerTest : public ::testing::Test { protected: ConstantFillerTest() - : blob_(new Blob(2, 3, 4, 5)), + : blob_(new Blob()), filler_param_() { filler_param_.set_value(10.); filler_.reset(new ConstantFiller(filler_param_)); + } + virtual void test_params(const vector& shape) { + EXPECT_TRUE(blob_); + blob_->Reshape(shape); filler_->Fill(blob_); + const int count = blob_->count(); + const Dtype* data = blob_->cpu_data(); + for (int i = 0; i < count; ++i) { + EXPECT_EQ(data[i], filler_param_.value()); + } } virtual ~ConstantFillerTest() { delete blob_; } Blob* const blob_; @@ -25,12 +36,34 @@ class ConstantFillerTest : public ::testing::Test { TYPED_TEST_CASE(ConstantFillerTest, TestDtypes); TYPED_TEST(ConstantFillerTest, TestFill) { - 
EXPECT_TRUE(this->blob_); - const int count = this->blob_->count(); - const TypeParam* data = this->blob_->cpu_data(); - for (int i = 0; i < count; ++i) { - EXPECT_EQ(data[i], this->filler_param_.value()); - } + vector blob_shape; + blob_shape.push_back(2); + blob_shape.push_back(3); + blob_shape.push_back(4); + blob_shape.push_back(5); + this->test_params(blob_shape); +} + +TYPED_TEST(ConstantFillerTest, TestFill1D) { + vector blob_shape(1, 15); + this->test_params(blob_shape); +} + +TYPED_TEST(ConstantFillerTest, TestFill2D) { + vector blob_shape; + blob_shape.push_back(8); + blob_shape.push_back(3); + this->test_params(blob_shape); +} + +TYPED_TEST(ConstantFillerTest, TestFill5D) { + vector blob_shape; + blob_shape.push_back(2); + blob_shape.push_back(3); + blob_shape.push_back(4); + blob_shape.push_back(5); + blob_shape.push_back(2); + this->test_params(blob_shape); } @@ -38,12 +71,22 @@ template class UniformFillerTest : public ::testing::Test { protected: UniformFillerTest() - : blob_(new Blob(2, 3, 4, 5)), + : blob_(new Blob()), filler_param_() { filler_param_.set_min(1.); filler_param_.set_max(2.); filler_.reset(new UniformFiller(filler_param_)); + } + virtual void test_params(const vector& shape) { + EXPECT_TRUE(blob_); + blob_->Reshape(shape); filler_->Fill(blob_); + const int count = blob_->count(); + const Dtype* data = blob_->cpu_data(); + for (int i = 0; i < count; ++i) { + EXPECT_GE(data[i], filler_param_.min()); + EXPECT_LE(data[i], filler_param_.max()); + } } virtual ~UniformFillerTest() { delete blob_; } Blob* const blob_; @@ -54,23 +97,64 @@ class UniformFillerTest : public ::testing::Test { TYPED_TEST_CASE(UniformFillerTest, TestDtypes); TYPED_TEST(UniformFillerTest, TestFill) { - EXPECT_TRUE(this->blob_); - const int count = this->blob_->count(); - const TypeParam* data = this->blob_->cpu_data(); - for (int i = 0; i < count; ++i) { - EXPECT_GE(data[i], this->filler_param_.min()); - EXPECT_LE(data[i], this->filler_param_.max()); - } + vector 
blob_shape; + blob_shape.push_back(2); + blob_shape.push_back(3); + blob_shape.push_back(4); + blob_shape.push_back(5); + this->test_params(blob_shape); +} + +TYPED_TEST(UniformFillerTest, TestFill1D) { + vector blob_shape(1, 15); + this->test_params(blob_shape); +} + +TYPED_TEST(UniformFillerTest, TestFill2D) { + vector blob_shape; + blob_shape.push_back(8); + blob_shape.push_back(3); + this->test_params(blob_shape); +} + +TYPED_TEST(UniformFillerTest, TestFill5D) { + vector blob_shape; + blob_shape.push_back(2); + blob_shape.push_back(3); + blob_shape.push_back(4); + blob_shape.push_back(5); + blob_shape.push_back(2); + this->test_params(blob_shape); } template class PositiveUnitballFillerTest : public ::testing::Test { protected: PositiveUnitballFillerTest() - : blob_(new Blob(2, 3, 4, 5)), + : blob_(new Blob()), filler_param_() { filler_.reset(new PositiveUnitballFiller(filler_param_)); + } + virtual void test_params(const vector& shape) { + EXPECT_TRUE(blob_); + blob_->Reshape(shape); filler_->Fill(blob_); + const int num = blob_->shape(0); + const int count = blob_->count(); + const int dim = count / num; + const Dtype* data = blob_->cpu_data(); + for (int i = 0; i < count; ++i) { + EXPECT_GE(data[i], 0); + EXPECT_LE(data[i], 1); + } + for (int i = 0; i < num; ++i) { + Dtype sum = Dtype(0); + for (int j = 0; j < dim; ++j) { + sum += data[i * dim + j]; + } + EXPECT_GE(sum, 0.999); + EXPECT_LE(sum, 1.001); + } } virtual ~PositiveUnitballFillerTest() { delete blob_; } Blob* const blob_; @@ -81,35 +165,78 @@ class PositiveUnitballFillerTest : public ::testing::Test { TYPED_TEST_CASE(PositiveUnitballFillerTest, TestDtypes); TYPED_TEST(PositiveUnitballFillerTest, TestFill) { - EXPECT_TRUE(this->blob_); - const int num = this->blob_->num(); - const int count = this->blob_->count(); - const int dim = count / num; - const TypeParam* data = this->blob_->cpu_data(); - for (int i = 0; i < count; ++i) { - EXPECT_GE(data[i], 0); - EXPECT_LE(data[i], 1); - } - for (int i = 
0; i < num; ++i) { - TypeParam sum = 0; - for (int j = 0; j < dim; ++j) { - sum += data[i * dim + j]; - } - EXPECT_GE(sum, 0.999); - EXPECT_LE(sum, 1.001); - } + vector blob_shape; + blob_shape.push_back(2); + blob_shape.push_back(3); + blob_shape.push_back(4); + blob_shape.push_back(5); + this->test_params(blob_shape); +} + +TYPED_TEST(PositiveUnitballFillerTest, TestFill1D) { + vector blob_shape(1, 15); + this->test_params(blob_shape); +} + +TYPED_TEST(PositiveUnitballFillerTest, TestFill2D) { + vector blob_shape; + blob_shape.push_back(8); + blob_shape.push_back(3); + this->test_params(blob_shape); +} + +TYPED_TEST(PositiveUnitballFillerTest, TestFill5D) { + vector blob_shape; + blob_shape.push_back(2); + blob_shape.push_back(3); + blob_shape.push_back(4); + blob_shape.push_back(5); + blob_shape.push_back(2); + this->test_params(blob_shape); } template class GaussianFillerTest : public ::testing::Test { protected: GaussianFillerTest() - : blob_(new Blob(2, 3, 4, 5)), + : blob_(new Blob()), filler_param_() { filler_param_.set_mean(10.); filler_param_.set_std(0.1); filler_.reset(new GaussianFiller(filler_param_)); + } + virtual void test_params(const vector& shape, + const Dtype tolerance = Dtype(5), const int repetitions = 100) { + // Tests for statistical properties should be ran multiple times. + EXPECT_TRUE(blob_); + blob_->Reshape(shape); + for (int i = 0; i < repetitions; ++i) { + test_params_iter(shape, tolerance); + } + } + virtual void test_params_iter(const vector& shape, + const Dtype tolerance) { + // This test has a configurable tolerance parameter - by default it was + // equal to 5.0 which is very loose - allowing some tuning (e.g. for tests + // on smaller blobs the actual variance will be larger than desired, so the + // tolerance can be increased to account for that). 
filler_->Fill(blob_); + const int count = blob_->count(); + const Dtype* data = blob_->cpu_data(); + Dtype mean = Dtype(0); + Dtype var = Dtype(0); + for (int i = 0; i < count; ++i) { + mean += data[i]; + var += data[i] * data[i]; + } + mean /= count; + var /= count; + var -= mean*mean; + EXPECT_GE(mean, filler_param_.mean() - filler_param_.std() * tolerance); + EXPECT_LE(mean, filler_param_.mean() + filler_param_.std() * tolerance); + Dtype target_var = filler_param_.std() * filler_param_.std(); + EXPECT_GE(var, target_var / tolerance); + EXPECT_LE(var, target_var * tolerance); } virtual ~GaussianFillerTest() { delete blob_; } Blob* const blob_; @@ -120,41 +247,62 @@ class GaussianFillerTest : public ::testing::Test { TYPED_TEST_CASE(GaussianFillerTest, TestDtypes); TYPED_TEST(GaussianFillerTest, TestFill) { - EXPECT_TRUE(this->blob_); - const int count = this->blob_->count(); - const TypeParam* data = this->blob_->cpu_data(); - TypeParam mean = 0.; - TypeParam var = 0.; - for (int i = 0; i < count; ++i) { - mean += data[i]; - var += (data[i] - this->filler_param_.mean()) * - (data[i] - this->filler_param_.mean()); - } - mean /= count; - var /= count; - // Very loose test. 
- EXPECT_GE(mean, this->filler_param_.mean() - this->filler_param_.std() * 5); - EXPECT_LE(mean, this->filler_param_.mean() + this->filler_param_.std() * 5); - TypeParam target_var = this->filler_param_.std() * this->filler_param_.std(); - EXPECT_GE(var, target_var / 5.); - EXPECT_LE(var, target_var * 5.); + vector blob_shape; + blob_shape.push_back(2); + blob_shape.push_back(3); + blob_shape.push_back(4); + blob_shape.push_back(5); + const TypeParam tolerance = TypeParam(3); // enough for a 120-element blob + this->test_params(blob_shape, tolerance); +} + +TYPED_TEST(GaussianFillerTest, TestFill1D) { + vector blob_shape(1, 25); + const TypeParam tolerance = TypeParam(5); + this->test_params(blob_shape, tolerance); +} + +TYPED_TEST(GaussianFillerTest, TestFill2D) { + vector blob_shape; + blob_shape.push_back(8); + blob_shape.push_back(3); + const TypeParam tolerance = TypeParam(5); + this->test_params(blob_shape, tolerance); +} + +TYPED_TEST(GaussianFillerTest, TestFill5D) { + vector blob_shape; + blob_shape.push_back(2); + blob_shape.push_back(3); + blob_shape.push_back(4); + blob_shape.push_back(5); + blob_shape.push_back(2); + const TypeParam tolerance = TypeParam(2); + this->test_params(blob_shape, tolerance); } template class XavierFillerTest : public ::testing::Test { protected: XavierFillerTest() - : blob_(new Blob(1000, 2, 4, 5)), + : blob_(new Blob()), filler_param_() { } virtual void test_params(FillerParameter_VarianceNorm variance_norm, + Dtype n, const vector& shape, const int repetitions = 100) { + EXPECT_TRUE(blob_); + blob_->Reshape(shape); + for (int i = 0; i < repetitions; ++i) { + test_params_iter(variance_norm, n); + } + } + virtual void test_params_iter(FillerParameter_VarianceNorm variance_norm, Dtype n) { - this->filler_param_.set_variance_norm(variance_norm); - this->filler_.reset(new XavierFiller(this->filler_param_)); - this->filler_->Fill(blob_); - EXPECT_TRUE(this->blob_); - const int count = this->blob_->count(); - const Dtype* data = 
this->blob_->cpu_data(); + filler_param_.set_variance_norm(variance_norm); + filler_.reset(new XavierFiller(filler_param_)); + filler_->Fill(blob_); + const int count = blob_->count(); + const Dtype* data = blob_->cpu_data(); Dtype mean = 0.; Dtype ex2 = 0.; for (int i = 0; i < count; ++i) { @@ -177,33 +325,92 @@ class XavierFillerTest : public ::testing::Test { TYPED_TEST_CASE(XavierFillerTest, TestDtypes); TYPED_TEST(XavierFillerTest, TestFillFanIn) { + vector blob_shape; + blob_shape.push_back(1000); + blob_shape.push_back(2); + blob_shape.push_back(4); + blob_shape.push_back(5); TypeParam n = 2*4*5; - this->test_params(FillerParameter_VarianceNorm_FAN_IN, n); + this->test_params(FillerParameter_VarianceNorm_FAN_IN, n, blob_shape); } + TYPED_TEST(XavierFillerTest, TestFillFanOut) { + vector blob_shape; + blob_shape.push_back(1000); + blob_shape.push_back(2); + blob_shape.push_back(4); + blob_shape.push_back(5); TypeParam n = 1000*4*5; - this->test_params(FillerParameter_VarianceNorm_FAN_OUT, n); + this->test_params(FillerParameter_VarianceNorm_FAN_OUT, n, blob_shape); } + TYPED_TEST(XavierFillerTest, TestFillAverage) { + vector blob_shape; + blob_shape.push_back(1000); + blob_shape.push_back(2); + blob_shape.push_back(4); + blob_shape.push_back(5); TypeParam n = (2*4*5 + 1000*4*5) / 2.0; - this->test_params(FillerParameter_VarianceNorm_AVERAGE, n); + this->test_params(FillerParameter_VarianceNorm_AVERAGE, n, blob_shape); +} + +TYPED_TEST(XavierFillerTest, TestFill1D) { + // This makes little sense but at least we will know that we can fill it + EXPECT_TRUE(this->blob_); + vector blob_shape(1, 25); + this->blob_->Reshape(blob_shape); + this->filler_param_.set_variance_norm(FillerParameter_VarianceNorm_AVERAGE); + this->filler_.reset(new XavierFiller(this->filler_param_)); + this->filler_->Fill(this->blob_); +} + +TYPED_TEST(XavierFillerTest, TestFill2D) { + EXPECT_TRUE(this->blob_); + vector blob_shape; + blob_shape.push_back(8); + blob_shape.push_back(3); + 
this->blob_->Reshape(blob_shape); + this->filler_param_.set_variance_norm(FillerParameter_VarianceNorm_AVERAGE); + this->filler_.reset(new XavierFiller(this->filler_param_)); + this->filler_->Fill(this->blob_); +} + +TYPED_TEST(XavierFillerTest, TestFill5D) { + EXPECT_TRUE(this->blob_); + vector blob_shape; + blob_shape.push_back(2); + blob_shape.push_back(3); + blob_shape.push_back(4); + blob_shape.push_back(5); + blob_shape.push_back(2); + this->blob_->Reshape(blob_shape); + this->filler_param_.set_variance_norm(FillerParameter_VarianceNorm_AVERAGE); + this->filler_.reset(new XavierFiller(this->filler_param_)); + this->filler_->Fill(this->blob_); } template class MSRAFillerTest : public ::testing::Test { protected: MSRAFillerTest() - : blob_(new Blob(1000, 2, 4, 5)), + : blob_(new Blob()), filler_param_() { } virtual void test_params(FillerParameter_VarianceNorm variance_norm, + Dtype n, const vector& shape, const int repetitions = 100) { + EXPECT_TRUE(blob_); + blob_->Reshape(shape); + for (int i = 0; i < repetitions; ++i) { + test_params_iter(variance_norm, n); + } + } + virtual void test_params_iter(FillerParameter_VarianceNorm variance_norm, Dtype n) { - this->filler_param_.set_variance_norm(variance_norm); - this->filler_.reset(new MSRAFiller(this->filler_param_)); - this->filler_->Fill(blob_); - EXPECT_TRUE(this->blob_); - const int count = this->blob_->count(); - const Dtype* data = this->blob_->cpu_data(); + filler_param_.set_variance_norm(variance_norm); + filler_.reset(new MSRAFiller(filler_param_)); + filler_->Fill(blob_); + const int count = blob_->count(); + const Dtype* data = blob_->cpu_data(); Dtype mean = 0.; Dtype ex2 = 0.; for (int i = 0; i < count; ++i) { @@ -226,16 +433,68 @@ class MSRAFillerTest : public ::testing::Test { TYPED_TEST_CASE(MSRAFillerTest, TestDtypes); TYPED_TEST(MSRAFillerTest, TestFillFanIn) { + vector blob_shape; + blob_shape.push_back(1000); + blob_shape.push_back(2); + blob_shape.push_back(4); + blob_shape.push_back(5); 
TypeParam n = 2*4*5; - this->test_params(FillerParameter_VarianceNorm_FAN_IN, n); + this->test_params(FillerParameter_VarianceNorm_FAN_IN, n, blob_shape); } + TYPED_TEST(MSRAFillerTest, TestFillFanOut) { + vector blob_shape; + blob_shape.push_back(1000); + blob_shape.push_back(2); + blob_shape.push_back(4); + blob_shape.push_back(5); TypeParam n = 1000*4*5; - this->test_params(FillerParameter_VarianceNorm_FAN_OUT, n); + this->test_params(FillerParameter_VarianceNorm_FAN_OUT, n, blob_shape); } + TYPED_TEST(MSRAFillerTest, TestFillAverage) { + vector blob_shape; + blob_shape.push_back(1000); + blob_shape.push_back(2); + blob_shape.push_back(4); + blob_shape.push_back(5); TypeParam n = (2*4*5 + 1000*4*5) / 2.0; - this->test_params(FillerParameter_VarianceNorm_AVERAGE, n); + this->test_params(FillerParameter_VarianceNorm_AVERAGE, n, blob_shape); +} + +TYPED_TEST(MSRAFillerTest, TestFill1D) { + // Like with Xavier - no checking for correctness, just if it can be filled. + EXPECT_TRUE(this->blob_); + vector blob_shape(1, 25); + this->blob_->Reshape(blob_shape); + this->filler_param_.set_variance_norm(FillerParameter_VarianceNorm_AVERAGE); + this->filler_.reset(new MSRAFiller(this->filler_param_)); + this->filler_->Fill(this->blob_); +} + +TYPED_TEST(MSRAFillerTest, TestFill2D) { + EXPECT_TRUE(this->blob_); + vector blob_shape; + blob_shape.push_back(8); + blob_shape.push_back(3); + this->blob_->Reshape(blob_shape); + this->filler_param_.set_variance_norm(FillerParameter_VarianceNorm_AVERAGE); + this->filler_.reset(new MSRAFiller(this->filler_param_)); + this->filler_->Fill(this->blob_); +} + +TYPED_TEST(MSRAFillerTest, TestFill5D) { + EXPECT_TRUE(this->blob_); + vector blob_shape; + blob_shape.push_back(2); + blob_shape.push_back(3); + blob_shape.push_back(4); + blob_shape.push_back(5); + blob_shape.push_back(2); + this->blob_->Reshape(blob_shape); + this->filler_param_.set_variance_norm(FillerParameter_VarianceNorm_AVERAGE); + this->filler_.reset(new 
MSRAFiller(this->filler_param_)); + this->filler_->Fill(this->blob_); } template From eb6291911da4e6a6a9e75a8349ec3c3591d8bb2c Mon Sep 17 00:00:00 2001 From: Noiredd Date: Fri, 9 Mar 2018 13:14:32 +0100 Subject: [PATCH 122/144] bilinear filter test refactor --- src/caffe/test/test_filler.cpp | 44 ++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/src/caffe/test/test_filler.cpp b/src/caffe/test/test_filler.cpp index 3ecec37aa03..1e6b5c21d1d 100644 --- a/src/caffe/test/test_filler.cpp +++ b/src/caffe/test/test_filler.cpp @@ -500,21 +500,25 @@ TYPED_TEST(MSRAFillerTest, TestFill5D) { template class BilinearFillerTest : public ::testing::Test { protected: - BilinearFillerTest() : filler_param_() {} - virtual void test_params(const int n) { - this->blob_ = new Blob(1000, 2, n, n); - this->filler_.reset(new BilinearFiller(this->filler_param_)); - this->filler_->Fill(blob_); - EXPECT_TRUE(this->blob_); - const int outer_num = this->blob_->count(0, 2); - const int inner_num = this->blob_->count(2, 4); - const Dtype* data = this->blob_->cpu_data(); - int f = ceil(this->blob_->width() / 2.); - Dtype c = (this->blob_->width() - 1) / (2. * f); + BilinearFillerTest() + : blob_(new Blob()), + filler_param_() { + } + virtual void test_params(const vector& shape) { + EXPECT_TRUE(blob_); + blob_->Reshape(shape); + filler_.reset(new BilinearFiller(filler_param_)); + filler_->Fill(blob_); + CHECK_EQ(blob_->num_axes(), 4); + const int outer_num = blob_->count(0, 2); + const int inner_num = blob_->count(2, 4); + const Dtype* data = blob_->cpu_data(); + int f = ceil(blob_->shape(3) / 2.); + Dtype c = (blob_->shape(3) - 1) / (2. 
* f); for (int i = 0; i < outer_num; ++i) { for (int j = 0; j < inner_num; ++j) { - Dtype x = j % this->blob_->width(); - Dtype y = (j / this->blob_->width()) % this->blob_->height(); + Dtype x = j % blob_->shape(3); + Dtype y = (j / blob_->shape(3)) % blob_->shape(2); Dtype expected_value = (1 - fabs(x / f - c)) * (1 - fabs(y / f - c)); const Dtype actual_value = data[i * inner_num + j]; EXPECT_NEAR(expected_value, actual_value, 0.01); @@ -531,11 +535,21 @@ TYPED_TEST_CASE(BilinearFillerTest, TestDtypes); TYPED_TEST(BilinearFillerTest, TestFillOdd) { const int n = 7; - this->test_params(n); + vector blob_shape; + blob_shape.push_back(1000); + blob_shape.push_back(2); + blob_shape.push_back(n); + blob_shape.push_back(n); + this->test_params(blob_shape); } TYPED_TEST(BilinearFillerTest, TestFillEven) { const int n = 6; - this->test_params(n); + vector blob_shape; + blob_shape.push_back(1000); + blob_shape.push_back(2); + blob_shape.push_back(n); + blob_shape.push_back(n); + this->test_params(blob_shape); } } // namespace caffe From dabbc91ecda54d8d4bb69e2457289d2e7216b136 Mon Sep 17 00:00:00 2001 From: Mikhail Antonenka Date: Sat, 17 Mar 2018 18:26:40 +0300 Subject: [PATCH 123/144] Added Swish layer (#6002) * added swish layer (cpu) * swish layer: added tests * swish layer: optimized backpropogation * swish layer: added cuda implementation * swish layer: added beta parameter * swish layer: incorporated sigmoid layer * swish layer: fix comment of last added parameter * swish layer: added REGISTER_LAYER_CLASS --- include/caffe/layers/swish_layer.hpp | 96 ++++++++++++++++++++++++++++ src/caffe/layers/swish_layer.cpp | 68 ++++++++++++++++++++ src/caffe/layers/swish_layer.cu | 54 ++++++++++++++++ src/caffe/proto/caffe.proto | 12 +++- src/caffe/test/test_neuron_layer.cpp | 79 +++++++++++++++++++++++ 5 files changed, 308 insertions(+), 1 deletion(-) create mode 100644 include/caffe/layers/swish_layer.hpp create mode 100644 src/caffe/layers/swish_layer.cpp create mode 
100644 src/caffe/layers/swish_layer.cu diff --git a/include/caffe/layers/swish_layer.hpp b/include/caffe/layers/swish_layer.hpp new file mode 100644 index 00000000000..d538ff6de82 --- /dev/null +++ b/include/caffe/layers/swish_layer.hpp @@ -0,0 +1,96 @@ +#ifndef CAFFE_SWISH_LAYER_HPP_ +#define CAFFE_SWISH_LAYER_HPP_ + +#include + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +#include "caffe/layers/neuron_layer.hpp" +#include "caffe/layers/sigmoid_layer.hpp" + +namespace caffe { + +/** + * @brief Swish non-linearity @f$ y = x \sigma (\beta x) @f$. + * A novel activation function that tends to work better than ReLU [1]. + * + * [1] Prajit Ramachandran, Barret Zoph, Quoc V. Le. "Searching for + * Activation Functions". arXiv preprint arXiv:1710.05941v2 (2017). + */ +template +class SwishLayer : public NeuronLayer { + public: + /** + * @param param provides SwishParameter swish_param, + * with SwishLayer options: + * - beta (\b optional, default 1). + * the value @f$ \beta @f$ in the @f$ y = x \sigma (\beta x) @f$. + */ + explicit SwishLayer(const LayerParameter& param) + : NeuronLayer(param), + sigmoid_layer_(new SigmoidLayer(param)), + sigmoid_input_(new Blob()), + sigmoid_output_(new Blob()) {} + virtual void LayerSetUp(const vector*>& bottom, + const vector*>& top); + virtual void Reshape(const vector*>& bottom, + const vector*>& top); + + virtual inline const char* type() const { return "Swish"; } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = x \sigma (\beta x) + * @f$. + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the sigmoid inputs. 
+ * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} + * = \frac{\partial E}{\partial y}(\beta y + + * \sigma (\beta x)(1 - \beta y)) + * @f$ if propagate_down[0] + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + + /// The internal SigmoidLayer + shared_ptr > sigmoid_layer_; + /// sigmoid_input_ stores the input of the SigmoidLayer. + shared_ptr > sigmoid_input_; + /// sigmoid_output_ stores the output of the SigmoidLayer. 
+ shared_ptr > sigmoid_output_; + /// bottom vector holder to call the underlying SigmoidLayer::Forward + vector*> sigmoid_bottom_vec_; + /// top vector holder to call the underlying SigmoidLayer::Forward + vector*> sigmoid_top_vec_; +}; + +} // namespace caffe + +#endif // CAFFE_SWISH_LAYER_HPP_ diff --git a/src/caffe/layers/swish_layer.cpp b/src/caffe/layers/swish_layer.cpp new file mode 100644 index 00000000000..28935679d00 --- /dev/null +++ b/src/caffe/layers/swish_layer.cpp @@ -0,0 +1,68 @@ +#include +#include + +#include "caffe/layers/swish_layer.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void SwishLayer::LayerSetUp(const vector*>& bottom, + const vector*>& top) { + NeuronLayer::LayerSetUp(bottom, top); + sigmoid_bottom_vec_.clear(); + sigmoid_bottom_vec_.push_back(sigmoid_input_.get()); + sigmoid_top_vec_.clear(); + sigmoid_top_vec_.push_back(sigmoid_output_.get()); + sigmoid_layer_->SetUp(sigmoid_bottom_vec_, sigmoid_top_vec_); +} + +template +void SwishLayer::Reshape(const vector*>& bottom, + const vector*>& top) { + NeuronLayer::Reshape(bottom, top); + sigmoid_input_->ReshapeLike(*bottom[0]); + sigmoid_layer_->Reshape(sigmoid_bottom_vec_, sigmoid_top_vec_); +} + +template +void SwishLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* sigmoid_input_data = sigmoid_input_->mutable_cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + const int count = bottom[0]->count(); + Dtype beta = this->layer_param_.swish_param().beta(); + caffe_copy(count, bottom_data, sigmoid_input_data); + caffe_scal(count, beta, sigmoid_input_data); + sigmoid_layer_->Forward(sigmoid_bottom_vec_, sigmoid_top_vec_); + caffe_mul(count, bottom_data, sigmoid_output_->cpu_data(), top_data); +} + +template +void SwishLayer::Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* top_data = 
top[0]->cpu_data(); + const Dtype* top_diff = top[0]->cpu_diff(); + const Dtype* sigmoid_output_data = sigmoid_output_->cpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + const int count = bottom[0]->count(); + Dtype beta = this->layer_param_.swish_param().beta(); + for (int i = 0; i < count; ++i) { + const Dtype swish_x = top_data[i]; + bottom_diff[i] = top_diff[i] * (beta * swish_x + sigmoid_output_data[i] + * (1. - beta * swish_x)); + } + } +} + +#ifdef CPU_ONLY +STUB_GPU(SwishLayer); +#endif + +INSTANTIATE_CLASS(SwishLayer); +REGISTER_LAYER_CLASS(Swish); + +} // namespace caffe diff --git a/src/caffe/layers/swish_layer.cu b/src/caffe/layers/swish_layer.cu new file mode 100644 index 00000000000..c4fef53bf3a --- /dev/null +++ b/src/caffe/layers/swish_layer.cu @@ -0,0 +1,54 @@ +#include +#include + +#include "caffe/layers/swish_layer.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +template +void SwishLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* sigmoid_input_data = sigmoid_input_->mutable_gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + Dtype beta = this->layer_param_.swish_param().beta(); + caffe_copy(count, bottom_data, sigmoid_input_data); + caffe_gpu_scal(count, beta, sigmoid_input_data); + sigmoid_layer_->Forward(sigmoid_bottom_vec_, sigmoid_top_vec_); + caffe_gpu_mul(count, bottom_data, sigmoid_output_->gpu_data(), top_data); +} + +template +__global__ void SwishBackward(const int n, const Dtype* in_diff, + const Dtype* out_data, const Dtype* sigmoid_output_data, Dtype* out_diff, + const Dtype beta) { + CUDA_KERNEL_LOOP(index, n) { + const Dtype swish_x = out_data[index]; + out_diff[index] = in_diff[index] * (beta * swish_x + + sigmoid_output_data[index] * (1 - beta * swish_x)); + } +} + +template +void SwishLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, + 
const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* top_data = top[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + Dtype beta = this->layer_param_.swish_param().beta(); + // NOLINT_NEXT_LINE(whitespace/operators) + SwishBackward<<>>( + count, top_diff, top_data, sigmoid_output_data, bottom_diff, beta); + CUDA_POST_KERNEL_CHECK; + } +} + +INSTANTIATE_LAYER_GPU_FUNCS(SwishLayer); + +} // namespace caffe diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 22764abc33f..b9bb3f4dffe 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -322,7 +322,7 @@ message ParamSpec { // NOTE // Update the next available ID when you add a new LayerParameter field. // -// LayerParameter next available layer-specific ID: 147 (last added: recurrent_param) +// LayerParameter next available layer-specific ID: 148 (last added: swish_param) message LayerParameter { optional string name = 1; // the layer name optional string type = 2; // the layer type @@ -415,6 +415,7 @@ message LayerParameter { optional SoftmaxParameter softmax_param = 125; optional SPPParameter spp_param = 132; optional SliceParameter slice_param = 126; + optional SwishParameter swish_param = 147; optional TanHParameter tanh_param = 127; optional ThresholdParameter threshold_param = 128; optional TileParameter tile_param = 138; @@ -1156,6 +1157,15 @@ message SoftmaxParameter { optional int32 axis = 2 [default = 1]; } +// Message that stores parameters used by SwishLayer +message SwishParameter { + // Beta parameter for the Swish activation function + // Described in: + // Prajit Ramachandran, Barret Zoph, Quoc V. Le. (2017). Searching for + // Activation Functions. 
https://arxiv.org/abs/1710.05941v2 + optional float beta = 1 [default = 1]; +} + message TanHParameter { enum Engine { DEFAULT = 0; diff --git a/src/caffe/test/test_neuron_layer.cpp b/src/caffe/test/test_neuron_layer.cpp index 180871a29ee..83d80fcd895 100644 --- a/src/caffe/test/test_neuron_layer.cpp +++ b/src/caffe/test/test_neuron_layer.cpp @@ -19,6 +19,7 @@ #include "caffe/layers/prelu_layer.hpp" #include "caffe/layers/relu_layer.hpp" #include "caffe/layers/sigmoid_layer.hpp" +#include "caffe/layers/swish_layer.hpp" #include "caffe/layers/tanh_layer.hpp" #include "caffe/layers/threshold_layer.hpp" @@ -344,6 +345,84 @@ TYPED_TEST(NeuronLayerTest, TestSigmoidGradient) { this->blob_top_vec_); } +TYPED_TEST(NeuronLayerTest, TestSwish) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + SwishLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + // Now, check values + const Dtype* bottom_data = this->blob_bottom_->cpu_data(); + const Dtype* top_data = this->blob_top_->cpu_data(); + for (int i = 0; i < this->blob_bottom_->count(); ++i) { + EXPECT_FLOAT_EQ(top_data[i], bottom_data[i] / (1. + exp(-bottom_data[i]))); + } +} + +TYPED_TEST(NeuronLayerTest, TestSwishWithBeta) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + CHECK(google::protobuf::TextFormat::ParseFromString( + "swish_param { beta: 1.5 }", &layer_param)); + SwishLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + // Now, check values + const Dtype* bottom_data = this->blob_bottom_->cpu_data(); + const Dtype* top_data = this->blob_top_->cpu_data(); + for (int i = 0; i < this->blob_bottom_->count(); ++i) { + EXPECT_FLOAT_EQ(top_data[i], bottom_data[i] / (1. 
+ exp(-1.5 * + bottom_data[i]))); + } +} + +TYPED_TEST(NeuronLayerTest, TestSwishAsLinear) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + CHECK(google::protobuf::TextFormat::ParseFromString( + "swish_param { beta: 0.0 }", &layer_param)); + SwishLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + // Now, check values + const Dtype* bottom_data = this->blob_bottom_->cpu_data(); + const Dtype* top_data = this->blob_top_->cpu_data(); + for (int i = 0; i < this->blob_bottom_->count(); ++i) { + EXPECT_FLOAT_EQ(top_data[i], bottom_data[i] / 2.0); + } +} + +TYPED_TEST(NeuronLayerTest, TestSwishGradient) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + SwishLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3, 1701, 0., 0.01); + checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, + this->blob_top_vec_); +} + +TYPED_TEST(NeuronLayerTest, TestSwishWithBetaGradient) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + CHECK(google::protobuf::TextFormat::ParseFromString( + "swish_param { beta: 1.5 }", &layer_param)); + SwishLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3, 1701, 0., 0.01); + checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, + this->blob_top_vec_); +} + +TYPED_TEST(NeuronLayerTest, TestSwishAsLinearGradient) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + CHECK(google::protobuf::TextFormat::ParseFromString( + "swish_param { beta: 0.0 }", &layer_param)); + SwishLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3, 1701, 0., 0.01); + checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, + this->blob_top_vec_); +} + TYPED_TEST(NeuronLayerTest, TestTanH) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; From d09f157475a51b74ac5ba29764e7b9bde8991097 Mon Sep 17 00:00:00 2001 From: 
twmht Date: Fri, 23 Mar 2018 22:21:06 +0800 Subject: [PATCH 124/144] check embed index in debug mode --- src/caffe/layers/embed_layer.cu | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/caffe/layers/embed_layer.cu b/src/caffe/layers/embed_layer.cu index 6324a3a8937..3cf39fd9983 100644 --- a/src/caffe/layers/embed_layer.cu +++ b/src/caffe/layers/embed_layer.cu @@ -15,6 +15,11 @@ __global__ void EmbedForward(const int nthreads, const Dtype* bottom_data, const int n = top_index / N; const int d = top_index % N; const int index = static_cast(bottom_data[n]); + #ifdef DEBUG + assert(index >= 0); + assert(index < K); + assert(static_cast(index) == bottom_data[n]); + #endif const int weight_index = index * N + d; top_data[top_index] = weight[weight_index]; } From 08b3308e163e2af67a6a391befc806a0b19c2e17 Mon Sep 17 00:00:00 2001 From: Seyyed Hossein Hasanpour Date: Mon, 9 Apr 2018 10:06:49 +0430 Subject: [PATCH 125/144] Minor correction concerning compilation compatibility with CUDA 9.0 Since CUDA 9.0 doesn't support sm_20 and sm_21 anymore. This PR allows Caffe to compile with CUDA 9.0 and newer versions successfully. 
This addresses the discussion concerning [#6237](https://github.com/BVLC/caffe/pull/6237) in [master (#6237)](https://github.com/BVLC/caffe/commit/cb150eca6d593ddb85e53acd05b8dcf709ea8337#diff-2004a3d3e6b4ed2e2812bb0b4b998f18) --- cmake/Cuda.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake index 9325674a6a0..9ced9f9d591 100644 --- a/cmake/Cuda.cmake +++ b/cmake/Cuda.cmake @@ -109,8 +109,8 @@ function(caffe_select_nvcc_arch_flags out_variable) set(__nvcc_flags "") set(__nvcc_archs_readable "") - string(COMPARE GREATER_EQUAL "${CUDA_VERSION}" "9.1" iscudanewerthan91) - if(iscudanewerthan91) + string(COMPARE GREATER_EQUAL "${CUDA_VERSION}" "9.0" iscudanewerthan90) + if(iscudanewerthan90) string(REPLACE "21(20)" "" __cuda_arch_bin "${__cuda_arch_bin}") string(REPLACE "20" "" __cuda_arch_bin "${__cuda_arch_bin}") endif() From c46fc00fcb5434b2ecaeac792ed243f4144c81d8 Mon Sep 17 00:00:00 2001 From: Viktor Richter Date: Mon, 9 Apr 2018 16:51:04 +0200 Subject: [PATCH 126/144] Fix cmake < v3.7 compatibility in Cuda.cmake (#6338) Fix for compatibility with CMake <3.7 (related conversation under 37e4289) * Fix cmake < v3.7 compatibility in Cuda.cmake * Fix version test variable naming in Cuda.cmake --- cmake/Cuda.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake index 9ced9f9d591..e03feabffcb 100644 --- a/cmake/Cuda.cmake +++ b/cmake/Cuda.cmake @@ -109,8 +109,8 @@ function(caffe_select_nvcc_arch_flags out_variable) set(__nvcc_flags "") set(__nvcc_archs_readable "") - string(COMPARE GREATER_EQUAL "${CUDA_VERSION}" "9.0" iscudanewerthan90) - if(iscudanewerthan90) + string(COMPARE LESS "${CUDA_VERSION}" "9.0" iscudaolderthan90) + if(NOT iscudaolderthan90) string(REPLACE "21(20)" "" __cuda_arch_bin "${__cuda_arch_bin}") string(REPLACE "20" "" __cuda_arch_bin "${__cuda_arch_bin}") endif() From 106bfcfde6ae5627bdffecde83023212cab23c18 Mon Sep 17 00:00:00 2001 
From: =?UTF-8?q?Przemys=C5=82aw=20Dolata?= Date: Wed, 11 Apr 2018 00:20:12 +0200 Subject: [PATCH 127/144] Revised guidelines for GitHub issues (#6327) revised guidelines for GitHub issues and caffe-users posts --- .github/ISSUE_TEMPLATE.md | 37 ++++++++++++++++++-------- CONTRIBUTING.md | 56 ++++++++++++++++++++++++++++++++++----- 2 files changed, 75 insertions(+), 18 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index d78a3dc3455..c981f62f7a1 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -1,19 +1,34 @@ -Please use the [caffe-users list](https://groups.google.com/forum/#!forum/caffe-users) for usage, installation, or modeling questions, or other requests for help. -_Do not post such requests to Issues._ Doing so interferes with the development of Caffe. +## Important - read before submitting -Please read the [guidelines for contributing](https://github.com/BVLC/caffe/blob/master/CONTRIBUTING.md) before submitting this issue. +*Please read the [guidelines for contributing](https://github.com/BVLC/caffe/blob/master/CONTRIBUTING.md) before submitting this issue!* + +*Please do not post installation, build, usage, or modeling questions, or other requests for help to Issues.* +Use the [caffe-users list](https://groups.google.com/forum/#!forum/caffe-users) instead. +This helps developers maintain a clear, uncluttered, and efficient view of the state of Caffe. ### Issue summary ### Steps to reproduce -If you are having difficulty building Caffe or training a model, please ask the caffe-users mailing list. If you are reporting a build error that seems to be due to a bug in Caffe, please attach your build configuration (either Makefile.config or CMakeCache.txt) and the output of the make (or cmake) command. 
-### Your system configuration -Operating system: -Compiler: -CUDA version (if applicable): -CUDNN version (if applicable): -BLAS: -Python or MATLAB version (for pycaffe and matcaffe respectively): +### Tried solutions + + +### System configuration + +* Operating system: +* Compiler: +* CUDA version (if applicable): +* CUDNN version (if applicable): +* BLAS: +* Python version (if using pycaffe): +* MATLAB version (if using matcaffe): + +### Issue checklist + +- [ ] read the guidelines and removed the first paragraph +- [ ] written a short summary and detailed steps to reproduce +- [ ] explained how solutions to related problems failed (tick if found none) +- [ ] filled system configuration +- [ ] attached relevant logs/config files (tick if not applicable) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8cd5e56ca49..45f7e186e4f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,21 +1,63 @@ # Contributing +Below you will find a collection of guidelines for submitting issues as well as contributing code to the Caffe repository. +Please read those before starting an issue or a pull request. + ## Issues Specific Caffe design and development issues, bugs, and feature requests are maintained by GitHub Issues. -_Please do not post usage, installation, or modeling questions, or other requests for help to Issues._ -Use the [caffe-users list](https://groups.google.com/forum/#!forum/caffe-users) instead. This helps developers maintain a clear, uncluttered, and efficient view of the state of Caffe. - -When reporting a bug, it's most helpful to provide the following information, where applicable: +*Please do not post installation, build, usage, or modeling questions, or other requests for help to Issues.* +Use the [caffe-users list](https://groups.google.com/forum/#!forum/caffe-users) instead. +This helps developers maintain a clear, uncluttered, and efficient view of the state of Caffe. 
+See the chapter [caffe-users](#caffe-users) below for guidance on posting to the users list. -* What steps reproduce the bug? -* Can you reproduce the bug using the latest [master](https://github.com/BVLC/caffe/tree/master), compiled with the `DEBUG` make option? -* What hardware and operating system/distribution are you running? +When reporting an issue, it's most helpful to provide the following information, where applicable: +* How does the problem look like and what steps reproduce it? +* Can you reproduce it using the latest [master](https://github.com/BVLC/caffe/tree/master), compiled with the `DEBUG` make option? +* What hardware and software are you running? In particular: + * GPU make and model, if relevant, + * operating system/distribution, + * compiler; please also post which version (for example, with GCC run `gcc --version` to check), + * CUDA version, if applicable (run `nvcc --version` to check), + * cuDNN version, if applicable (version number is stored in `cudnn.h`, look for lines containing `CUDNN_MAJOR`, `CUDNN_MINOR` and `CUDNN_PATCHLEVEL`), + * BLAS library, + * Python version, if relevant, + * MATLAB version, if relevant. +* **What have you already tried** to solve the problem? How did it fail? Are there any other issues related to yours? +* If this is not a build-related issue, does your installation pass `make runtest`? * If the bug is a crash, provide the backtrace (usually printed by Caffe; always obtainable with `gdb`). +* If you are reporting a build error that seems to be due to a bug in Caffe, please attach your build configuration (either Makefile.config or CMakeCache.txt) and the output of the make (or cmake) command. + +If only a small portion of the code/log is relevant to your issue, you may paste it directly into the post, preferably using Markdown syntax for code block: triple backtick ( \`\`\` ) to open/close a block. 
+In other cases (multiple files, or long files), please **attach** them to the post - this greatly improves readability. + +If the problem arises during a complex operation (e.g. large script using pycaffe, long network prototxt), please reduce the example to the minimal size that still causes the error. +Also, minimize influence of external modules, data etc. - this way it will be easier for others to understand and reproduce your issue, and eventually help you. +Sometimes you will find the root cause yourself in the process. Try to give your issue a title that is succinct and specific. The devs will rename issues as needed to keep track of them. +## Caffe-users + +Before you post to the [caffe-users list](https://groups.google.com/forum/#!forum/caffe-users), make sure you look for existing solutions. +The Caffe community has encountered and found solutions to countless problems - benefit from the collective experience. +Recommended places to look: +* the [users list](https://groups.google.com/forum/#!forum/caffe-users) itself, +* [`caffe`](https://stackoverflow.com/questions/tagged/caffe) tag on StackOverflow, +* [GitHub issues](https://github.com/BVLC/caffe/issues) tracker (some problems have been answered there), +* the public [wiki](https://github.com/BVLC/caffe/wiki), +* the official [documentation](http://caffe.berkeleyvision.org/). + +Found a post/issue with your exact problem, but with no answer? +Don't just leave a "me too" message - provide the details of your case. +Problems with more available information are easier to solve and attract good attention. + +When posting to the list, make sure you provide as much relevant information as possible - recommendations for an issue report (see above) are a good starting point. 
+*Please make it very clear which version of Caffe you are using, especially if it is a fork not maintained by BVLC.* + +Formatting recommendations hold: paste short logs/code fragments into the post (use fixed-width text for them), **attach** long logs or multiple files. + ## Pull Requests Caffe welcomes all contributions. From 356a6cca958a9108649a3cd824d316b679fcefb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw=20Dolata?= Date: Wed, 11 Apr 2018 15:32:25 +0200 Subject: [PATCH 128/144] tweaked Gaussian filler tests for less false fails --- src/caffe/test/test_filler.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/caffe/test/test_filler.cpp b/src/caffe/test/test_filler.cpp index 1e6b5c21d1d..34f7007d9c7 100644 --- a/src/caffe/test/test_filler.cpp +++ b/src/caffe/test/test_filler.cpp @@ -257,16 +257,16 @@ TYPED_TEST(GaussianFillerTest, TestFill) { } TYPED_TEST(GaussianFillerTest, TestFill1D) { - vector blob_shape(1, 25); - const TypeParam tolerance = TypeParam(5); + vector blob_shape(1, 125); + const TypeParam tolerance = TypeParam(3); this->test_params(blob_shape, tolerance); } TYPED_TEST(GaussianFillerTest, TestFill2D) { vector blob_shape; blob_shape.push_back(8); - blob_shape.push_back(3); - const TypeParam tolerance = TypeParam(5); + blob_shape.push_back(15); + const TypeParam tolerance = TypeParam(3); this->test_params(blob_shape, tolerance); } From 3318a466309a82d3d63f3b33f3663824da3f1ceb Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Mon, 16 Apr 2018 02:26:08 -0700 Subject: [PATCH 129/144] Cherry-picked USE_HDF5 from Android branch --- CMakeLists.txt | 3 +++ Makefile | 12 +++++++++++- Makefile.config.example | 2 ++ cmake/ConfigGen.cmake | 12 ++++++++++++ cmake/Dependencies.cmake | 8 ++++++++ cmake/Summary.cmake | 2 ++ include/caffe/util/hdf5.hpp | 2 ++ src/caffe/layers/hdf5_data_layer.cpp | 2 ++ src/caffe/layers/hdf5_data_layer.cu | 2 ++ src/caffe/layers/hdf5_output_layer.cpp | 2 ++ 
src/caffe/layers/hdf5_output_layer.cu | 2 ++ src/caffe/net.cpp | 13 +++++++++++++ src/caffe/solvers/sgd_solver.cpp | 12 ++++++++++++ src/caffe/test/test_hdf5_output_layer.cpp | 2 ++ src/caffe/test/test_hdf5data_layer.cpp | 2 ++ src/caffe/util/hdf5.cpp | 2 ++ 16 files changed, 79 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 08f56a33a59..27d172f900b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,6 +42,9 @@ caffe_option(USE_LMDB "Build with lmdb" ON) caffe_option(ALLOW_LMDB_NOLOCK "Allow MDB_NOLOCK when reading LMDB files (only if necessary)" OFF) caffe_option(USE_OPENMP "Link with OpenMP (when your BLAS wants OpenMP and you get linker errors)" OFF) +# This code is taken from https://github.com/sh1r0/caffe-android-lib +caffe_option(USE_HDF5 "Build with hdf5" ON) + # ---[ Dependencies include(cmake/Dependencies.cmake) diff --git a/Makefile b/Makefile index c85c695acff..29ea8a69a61 100644 --- a/Makefile +++ b/Makefile @@ -178,11 +178,13 @@ ifneq ($(CPU_ONLY), 1) LIBRARIES := cudart cublas curand endif -LIBRARIES += glog gflags protobuf boost_system boost_filesystem m hdf5_hl hdf5 +LIBRARIES += glog gflags protobuf boost_system boost_filesystem m # handle IO dependencies USE_LEVELDB ?= 1 USE_LMDB ?= 1 +# This code is taken from https://github.com/sh1r0/caffe-android-lib +USE_HDF5 ?= 1 USE_OPENCV ?= 1 ifeq ($(USE_LEVELDB), 1) @@ -191,6 +193,10 @@ endif ifeq ($(USE_LMDB), 1) LIBRARIES += lmdb endif +# This code is taken from https://github.com/sh1r0/caffe-android-lib +ifeq ($(USE_HDF5), 1) + LIBRARIES += hdf5_hl hdf5 +endif ifeq ($(USE_OPENCV), 1) LIBRARIES += opencv_core opencv_highgui opencv_imgproc @@ -347,6 +353,10 @@ ifeq ($(ALLOW_LMDB_NOLOCK), 1) COMMON_FLAGS += -DALLOW_LMDB_NOLOCK endif endif +# This code is taken from https://github.com/sh1r0/caffe-android-lib +ifeq ($(USE_HDF5), 1) + COMMON_FLAGS += -DUSE_HDF5 +endif # CPU-only configuration ifeq ($(CPU_ONLY), 1) diff --git a/Makefile.config.example 
b/Makefile.config.example index 79905935f15..24ca632783a 100644 --- a/Makefile.config.example +++ b/Makefile.config.example @@ -11,6 +11,8 @@ # USE_OPENCV := 0 # USE_LEVELDB := 0 # USE_LMDB := 0 +# This code is taken from https://github.com/sh1r0/caffe-android-lib +# USE_HDF5 := 0 # uncomment to allow MDB_NOLOCK when reading LMDB files (only if necessary) # You should not set this flag if you will be reading LMDBs with any diff --git a/cmake/ConfigGen.cmake b/cmake/ConfigGen.cmake index 09bb09b4ff2..69889c243b2 100644 --- a/cmake/ConfigGen.cmake +++ b/cmake/ConfigGen.cmake @@ -24,6 +24,18 @@ function(caffe_generate_export_configs) set(HAVE_CUDA FALSE) endif() + set(HDF5_IMPORTED OFF) + foreach(_lib ${HDF5_LIBRARIES} ${HDF5_HL_LIBRARIES}) + if(TARGET ${_lib}) + set(HDF5_IMPORTED ON) + endif() + endforeach() + + # This code is taken from https://github.com/sh1r0/caffe-android-lib + if(USE_HDF5) + list(APPEND Caffe_DEFINITIONS -DUSE_HDF5) + endif() + if(NOT HAVE_CUDNN) set(HAVE_CUDNN FALSE) endif() diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index c48255c89f2..ca2e3ad9e5e 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -47,6 +47,14 @@ find_package(HDF5 COMPONENTS HL REQUIRED) list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${HDF5_INCLUDE_DIRS}) list(APPEND Caffe_LINKER_LIBS PUBLIC ${HDF5_LIBRARIES} ${HDF5_HL_LIBRARIES}) +# This code is taken from https://github.com/sh1r0/caffe-android-lib +if(USE_HDF5) + find_package(HDF5 COMPONENTS HL REQUIRED) + include_directories(SYSTEM ${HDF5_INCLUDE_DIRS} ${HDF5_HL_INCLUDE_DIR}) + list(APPEND Caffe_LINKER_LIBS ${HDF5_LIBRARIES} ${HDF5_HL_LIBRARIES}) + add_definitions(-DUSE_HDF5) +endif() + # ---[ LMDB if(USE_LMDB) find_package(LMDB REQUIRED) diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake index ed8c25268db..40b8c2f2966 100644 --- a/cmake/Summary.cmake +++ b/cmake/Summary.cmake @@ -119,6 +119,8 @@ function(caffe_print_configuration_summary) caffe_status(" USE_LMDB : ${USE_LMDB}") 
caffe_status(" USE_NCCL : ${USE_NCCL}") caffe_status(" ALLOW_LMDB_NOLOCK : ${ALLOW_LMDB_NOLOCK}") + # This code is taken from https://github.com/sh1r0/caffe-android-lib + caffe_status(" USE_HDF5 : ${USE_HDF5}") caffe_status("") caffe_status("Dependencies:") caffe_status(" BLAS : " APPLE THEN "Yes (vecLib)" ELSE "Yes (${BLAS})") diff --git a/include/caffe/util/hdf5.hpp b/include/caffe/util/hdf5.hpp index 71549c1cc02..dbd8bb6c5e4 100644 --- a/include/caffe/util/hdf5.hpp +++ b/include/caffe/util/hdf5.hpp @@ -1,3 +1,4 @@ +#ifdef USE_HDF5 #ifndef CAFFE_UTIL_HDF5_H_ #define CAFFE_UTIL_HDF5_H_ @@ -37,3 +38,4 @@ string hdf5_get_name_by_idx(hid_t loc_id, int idx); } // namespace caffe #endif // CAFFE_UTIL_HDF5_H_ +#endif // USE_HDF5 diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp index 00716a92b15..7668854cc1f 100644 --- a/src/caffe/layers/hdf5_data_layer.cpp +++ b/src/caffe/layers/hdf5_data_layer.cpp @@ -1,3 +1,4 @@ +#ifdef USE_HDF5 /* TODO: - load file in a separate thread ("prefetch") @@ -184,3 +185,4 @@ INSTANTIATE_CLASS(HDF5DataLayer); REGISTER_LAYER_CLASS(HDF5Data); } // namespace caffe +#endif // USE_HDF5 diff --git a/src/caffe/layers/hdf5_data_layer.cu b/src/caffe/layers/hdf5_data_layer.cu index 33eebd41dfc..70cd9f32f85 100644 --- a/src/caffe/layers/hdf5_data_layer.cu +++ b/src/caffe/layers/hdf5_data_layer.cu @@ -1,3 +1,4 @@ +#ifdef USE_HDF5 /* TODO: - only load parts of the file, in accordance with a prototxt param "max_mem" @@ -34,3 +35,4 @@ void HDF5DataLayer::Forward_gpu(const vector*>& bottom, INSTANTIATE_LAYER_GPU_FUNCS(HDF5DataLayer); } // namespace caffe +#endif // USE_HDF5 diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp index f8f1edcd18e..28c453a20fd 100644 --- a/src/caffe/layers/hdf5_output_layer.cpp +++ b/src/caffe/layers/hdf5_output_layer.cpp @@ -1,3 +1,4 @@ +#ifdef USE_HDF5 #include #include "hdf5.h" @@ -72,3 +73,4 @@ INSTANTIATE_CLASS(HDF5OutputLayer); 
REGISTER_LAYER_CLASS(HDF5Output); } // namespace caffe +#endif // USE_HDF5 diff --git a/src/caffe/layers/hdf5_output_layer.cu b/src/caffe/layers/hdf5_output_layer.cu index c1685cd34a7..891aea03862 100644 --- a/src/caffe/layers/hdf5_output_layer.cu +++ b/src/caffe/layers/hdf5_output_layer.cu @@ -1,3 +1,4 @@ +#ifdef USE_HDF5 #include #include "hdf5.h" @@ -37,3 +38,4 @@ void HDF5OutputLayer::Backward_gpu(const vector*>& top, INSTANTIATE_LAYER_GPU_FUNCS(HDF5OutputLayer); } // namespace caffe +#endif // USE_HDF5 diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 353c2f95b9e..73adcc6dba1 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -5,7 +5,9 @@ #include #include +#ifdef USE_HDF5 #include "hdf5.h" +#endif // USE_HDF5 #include "caffe/common.hpp" #include "caffe/layer.hpp" @@ -786,6 +788,7 @@ void Net::CopyTrainedLayersFromBinaryProto( template void Net::CopyTrainedLayersFromHDF5(const string trained_filename) { +#ifdef USE_HDF5 hid_t file_hid = H5Fopen(trained_filename.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT); CHECK_GE(file_hid, 0) << "Couldn't open " << trained_filename; @@ -832,6 +835,10 @@ void Net::CopyTrainedLayersFromHDF5(const string trained_filename) { } H5Gclose(data_hid); H5Fclose(file_hid); +#else + LOG(FATAL) << "CopyTrainedLayersFromHDF5 requires hdf5;" + << " compile with USE_HDF5."; +#endif // USE_HDF5 } template @@ -848,6 +855,8 @@ void Net::ToProto(NetParameter* param, bool write_diff) const { template void Net::ToHDF5(const string& filename, bool write_diff) const { +// This code is taken from https://github.com/sh1r0/caffe-android-lib +#ifdef USE_HDF5 hid_t file_hid = H5Fcreate(filename.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT); CHECK_GE(file_hid, 0) @@ -901,6 +910,10 @@ void Net::ToHDF5(const string& filename, bool write_diff) const { H5Gclose(diff_hid); } H5Fclose(file_hid); +// This code is taken from https://github.com/sh1r0/caffe-android-lib +#else + LOG(FATAL) << "ToHDF5 requires hdf5; compile with USE_HDF5."; +#endif // 
USE_HDF5 } template diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index 1d52beb0636..b11a8f41f12 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -285,6 +285,8 @@ void SGDSolver::SnapshotSolverStateToBinaryProto( template void SGDSolver::SnapshotSolverStateToHDF5( const string& model_filename) { +// This code is taken from https://github.com/sh1r0/caffe-android-lib +#ifdef USE_HDF5 string snapshot_filename = Solver::SnapshotFilename(".solverstate.h5"); LOG(INFO) << "Snapshotting solver state to HDF5 file " << snapshot_filename; @@ -306,6 +308,11 @@ void SGDSolver::SnapshotSolverStateToHDF5( } H5Gclose(history_hid); H5Fclose(file_hid); +// This code is taken from https://github.com/sh1r0/caffe-android-lib +#else + LOG(FATAL) << "SnapshotSolverStateToHDF5 requires hdf5;" + << " compile with USE_HDF5."; +#endif // USE_HDF5 } template @@ -330,6 +337,7 @@ void SGDSolver::RestoreSolverStateFromBinaryProto( template void SGDSolver::RestoreSolverStateFromHDF5(const string& state_file) { +#ifdef USE_HDF5 hid_t file_hid = H5Fopen(state_file.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT); CHECK_GE(file_hid, 0) << "Couldn't open solver state file " << state_file; this->iter_ = hdf5_load_int(file_hid, "iter"); @@ -351,6 +359,10 @@ void SGDSolver::RestoreSolverStateFromHDF5(const string& state_file) { } H5Gclose(history_hid); H5Fclose(file_hid); +#else + LOG(FATAL) << "RestoreSolverStateFromHDF5 requires hdf5;" + << " compile with USE_HDF5."; +#endif // USE_HDF5 } INSTANTIATE_CLASS(SGDSolver); diff --git a/src/caffe/test/test_hdf5_output_layer.cpp b/src/caffe/test/test_hdf5_output_layer.cpp index f94dd57e7de..11d52310cad 100644 --- a/src/caffe/test/test_hdf5_output_layer.cpp +++ b/src/caffe/test/test_hdf5_output_layer.cpp @@ -1,3 +1,4 @@ +#ifdef USE_HDF5 #include #include @@ -120,3 +121,4 @@ TYPED_TEST(HDF5OutputLayerTest, TestForward) { } } // namespace caffe +#endif // USE_HDF5 diff --git 
a/src/caffe/test/test_hdf5data_layer.cpp b/src/caffe/test/test_hdf5data_layer.cpp index 3977c4866c7..0e5c398f966 100644 --- a/src/caffe/test/test_hdf5data_layer.cpp +++ b/src/caffe/test/test_hdf5data_layer.cpp @@ -1,3 +1,4 @@ +#ifdef USE_HDF5 #include #include @@ -163,3 +164,4 @@ TYPED_TEST(HDF5DataLayerTest, TestSkip) { } } // namespace caffe +#endif // USE_HDF5 diff --git a/src/caffe/util/hdf5.cpp b/src/caffe/util/hdf5.cpp index ed73742937f..cefd853dff4 100644 --- a/src/caffe/util/hdf5.cpp +++ b/src/caffe/util/hdf5.cpp @@ -1,3 +1,4 @@ +#ifdef USE_HDF5 #include "caffe/util/hdf5.hpp" #include @@ -207,3 +208,4 @@ string hdf5_get_name_by_idx(hid_t loc_id, int idx) { } } // namespace caffe +#endif // USE_HDF5 From 0536720f41ff6fd43b98a0eb9eb7cd8f0ece5d1e Mon Sep 17 00:00:00 2001 From: Kuang Fangjun Date: Tue, 8 May 2018 10:36:44 +0800 Subject: [PATCH 130/144] fix issue #6387. --- include/caffe/syncedmem.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp index 317ce29a257..8d650a34a8e 100644 --- a/include/caffe/syncedmem.hpp +++ b/include/caffe/syncedmem.hpp @@ -66,8 +66,8 @@ class SyncedMemory { void* mutable_cpu_data(); void* mutable_gpu_data(); enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED }; - SyncedHead head() { return head_; } - size_t size() { return size_; } + SyncedHead head() const { return head_; } + size_t size() const { return size_; } #ifndef CPU_ONLY void async_gpu_push(const cudaStream_t& stream); From 6d912a32bedd4daf6e0f0c3a2622cf4d382ed759 Mon Sep 17 00:00:00 2001 From: Kuang Fangjun Date: Tue, 8 May 2018 10:46:54 +0800 Subject: [PATCH 131/144] fix issue #6389 --- src/caffe/test/test_syncedmem.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/caffe/test/test_syncedmem.cpp b/src/caffe/test/test_syncedmem.cpp index 16dfb58230f..2ca9ca2f998 100644 --- a/src/caffe/test/test_syncedmem.cpp +++ b/src/caffe/test/test_syncedmem.cpp 
@@ -80,7 +80,7 @@ TEST_F(SyncedMemoryTest, TestGPURead) { char* recovered_value = new char[10]; caffe_gpu_memcpy(10, gpu_data, recovered_value); for (int i = 0; i < mem.size(); ++i) { - EXPECT_EQ((static_cast(recovered_value))[i], 1); + EXPECT_EQ(recovered_value[i], 1); } // do another round cpu_data = mem.mutable_cpu_data(); @@ -94,7 +94,7 @@ TEST_F(SyncedMemoryTest, TestGPURead) { // check if values are the same caffe_gpu_memcpy(10, gpu_data, recovered_value); for (int i = 0; i < mem.size(); ++i) { - EXPECT_EQ((static_cast(recovered_value))[i], 2); + EXPECT_EQ(recovered_value[i], 2); } delete[] recovered_value; } From cc1c8fb465fbf48e3048659ca5aa407561df7687 Mon Sep 17 00:00:00 2001 From: Mitar Date: Sat, 28 Oct 2017 02:35:17 -0700 Subject: [PATCH 132/144] [pycaffe] expose solver update to do manual solving a sketch of `solver.step()` done out manually: 1. `solver.net.forward()` 2. `solver.net.backward()` 3. `solver.net.apply_update()` 4. `solver.net.clear_param_diffs()` --- include/caffe/sgd_solvers.hpp | 3 ++- include/caffe/solver.hpp | 3 ++- python/caffe/_caffe.cpp | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/include/caffe/sgd_solvers.hpp b/include/caffe/sgd_solvers.hpp index 1fc52d87137..f1819bb62dc 100644 --- a/include/caffe/sgd_solvers.hpp +++ b/include/caffe/sgd_solvers.hpp @@ -23,10 +23,11 @@ class SGDSolver : public Solver { const vector > >& history() { return history_; } + virtual void ApplyUpdate(); + protected: void PreSolve(); Dtype GetLearningRate(); - virtual void ApplyUpdate(); virtual void Normalize(int param_id); virtual void Regularize(int param_id); virtual void ComputeUpdateValue(int param_id, Dtype rate); diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp index a28d8cb897e..75560f9fd08 100644 --- a/include/caffe/solver.hpp +++ b/include/caffe/solver.hpp @@ -94,9 +94,10 @@ class Solver { */ virtual inline const char* type() const { return ""; } - protected: // Make and apply the update value for the 
current iteration. virtual void ApplyUpdate() = 0; + + protected: string SnapshotFilename(const string extension); string SnapshotToBinaryProto(); string SnapshotToHDF5(); diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp index 72659a4f44e..eed16c29c84 100644 --- a/python/caffe/_caffe.cpp +++ b/python/caffe/_caffe.cpp @@ -507,6 +507,7 @@ BOOST_PYTHON_MODULE(_caffe) { .def("restore", &Solver::Restore) .def("snapshot", &Solver::Snapshot) .def("share_weights", &share_weights) + .def("apply_update", &Solver::ApplyUpdate) .add_property("param", bp::make_function(&Solver::param, bp::return_value_policy())); BP_REGISTER_SHARED_PTR_TO_PYTHON(Solver); From c74913d4b5b6c120e40bfeaab43fde45acc22c24 Mon Sep 17 00:00:00 2001 From: Mitar Date: Sat, 28 Oct 2017 02:52:35 -0700 Subject: [PATCH 133/144] increment iteration during update, not step with update exposed it is important to increment the iteration when an update is made, whether by step or update alone. more fundementally, it's the update that defines an iterationa, so this is a natural place for the increment. --- src/caffe/solver.cpp | 4 ---- src/caffe/solvers/sgd_solver.cpp | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index d229acff485..bf27beeed41 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -266,10 +266,6 @@ void Solver::Step(int iters) { } ApplyUpdate(); - // Increment the internal iter_ counter -- its value should always indicate - // the number of times the weights have been updated. - ++iter_; - SolverAction::Enum request = GetRequestedAction(); // Save a snapshot if needed. 
diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp index 1d52beb0636..a56a2d0bf51 100644 --- a/src/caffe/solvers/sgd_solver.cpp +++ b/src/caffe/solvers/sgd_solver.cpp @@ -120,6 +120,10 @@ void SGDSolver::ApplyUpdate() { ComputeUpdateValue(param_id, rate); } this->net_->Update(); + + // Increment the internal iter_ counter -- its value should always indicate + // the number of times the weights have been updated. + ++this->iter_; } template From cfcf74fecf2e2b9d35974c5b0a0554921ad66984 Mon Sep 17 00:00:00 2001 From: Mitar Date: Sun, 10 Dec 2017 03:11:26 -0800 Subject: [PATCH 134/144] [pycaffe] expose mutable solver parameter, base lr, and effective lr `solver.lr` is the effective learning rate in use while `solver.base_lr` is the configured learning rate at initialization. the solver parameter is now editable for setting fields that are in use throughout the lifetime of the solver, such as the maximum iteration. --- include/caffe/sgd_solvers.hpp | 2 +- python/caffe/_caffe.cpp | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/include/caffe/sgd_solvers.hpp b/include/caffe/sgd_solvers.hpp index f1819bb62dc..925ff78331e 100644 --- a/include/caffe/sgd_solvers.hpp +++ b/include/caffe/sgd_solvers.hpp @@ -24,10 +24,10 @@ class SGDSolver : public Solver { const vector > >& history() { return history_; } virtual void ApplyUpdate(); + Dtype GetLearningRate(); protected: void PreSolve(); - Dtype GetLearningRate(); virtual void Normalize(int param_id); virtual void Regularize(int param_id); virtual void ComputeUpdateValue(int param_id, Dtype rate); diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp index eed16c29c84..9e7f61402c4 100644 --- a/python/caffe/_caffe.cpp +++ b/python/caffe/_caffe.cpp @@ -490,7 +490,9 @@ BOOST_PYTHON_MODULE(_caffe) { bp::class_("SolverParameter", bp::no_init) .add_property("max_iter", &SolverParameter::max_iter) .add_property("display", &SolverParameter::display) - 
.add_property("layer_wise_reduce", &SolverParameter::layer_wise_reduce); + .add_property("layer_wise_reduce", &SolverParameter::layer_wise_reduce) + .add_property("base_lr", &SolverParameter::base_lr, + &SolverParameter::set_base_lr); bp::class_("LayerParameter", bp::no_init); bp::class_, shared_ptr >, boost::noncopyable>( @@ -509,25 +511,26 @@ BOOST_PYTHON_MODULE(_caffe) { .def("share_weights", &share_weights) .def("apply_update", &Solver::ApplyUpdate) .add_property("param", bp::make_function(&Solver::param, - bp::return_value_policy())); + bp::return_internal_reference<>())); BP_REGISTER_SHARED_PTR_TO_PYTHON(Solver); bp::class_, bp::bases >, shared_ptr >, boost::noncopyable>( - "SGDSolver", bp::init()); - bp::class_, bp::bases >, + "SGDSolver", bp::init()) + .add_property("lr", &SGDSolver::GetLearningRate); + bp::class_, bp::bases >, shared_ptr >, boost::noncopyable>( "NesterovSolver", bp::init()); - bp::class_, bp::bases >, + bp::class_, bp::bases >, shared_ptr >, boost::noncopyable>( "AdaGradSolver", bp::init()); - bp::class_, bp::bases >, + bp::class_, bp::bases >, shared_ptr >, boost::noncopyable>( "RMSPropSolver", bp::init()); - bp::class_, bp::bases >, + bp::class_, bp::bases >, shared_ptr >, boost::noncopyable>( "AdaDeltaSolver", bp::init()); - bp::class_, bp::bases >, + bp::class_, bp::bases >, shared_ptr >, boost::noncopyable>( "AdamSolver", bp::init()); From 1bdcb74ea0d0acc7beb729ad7f01b0e5e44528a5 Mon Sep 17 00:00:00 2001 From: Valentin Tolmer Date: Tue, 21 Jun 2016 17:12:57 -0700 Subject: [PATCH 135/144] [pycaffe] test solver update --- python/caffe/test/test_solver.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/python/caffe/test/test_solver.py b/python/caffe/test/test_solver.py index f618fded8cd..50c9d5412d7 100644 --- a/python/caffe/test/test_solver.py +++ b/python/caffe/test/test_solver.py @@ -38,6 +38,17 @@ def test_solve(self): self.solver.solve() self.assertEqual(self.solver.iter, 100) + def test_apply_update(self): + net = 
self.solver.net + data = net.layers[1].blobs[0].data[...] + # Reset the weights of that layer to 0 + data[...] = 0 + net.layers[1].blobs[0].diff[...] = 1 + # Apply the update, the initial learning rate should be 0.01 + self.solver.apply_update() + # Check that the new weights are -0.01, with a precision of 1e-7 + self.assertTrue((data - -0.01 * np.ones(data.shape)).max() < 1e-7) + def test_net_memory(self): """Check that nets survive after the solver is destroyed.""" From 72e953ba151642850ef8ac4c4e7bf4181660be51 Mon Sep 17 00:00:00 2001 From: Yuda Liu Date: Sun, 8 Jul 2018 15:46:43 +0800 Subject: [PATCH 136/144] Update inner_product_layer.cpp --- src/caffe/layers/inner_product_layer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index e65349f0055..57fdbe1fac2 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -42,7 +42,7 @@ void InnerProductLayer::LayerSetUp(const vector*>& bottom, shared_ptr > weight_filler(GetFiller( this->layer_param_.inner_product_param().weight_filler())); weight_filler->Fill(this->blobs_[0].get()); - // If necessary, intiialize and fill the bias term + // If necessary, initialize and fill the bias term if (bias_term_) { vector bias_shape(1, N_); this->blobs_[1].reset(new Blob(bias_shape)); From f019d0dfe86f49d1140961f8c7dec22130c83154 Mon Sep 17 00:00:00 2001 From: Kuang Fangjun Date: Thu, 12 Jul 2018 16:36:06 +0800 Subject: [PATCH 137/144] fix typos and some minor fixes. 
--- cmake/Modules/FindMKL.cmake | 2 +- include/caffe/net.hpp | 6 +++--- include/caffe/solver.hpp | 4 ++-- include/caffe/util/signal_handler.h | 2 +- python/caffe/_caffe.cpp | 2 +- src/caffe/layers/pooling_layer.cpp | 2 +- src/caffe/net.cpp | 8 ++++---- src/caffe/proto/caffe.proto | 6 +++--- src/caffe/solver.cpp | 6 +++--- src/caffe/util/signal_handler.cpp | 2 +- 10 files changed, 20 insertions(+), 20 deletions(-) diff --git a/cmake/Modules/FindMKL.cmake b/cmake/Modules/FindMKL.cmake index 5ab93b2d6b6..ef0c3bf1c64 100644 --- a/cmake/Modules/FindMKL.cmake +++ b/cmake/Modules/FindMKL.cmake @@ -9,7 +9,7 @@ # This module defines the following variables: # # MKL_FOUND : True mkl is found -# MKL_INCLUDE_DIR : unclude directory +# MKL_INCLUDE_DIR : include directory # MKL_LIBRARIES : the libraries to link against. diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp index d3c9306e9cf..143d5d28883 100644 --- a/include/caffe/net.hpp +++ b/include/caffe/net.hpp @@ -111,9 +111,9 @@ class Net { * another Net. */ void CopyTrainedLayersFrom(const NetParameter& param); - void CopyTrainedLayersFrom(const string trained_filename); - void CopyTrainedLayersFromBinaryProto(const string trained_filename); - void CopyTrainedLayersFromHDF5(const string trained_filename); + void CopyTrainedLayersFrom(const string& trained_filename); + void CopyTrainedLayersFromBinaryProto(const string& trained_filename); + void CopyTrainedLayersFromHDF5(const string& trained_filename); /// @brief Writes the net to a proto. void ToProto(NetParameter* param, bool write_diff = false) const; /// @brief Writes the net to an HDF5 file. diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp index 75560f9fd08..7a0d7777f2d 100644 --- a/include/caffe/solver.hpp +++ b/include/caffe/solver.hpp @@ -55,7 +55,7 @@ class Solver { // The main entry of the solver function. In default, iter will be zero. Pass // in a non-zero iter number to resume training for a pre-trained net. 
virtual void Solve(const char* resume_file = NULL); - inline void Solve(const string resume_file) { Solve(resume_file.c_str()); } + inline void Solve(const string& resume_file) { Solve(resume_file.c_str()); } void Step(int iters); // The Restore method simply dispatches to one of the // RestoreSolverStateFrom___ protected methods. You should implement these @@ -98,7 +98,7 @@ class Solver { virtual void ApplyUpdate() = 0; protected: - string SnapshotFilename(const string extension); + string SnapshotFilename(const string& extension); string SnapshotToBinaryProto(); string SnapshotToHDF5(); // The test routine diff --git a/include/caffe/util/signal_handler.h b/include/caffe/util/signal_handler.h index fb84c65bd2e..5246332581e 100644 --- a/include/caffe/util/signal_handler.h +++ b/include/caffe/util/signal_handler.h @@ -8,7 +8,7 @@ namespace caffe { class SignalHandler { public: - // Contructor. Specify what action to take when a signal is received. + // Constructor. Specify what action to take when a signal is received. SignalHandler(SolverAction::Enum SIGINT_action, SolverAction::Enum SIGHUP_action); ~SignalHandler(); diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp index 9e7f61402c4..82bf21e6e16 100644 --- a/python/caffe/_caffe.cpp +++ b/python/caffe/_caffe.cpp @@ -416,7 +416,7 @@ BOOST_PYTHON_MODULE(_caffe) { .def("reshape", &Net::Reshape) .def("clear_param_diffs", &Net::ClearParamDiffs) // The cast is to select a particular overload. 
- .def("copy_from", static_cast::*)(const string)>( + .def("copy_from", static_cast::*)(const string&)>( &Net::CopyTrainedLayersFrom)) .def("share_with", &Net::ShareTrainedLayersWith) .add_property("_blob_loss_weights", bp::make_function( diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp index 90897db0f45..1fa78904ea8 100644 --- a/src/caffe/layers/pooling_layer.cpp +++ b/src/caffe/layers/pooling_layer.cpp @@ -132,7 +132,7 @@ void PoolingLayer::Forward_cpu(const vector*>& bottom, const int top_count = top[0]->count(); // We'll output the mask to top[1] if it's of size >1. const bool use_top_mask = top.size() > 1; - int* mask = NULL; // suppress warnings about uninitalized variables + int* mask = NULL; // suppress warnings about uninitialized variables Dtype* top_mask = NULL; // Different pooling methods. We explicitly do the switch outside the for // loop to save time, although this results in more code. diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp index 353c2f95b9e..94c0220f172 100644 --- a/src/caffe/net.cpp +++ b/src/caffe/net.cpp @@ -164,7 +164,7 @@ void Net::Init(const NetParameter& in_param) { // loss. We can skip backward computation for blobs that don't contribute // to the loss. 
// Also checks if all bottom blobs don't need backward computation (possible - // because the skip_propagate_down param) and so we can skip bacward + // because the skip_propagate_down param) and so we can skip backward // computation for the entire layer set blobs_under_loss; set blobs_skip_backp; @@ -768,7 +768,7 @@ void Net::CopyTrainedLayersFrom(const NetParameter& param) { } template -void Net::CopyTrainedLayersFrom(const string trained_filename) { +void Net::CopyTrainedLayersFrom(const string& trained_filename) { if (H5Fis_hdf5(trained_filename.c_str())) { CopyTrainedLayersFromHDF5(trained_filename); } else { @@ -778,14 +778,14 @@ void Net::CopyTrainedLayersFrom(const string trained_filename) { template void Net::CopyTrainedLayersFromBinaryProto( - const string trained_filename) { + const string& trained_filename) { NetParameter param; ReadNetParamsFromBinaryFileOrDie(trained_filename, ¶m); CopyTrainedLayersFrom(param); } template -void Net::CopyTrainedLayersFromHDF5(const string trained_filename) { +void Net::CopyTrainedLayersFromHDF5(const string& trained_filename) { hid_t file_hid = H5Fopen(trained_filename.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT); CHECK_GE(file_hid, 0) << "Couldn't open " << trained_filename; diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index b9bb3f4dffe..2f8dffc0e1b 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -187,7 +187,7 @@ message SolverParameter { optional int32 snapshot = 14 [default = 0]; // The snapshot interval // The prefix for the snapshot. - // If not set then is replaced by prototxt file path without extention. + // If not set then is replaced by prototxt file path without extension. // If is set to directory then is augmented by prototxt file name // without extention. optional string snapshot_prefix = 15; @@ -248,8 +248,8 @@ message SolverParameter { // Path to caffemodel file(s) with pretrained weights to initialize finetuning. 
// Tha same as command line --weights parameter for caffe train command. - // If command line --weights parameter if specified, it has higher priority - // and owerwrites this one(s). + // If command line --weights parameter is specified, it has higher priority + // and overwrites this one(s). // If --snapshot command line parameter is specified, this one(s) are ignored. // If several model files are expected, they can be listed in a one // weights parameter separated by ',' (like in a command string) or diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp index bf27beeed41..842312e0b76 100644 --- a/src/caffe/solver.cpp +++ b/src/caffe/solver.cpp @@ -78,7 +78,7 @@ template void Solver::InitTrainNet() { const int num_train_nets = param_.has_net() + param_.has_net_param() + param_.has_train_net() + param_.has_train_net_param(); - const string& field_names = "net, net_param, train_net, train_net_param"; + const string field_names = "net, net_param, train_net, train_net_param"; CHECK_GE(num_train_nets, 1) << "SolverParameter must specify a train net " << "using one of these fields: " << field_names; CHECK_LE(num_train_nets, 1) << "SolverParameter must not contain more than " @@ -447,13 +447,13 @@ void Solver::CheckSnapshotWritePermissions() { } else { LOG(FATAL) << "Cannot write to snapshot prefix '" << param_.snapshot_prefix() << "'. 
Make sure " - << "that the directory exists and is writeable."; + << "that the directory exists and is writable."; } } } template -string Solver::SnapshotFilename(const string extension) { +string Solver::SnapshotFilename(const string& extension) { return param_.snapshot_prefix() + "_iter_" + caffe::format_int(iter_) + extension; } diff --git a/src/caffe/util/signal_handler.cpp b/src/caffe/util/signal_handler.cpp index 5d764ec524f..9658fb390ea 100644 --- a/src/caffe/util/signal_handler.cpp +++ b/src/caffe/util/signal_handler.cpp @@ -48,7 +48,7 @@ namespace { void UnhookHandler() { if (already_hooked_up) { struct sigaction sa; - // Setup the sighub handler + // Setup the sighup handler sa.sa_handler = SIG_DFL; // Restart the system call, if at all possible sa.sa_flags = SA_RESTART; From 43536289bd770f7bd29ce407361d78601b9ff2f0 Mon Sep 17 00:00:00 2001 From: Pavel Grunt Date: Tue, 14 Aug 2018 15:22:08 +0200 Subject: [PATCH 138/144] python: Set gpu device id before setting gpu mode Otherwise caffe allocates some memory on GPU#0 --- docs/tutorial/interfaces.md | 2 +- python/train.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/tutorial/interfaces.md b/docs/tutorial/interfaces.md index b5a4f1ad069..2578af5d4de 100644 --- a/docs/tutorial/interfaces.md +++ b/docs/tutorial/interfaces.md @@ -129,8 +129,8 @@ Use CPU: Use GPU and specify its gpu_id: - caffe.set_mode_gpu(); caffe.set_device(gpu_id); + caffe.set_mode_gpu(); #### Create a network and access its layers and blobs diff --git a/python/train.py b/python/train.py index 5897f5dcb90..14a38b8cef1 100644 --- a/python/train.py +++ b/python/train.py @@ -63,8 +63,8 @@ def show_time(): def solve(proto, snapshot, gpus, timing, uid, rank): - caffe.set_mode_gpu() caffe.set_device(gpus[rank]) + caffe.set_mode_gpu() caffe.set_solver_count(len(gpus)) caffe.set_solver_rank(rank) caffe.set_multiprocess(True) From 7f4f5d2563abaecb5ab983d2bac4daf21e5b3a98 Mon Sep 17 00:00:00 2001 From: Harm Berntsen Date: 
Mon, 18 Jan 2016 11:41:14 +0100 Subject: [PATCH 139/144] Add clip layer --- include/caffe/layers/clip_layer.hpp | 75 ++++++++++++++++++++++++++++ src/caffe/layer_factory.cpp | 1 + src/caffe/layers/clip_layer.cpp | 50 +++++++++++++++++++ src/caffe/layers/clip_layer.cu | 66 ++++++++++++++++++++++++ src/caffe/proto/caffe.proto | 9 +++- src/caffe/test/test_neuron_layer.cpp | 33 ++++++++++++ 6 files changed, 233 insertions(+), 1 deletion(-) create mode 100644 include/caffe/layers/clip_layer.hpp create mode 100644 src/caffe/layers/clip_layer.cpp create mode 100644 src/caffe/layers/clip_layer.cu diff --git a/include/caffe/layers/clip_layer.hpp b/include/caffe/layers/clip_layer.hpp new file mode 100644 index 00000000000..2788193e3ec --- /dev/null +++ b/include/caffe/layers/clip_layer.hpp @@ -0,0 +1,75 @@ +#ifndef CAFFE_CLIP_LAYER_HPP_ +#define CAFFE_CLIP_LAYER_HPP_ + +#include + +#include "caffe/blob.hpp" +#include "caffe/layer.hpp" +#include "caffe/proto/caffe.pb.h" + +#include "caffe/layers/neuron_layer.hpp" + +namespace caffe { + +/** + * @brief Clip: @f$ y = \max(min, \min(max, x)) @f$. + */ +template +class ClipLayer : public NeuronLayer { + public: + /** + * @param param provides ClipParameter clip_param, + * with ClipLayer options: + * - min + * - max + */ + explicit ClipLayer(const LayerParameter& param) + : NeuronLayer(param) {} + + virtual inline const char* type() const { return "Clip"; } + + protected: + /** + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$ + * @param top output Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the computed outputs @f$ + * y = \max(min, \min(max, x)) + * @f$ + */ + virtual void Forward_cpu(const vector*>& bottom, + const vector*>& top); + virtual void Forward_gpu(const vector*>& bottom, + const vector*>& top); + + /** + * @brief Computes the error gradient w.r.t. the clipped inputs. 
+ * + * @param top output Blob vector (length 1), providing the error gradient with + * respect to the outputs + * -# @f$ (N \times C \times H \times W) @f$ + * containing error gradients @f$ \frac{\partial E}{\partial y} @f$ + * with respect to computed outputs @f$ y @f$ + * @param propagate_down see Layer::Backward. + * @param bottom input Blob vector (length 1) + * -# @f$ (N \times C \times H \times W) @f$ + * the inputs @f$ x @f$; Backward fills their diff with + * gradients @f$ + * \frac{\partial E}{\partial x} = \left\{ + * \begin{array}{lr} + * 0 & \mathrm{if} \; x < min \vee x > max \\ + * \frac{\partial E}{\partial y} & \mathrm{if} \; x \ge min \wedge x \le max + * \end{array} \right. + * @f$ + */ + virtual void Backward_cpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); + virtual void Backward_gpu(const vector*>& top, + const vector& propagate_down, const vector*>& bottom); +}; + +} // namespace caffe + +#endif // CAFFE_CLIP_LAYER_HPP_ diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp index 9f9026b1dde..d9984431ace 100644 --- a/src/caffe/layer_factory.cpp +++ b/src/caffe/layer_factory.cpp @@ -7,6 +7,7 @@ #include "caffe/layer.hpp" #include "caffe/layer_factory.hpp" +#include "caffe/layers/clip_layer.hpp" #include "caffe/layers/conv_layer.hpp" #include "caffe/layers/deconv_layer.hpp" #include "caffe/layers/lrn_layer.hpp" diff --git a/src/caffe/layers/clip_layer.cpp b/src/caffe/layers/clip_layer.cpp new file mode 100644 index 00000000000..76387011fa3 --- /dev/null +++ b/src/caffe/layers/clip_layer.cpp @@ -0,0 +1,50 @@ +#include +#include +#include "caffe/layers/clip_layer.hpp" + +namespace caffe { + +template +void ClipLayer::Forward_cpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + Dtype* top_data = top[0]->mutable_cpu_data(); + const int count = bottom[0]->count(); + + Dtype min = this->layer_param_.clip_param().min(); + Dtype max = 
this->layer_param_.clip_param().max(); + + for (int i = 0; i < count; ++i) { + top_data[i] = std::max(min, std::min(bottom_data[i], max)); + } +} + +template +void ClipLayer::Backward_cpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->cpu_data(); + const Dtype* top_diff = top[0]->cpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_cpu_diff(); + const int count = bottom[0]->count(); + + Dtype min = this->layer_param_.clip_param().min(); + Dtype max = this->layer_param_.clip_param().max(); + + for (int i = 0; i < count; ++i) { + bottom_diff[i] = top_diff[i] * ( + bottom_data[i] >= min && bottom_data[i] <= max); + } + } +} + + +#ifdef CPU_ONLY +STUB_GPU(ClipLayer); +#endif + +INSTANTIATE_CLASS(ClipLayer); +REGISTER_LAYER_CLASS(Clip); + +} // namespace caffe diff --git a/src/caffe/layers/clip_layer.cu b/src/caffe/layers/clip_layer.cu new file mode 100644 index 00000000000..f780447fbcf --- /dev/null +++ b/src/caffe/layers/clip_layer.cu @@ -0,0 +1,66 @@ +#include +#include "caffe/layers/clip_layer.hpp" +#include "caffe/util/math_functions.hpp" + +namespace caffe { + +__global__ void ClipForward(const int n, const float* in, float* out, + float p_min, float p_max) { + CUDA_KERNEL_LOOP(index, n) { + out[index] = fmaxf(p_min, fminf(in[index], p_max)); + } +} + +__global__ void ClipForward(const int n, const double* in, double* out, + double p_min, double p_max) { + CUDA_KERNEL_LOOP(index, n) { + out[index] = fmax(p_min, fmin(in[index], p_max)); + } +} + +template +void ClipLayer::Forward_gpu(const vector*>& bottom, + const vector*>& top) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + Dtype* top_data = top[0]->mutable_gpu_data(); + const int count = bottom[0]->count(); + Dtype p_min = this->layer_param_.clip_param().min(); + Dtype p_max = this->layer_param_.clip_param().max(); + // NOLINT_NEXT_LINE(whitespace/operators) + ClipForward<<>>( + count, bottom_data, 
top_data, p_min, p_max); + CUDA_POST_KERNEL_CHECK; +} + +template +__global__ void ClipBackward(const int n, const Dtype* in_diff, + const Dtype* in_data, Dtype* out_diff, Dtype p_min, Dtype p_max) { + CUDA_KERNEL_LOOP(index, n) { + out_diff[index] = in_diff[index] * ( + in_data[index] >= p_min && in_data[index] <= p_max); + } +} + +template +void ClipLayer::Backward_gpu(const vector*>& top, + const vector& propagate_down, + const vector*>& bottom) { + if (propagate_down[0]) { + const Dtype* bottom_data = bottom[0]->gpu_data(); + const Dtype* top_diff = top[0]->gpu_diff(); + Dtype* bottom_diff = bottom[0]->mutable_gpu_diff(); + const int count = bottom[0]->count(); + Dtype p_min = this->layer_param_.clip_param().min(); + Dtype p_max = this->layer_param_.clip_param().max(); + // NOLINT_NEXT_LINE(whitespace/operators) + ClipBackward<<>>( + count, top_diff, bottom_data, bottom_diff, p_min, p_max); + CUDA_POST_KERNEL_CHECK; + } +} + + +INSTANTIATE_LAYER_GPU_FUNCS(ClipLayer); + + +} // namespace caffe diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index f784aa9600c..5c235c6f87c 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -322,7 +322,7 @@ message ParamSpec { // NOTE // Update the next available ID when you add a new LayerParameter field. 
// -// LayerParameter next available layer-specific ID: 148 (last added: swish_param) +// LayerParameter next available layer-specific ID: 149 (last added: clip_param) message LayerParameter { optional string name = 1; // the layer name optional string type = 2; // the layer type @@ -378,6 +378,7 @@ message LayerParameter { optional ArgMaxParameter argmax_param = 103; optional BatchNormParameter batch_norm_param = 139; optional BiasParameter bias_param = 141; + optional ClipParameter clip_param = 148; optional ConcatParameter concat_param = 104; optional ContrastiveLossParameter contrastive_loss_param = 105; optional ConvolutionParameter convolution_param = 106; @@ -505,6 +506,12 @@ message ArgMaxParameter { optional int32 axis = 3; } +// Message that stores parameters used by ClipLayer +message ClipParameter { + required float min = 1; + required float max = 2; +} + message ConcatParameter { // The axis along which to concatenate -- may be negative to index from the // end (e.g., -1 for the last axis). 
Other axes must have the diff --git a/src/caffe/test/test_neuron_layer.cpp b/src/caffe/test/test_neuron_layer.cpp index 83d80fcd895..5865e08e552 100644 --- a/src/caffe/test/test_neuron_layer.cpp +++ b/src/caffe/test/test_neuron_layer.cpp @@ -10,6 +10,7 @@ #include "caffe/layers/absval_layer.hpp" #include "caffe/layers/bnll_layer.hpp" +#include "caffe/layers/clip_layer.hpp" #include "caffe/layers/dropout_layer.hpp" #include "caffe/layers/elu_layer.hpp" #include "caffe/layers/exp_layer.hpp" @@ -206,6 +207,38 @@ TYPED_TEST(NeuronLayerTest, TestAbsGradient) { this->blob_top_vec_); } +TYPED_TEST(NeuronLayerTest, TestClip) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + CHECK(google::protobuf::TextFormat::ParseFromString( + "clip_param { min: -1, max: 2 }", &layer_param)); + ClipLayer layer(layer_param); + layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_); + // Now, check values + const Dtype* bottom_data = this->blob_bottom_->cpu_data(); + const Dtype* top_data = this->blob_top_->cpu_data(); + for (int i = 0; i < this->blob_bottom_->count(); ++i) { + EXPECT_GE(top_data[i], -1); + EXPECT_LE(top_data[i], 2); + EXPECT_TRUE(bottom_data[i] > -1 || top_data[i] == -1); + EXPECT_TRUE(bottom_data[i] < 2 || top_data[i] == 2); + EXPECT_TRUE(!(bottom_data[i] >= -1 && bottom_data[i] <= 2) + || top_data[i] == bottom_data[i]); + } +} + +TYPED_TEST(NeuronLayerTest, TestClipGradient) { + typedef typename TypeParam::Dtype Dtype; + LayerParameter layer_param; + CHECK(google::protobuf::TextFormat::ParseFromString( + "clip_param { min: -1, max: 2 }", &layer_param)); + ClipLayer layer(layer_param); + GradientChecker checker(1e-2, 1e-3); + checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, + this->blob_top_vec_); +} + TYPED_TEST(NeuronLayerTest, TestReLU) { typedef typename TypeParam::Dtype Dtype; LayerParameter layer_param; From b56db67b974547d0f78f9dbee097ddabdf8f0a1d Mon Sep 17 
00:00:00 2001 From: Noiredd Date: Fri, 30 Mar 2018 11:43:44 +0200 Subject: [PATCH 140/144] test case fix for Clip layer gradient minor lint fixes --- src/caffe/layers/clip_layer.cpp | 1 + src/caffe/layers/clip_layer.cu | 1 + src/caffe/test/test_neuron_layer.cpp | 34 +++++++++++++++++++++++++--- 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/src/caffe/layers/clip_layer.cpp b/src/caffe/layers/clip_layer.cpp index 76387011fa3..9d9a59673c0 100644 --- a/src/caffe/layers/clip_layer.cpp +++ b/src/caffe/layers/clip_layer.cpp @@ -1,5 +1,6 @@ #include #include + #include "caffe/layers/clip_layer.hpp" namespace caffe { diff --git a/src/caffe/layers/clip_layer.cu b/src/caffe/layers/clip_layer.cu index f780447fbcf..56f3be32d7d 100644 --- a/src/caffe/layers/clip_layer.cu +++ b/src/caffe/layers/clip_layer.cu @@ -1,4 +1,5 @@ #include + #include "caffe/layers/clip_layer.hpp" #include "caffe/util/math_functions.hpp" diff --git a/src/caffe/test/test_neuron_layer.cpp b/src/caffe/test/test_neuron_layer.cpp index 5865e08e552..d1ecc37b661 100644 --- a/src/caffe/test/test_neuron_layer.cpp +++ b/src/caffe/test/test_neuron_layer.cpp @@ -234,9 +234,37 @@ TYPED_TEST(NeuronLayerTest, TestClipGradient) { CHECK(google::protobuf::TextFormat::ParseFromString( "clip_param { min: -1, max: 2 }", &layer_param)); ClipLayer layer(layer_param); - GradientChecker checker(1e-2, 1e-3); - checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_, - this->blob_top_vec_); + // Unfortunately, it might happen that an input value lands exactly within + // the discontinuity region of the Clip function. In this case the numeric + // gradient is likely to differ significantly (i.e. by a value larger than + // checker tolerance) from the computed gradient. To handle such cases, we + // eliminate such values from the input blob before the gradient check. 
+ const Dtype epsilon = 1e-2; + const Dtype min_range_start = layer_param.clip_param().min() - epsilon; + const Dtype min_range_end = layer_param.clip_param().min() + epsilon; + const Dtype max_range_start = layer_param.clip_param().max() - epsilon; + const Dtype max_range_end = layer_param.clip_param().max() + epsilon; + // The input blob is owned by the NeuronLayerTest object, so we begin with + // creating a temporary blob and copying the input data there. + Blob temp_bottom; + temp_bottom.ReshapeLike(*this->blob_bottom_); + const Dtype* bottom_data = this->blob_bottom_->cpu_data(); + Dtype* temp_data_mutable = temp_bottom.mutable_cpu_data(); + for (int i = 0; i < this->blob_bottom_->count(); ++i) { + if (bottom_data[i] >= min_range_start && + bottom_data[i] <= min_range_end) { + temp_data_mutable[i] = bottom_data[i] - epsilon; + } else if (bottom_data[i] >= max_range_start && + bottom_data[i] <= max_range_end) { + temp_data_mutable[i] = bottom_data[i] + epsilon; + } else { + temp_data_mutable[i] = bottom_data[i]; + } + } + vector*> temp_bottom_vec; + temp_bottom_vec.push_back(&temp_bottom); + GradientChecker checker(epsilon, 1e-3); + checker.CheckGradientEltwise(&layer, temp_bottom_vec, this->blob_top_vec_); } TYPED_TEST(NeuronLayerTest, TestReLU) { From 4ac6443908fde20429cfc2e4dd7b9cd4696ee415 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw=20Dolata?= Date: Fri, 17 Aug 2018 14:45:04 +0200 Subject: [PATCH 141/144] Clip layer documentation --- docs/tutorial/layers.md | 1 + docs/tutorial/layers/clip.md | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) create mode 100644 docs/tutorial/layers/clip.md diff --git a/docs/tutorial/layers.md b/docs/tutorial/layers.md index 78a46f3a7ee..5036d4fd7c0 100644 --- a/docs/tutorial/layers.md +++ b/docs/tutorial/layers.md @@ -93,6 +93,7 @@ Layers: * [Log](layers/log.html) - f(x) = log(x). * [BNLL](layers/bnll.html) - f(x) = log(1 + exp(x)). 
* [Threshold](layers/threshold.html) - performs step function at user defined threshold. +* [Clip](layers/clip.html) - clips a blob between a fixed minimum and maximum value. * [Bias](layers/bias.html) - adds a bias to a blob that can either be learned or fixed. * [Scale](layers/scale.html) - scales a blob by an amount that can either be learned or fixed. diff --git a/docs/tutorial/layers/clip.md b/docs/tutorial/layers/clip.md new file mode 100644 index 00000000000..d6a20f5f826 --- /dev/null +++ b/docs/tutorial/layers/clip.md @@ -0,0 +1,20 @@ +--- +title: Clip Layer +--- + +# Clip Layer + +* Layer type: `Clip` +* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ClipLayer.html) +* Header: [`./include/caffe/layers/clip_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/clip_layer.hpp) +* CPU implementation: [`./src/caffe/layers/clip_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/clip_layer.cpp) +* CUDA GPU implementation: [`./src/caffe/layers/clip_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/clip_layer.cu) + +## Parameters + +* Parameters (`ClipParameter clip_param`) +* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto): + +{% highlight Protobuf %} +{% include proto/ClipParameter.txt %} +{% endhighlight %} From d6d179a410c8e982255e2833c569cb1d465678a7 Mon Sep 17 00:00:00 2001 From: Cyprien Noel Date: Fri, 1 Mar 2019 21:05:47 -0400 Subject: [PATCH 142/144] Updated Intel's branch description --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fe259535865..46abdb42e90 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ and step-by-step examples. ## Custom distributions - - [Intel Caffe](https://github.com/BVLC/caffe/tree/intel) (Optimized for CPU and support for multi-node), in particular Xeon processors (HSW, BDW, SKX, Xeon Phi). 
+ - [Intel Caffe](https://github.com/BVLC/caffe/tree/intel) (Optimized for CPU and support for multi-node), in particular Xeon processors. - [OpenCL Caffe](https://github.com/BVLC/caffe/tree/opencl) e.g. for AMD or Intel devices. - [Windows Caffe](https://github.com/BVLC/caffe/tree/windows) From 04ab089db018a292ae48d51732dd6c66766b36b6 Mon Sep 17 00:00:00 2001 From: Cyprien Noel Date: Fri, 1 Mar 2019 21:13:01 -0400 Subject: [PATCH 143/144] Updated Intel's branch description --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 46abdb42e90..3705c55a0a4 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ and step-by-step examples. ## Custom distributions - - [Intel Caffe](https://github.com/BVLC/caffe/tree/intel) (Optimized for CPU and support for multi-node), in particular Xeon processors. + - [Intel Caffe](https://github.com/BVLC/caffe/tree/intel) (Optimized for CPU and support for multi-node), in particular Intel® Xeon processors. - [OpenCL Caffe](https://github.com/BVLC/caffe/tree/opencl) e.g. for AMD or Intel devices. - [Windows Caffe](https://github.com/BVLC/caffe/tree/windows) From 388bf12ab7826975d95dae1074afbd77b8920600 Mon Sep 17 00:00:00 2001 From: Tim Gates Date: Thu, 12 Dec 2019 20:30:01 +1100 Subject: [PATCH 144/144] Fix simple typo: overrided -> overridden Closes #6877 --- scripts/cpp_lint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/cpp_lint.py b/scripts/cpp_lint.py index b2016d4b6dd..fb44026718e 100755 --- a/scripts/cpp_lint.py +++ b/scripts/cpp_lint.py @@ -211,7 +211,7 @@ 'whitespace/todo' ] -# The default state of the category filter. This is overrided by the --filter= +# The default state of the category filter. This is overridden by the --filter= # flag. By default all errors are on, so only add here categories that should be # off by default (i.e., categories that must be enabled by the --filter= flags). 
# All entries here should start with a '-' or '+', as in the --filter= flag.