diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
index d78a3dc3455..c981f62f7a1 100644
--- a/.github/ISSUE_TEMPLATE.md
+++ b/.github/ISSUE_TEMPLATE.md
@@ -1,19 +1,34 @@
-Please use the [caffe-users list](https://groups.google.com/forum/#!forum/caffe-users) for usage, installation, or modeling questions, or other requests for help.
-_Do not post such requests to Issues._ Doing so interferes with the development of Caffe.
+## Important - read before submitting
 
-Please read the [guidelines for contributing](https://github.com/BVLC/caffe/blob/master/CONTRIBUTING.md) before submitting this issue.
+*Please read the [guidelines for contributing](https://github.com/BVLC/caffe/blob/master/CONTRIBUTING.md) before submitting this issue!*
+
+*Please do not post installation, build, usage, or modeling questions, or other requests for help to Issues.*
+Use the [caffe-users list](https://groups.google.com/forum/#!forum/caffe-users) instead.
+This helps developers maintain a clear, uncluttered, and efficient view of the state of Caffe.
 
 ### Issue summary
 
 
 ### Steps to reproduce
 
-If you are having difficulty building Caffe or training a model, please ask the caffe-users mailing list. If you are reporting a build error that seems to be due to a bug in Caffe, please attach your build configuration (either Makefile.config or CMakeCache.txt) and the output of the make (or cmake) command.
 
-### Your system configuration
-Operating system:
-Compiler:
-CUDA version (if applicable):
-CUDNN version (if applicable):
-BLAS:
-Python or MATLAB version (for pycaffe and matcaffe respectively):
+### Tried solutions
+
+
+### System configuration
+
+* Operating system: 
+* Compiler: 
+* CUDA version (if applicable): 
+* CUDNN version (if applicable): 
+* BLAS: 
+* Python version (if using pycaffe): 
+* MATLAB version (if using matcaffe): 
+
+### Issue checklist
+
+- [ ] read the guidelines and removed the first paragraph
+- [ ] written a short summary and detailed steps to reproduce
+- [ ] explained how solutions to related problems failed (tick if found none)
+- [ ] filled system configuration
+- [ ] attached relevant logs/config files (tick if not applicable)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3056d75eaba..27d172f900b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,14 +10,15 @@ endif()
 project(Caffe C CXX)
 
 # ---[ Caffe version
-set(CAFFE_TARGET_VERSION "1.0.0-rc5" CACHE STRING "Caffe logical version")
-set(CAFFE_TARGET_SOVERSION "1.0.0-rc5" CACHE STRING "Caffe soname version")
+set(CAFFE_TARGET_VERSION "1.0.0" CACHE STRING "Caffe logical version")
+set(CAFFE_TARGET_SOVERSION "1.0.0" CACHE STRING "Caffe soname version")
 add_definitions(-DCAFFE_VERSION=${CAFFE_TARGET_VERSION})
 
 # ---[ Using cmake scripts and modules
 list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
 
 include(ExternalProject)
+include(GNUInstallDirs)
 
 include(cmake/Utils.cmake)
 include(cmake/Targets.cmake)
@@ -41,6 +42,9 @@ caffe_option(USE_LMDB "Build with lmdb" ON)
 caffe_option(ALLOW_LMDB_NOLOCK "Allow MDB_NOLOCK when reading LMDB files (only if necessary)" OFF)
 caffe_option(USE_OPENMP "Link with OpenMP (when your BLAS wants OpenMP and you get linker errors)" OFF)
 
+# This code is taken from https://github.com/sh1r0/caffe-android-lib
+caffe_option(USE_HDF5 "Build with hdf5" ON)
+
 # ---[ Dependencies
 include(cmake/Dependencies.cmake)
 
@@ -103,8 +107,19 @@ if(BUILD_python)
   add_dependencies(pytest pycaffe)
 endif()
 
+# ---[ uninstall target
+configure_file(
+    ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Uninstall.cmake.in
+    ${CMAKE_CURRENT_BINARY_DIR}/cmake/Uninstall.cmake
+    IMMEDIATE @ONLY)
+
+add_custom_target(uninstall
+    COMMAND ${CMAKE_COMMAND} -P
+    ${CMAKE_CURRENT_BINARY_DIR}/cmake/Uninstall.cmake)
+
 # ---[ Configuration summary
 caffe_print_configuration_summary()
 
 # ---[ Export configs generation
 caffe_generate_export_configs()
+
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 8cd5e56ca49..45f7e186e4f 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,21 +1,63 @@
 # Contributing
 
+Below you will find a collection of guidelines for submitting issues as well as contributing code to the Caffe repository.
+Please read those before starting an issue or a pull request.
+
 ## Issues
 
 Specific Caffe design and development issues, bugs, and feature requests are maintained by GitHub Issues.
 
-_Please do not post usage, installation, or modeling questions, or other requests for help to Issues._
-Use the [caffe-users list](https://groups.google.com/forum/#!forum/caffe-users) instead. This helps developers maintain a clear, uncluttered, and efficient view of the state of Caffe.
-
-When reporting a bug, it's most helpful to provide the following information, where applicable:
+*Please do not post installation, build, usage, or modeling questions, or other requests for help to Issues.*
+Use the [caffe-users list](https://groups.google.com/forum/#!forum/caffe-users) instead.
+This helps developers maintain a clear, uncluttered, and efficient view of the state of Caffe.
+See the chapter [caffe-users](#caffe-users) below for guidance on posting to the users list.
 
-* What steps reproduce the bug?
-* Can you reproduce the bug using the latest [master](https://github.com/BVLC/caffe/tree/master), compiled with the `DEBUG` make option?
-* What hardware and operating system/distribution are you running?
+When reporting an issue, it's most helpful to provide the following information, where applicable:
+* How does the problem look like and what steps reproduce it?
+* Can you reproduce it using the latest [master](https://github.com/BVLC/caffe/tree/master), compiled with the `DEBUG` make option?
+* What hardware and software are you running? In particular:
+	* GPU make and model, if relevant,
+	* operating system/distribution,
+	* compiler; please also post which version (for example, with GCC run `gcc --version` to check),
+	* CUDA version, if applicable (run `nvcc --version` to check),
+	* cuDNN version, if applicable (version number is stored in `cudnn.h`, look for lines containing `CUDNN_MAJOR`, `CUDNN_MINOR` and `CUDNN_PATCHLEVEL`),
+	* BLAS library,
+	* Python version, if relevant,
+	* MATLAB version, if relevant.
+* **What have you already tried** to solve the problem? How did it fail? Are there any other issues related to yours?
+* If this is not a build-related issue, does your installation pass `make runtest`?
 * If the bug is a crash, provide the backtrace (usually printed by Caffe; always obtainable with `gdb`).
+* If you are reporting a build error that seems to be due to a bug in Caffe, please attach your build configuration (either Makefile.config or CMakeCache.txt) and the output of the make (or cmake) command.
+
+If only a small portion of the code/log is relevant to your issue, you may paste it directly into the post, preferably using Markdown syntax for code block: triple backtick ( \`\`\` ) to open/close a block.
+In other cases (multiple files, or long files), please **attach** them to the post - this greatly improves readability.
+
+If the problem arises during a complex operation (e.g. large script using pycaffe, long network prototxt), please reduce the example to the minimal size that still causes the error.
+Also, minimize influence of external modules, data etc. - this way it will be easier for others to understand and reproduce your issue, and eventually help you.
+Sometimes you will find the root cause yourself in the process.
 
 Try to give your issue a title that is succinct and specific. The devs will rename issues as needed to keep track of them.
 
+## Caffe-users
+
+Before you post to the [caffe-users list](https://groups.google.com/forum/#!forum/caffe-users), make sure you look for existing solutions.
+The Caffe community has encountered and found solutions to countless problems - benefit from the collective experience.
+Recommended places to look:
+* the [users list](https://groups.google.com/forum/#!forum/caffe-users) itself,
+* [`caffe`](https://stackoverflow.com/questions/tagged/caffe) tag on StackOverflow,
+* [GitHub issues](https://github.com/BVLC/caffe/issues) tracker (some problems have been answered there),
+* the public [wiki](https://github.com/BVLC/caffe/wiki),
+* the official [documentation](http://caffe.berkeleyvision.org/).
+
+Found a post/issue with your exact problem, but with no answer?
+Don't just leave a "me too" message - provide the details of your case.
+Problems with more available information are easier to solve and attract good attention.
+
+When posting to the list, make sure you provide as much relevant information as possible - recommendations for an issue report (see above) are a good starting point.  
+*Please make it very clear which version of Caffe you are using, especially if it is a fork not maintained by BVLC.*
+
+Formatting recommendations hold: paste short logs/code fragments into the post (use fixed-width text for them), **attach** long logs or multiple files.
+
 ## Pull Requests
 
 Caffe welcomes all contributions.
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 8db66ea82c6..3fd767812e9 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -1,6 +1,6 @@
 # Contributors
 
-Caffe is developed by a core set of BVLC members and the open-source community.
+Caffe is developed by a core set of BAIR members and the open-source community.
 
 We thank all of our [contributors](https://github.com/BVLC/caffe/graphs/contributors)!
 
diff --git a/Makefile b/Makefile
index 77900b69b97..b7660e852d6 100644
--- a/Makefile
+++ b/Makefile
@@ -34,7 +34,7 @@ LIB_BUILD_DIR := $(BUILD_DIR)/lib
 STATIC_NAME := $(LIB_BUILD_DIR)/lib$(LIBRARY_NAME).a
 DYNAMIC_VERSION_MAJOR 		:= 1
 DYNAMIC_VERSION_MINOR 		:= 0
-DYNAMIC_VERSION_REVISION 	:= 0-rc5
+DYNAMIC_VERSION_REVISION 	:= 0
 DYNAMIC_NAME_SHORT := lib$(LIBRARY_NAME).so
 #DYNAMIC_SONAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR)
 DYNAMIC_VERSIONED_NAME_SHORT := $(DYNAMIC_NAME_SHORT).$(DYNAMIC_VERSION_MAJOR).$(DYNAMIC_VERSION_MINOR).$(DYNAMIC_VERSION_REVISION)
@@ -178,11 +178,13 @@ ifneq ($(CPU_ONLY), 1)
 	LIBRARIES := cudart cublas curand
 endif
 
-LIBRARIES += glog gflags protobuf boost_system boost_filesystem m hdf5_hl hdf5
+LIBRARIES += glog gflags protobuf boost_system boost_filesystem m
 
 # handle IO dependencies
 USE_LEVELDB ?= 1
 USE_LMDB ?= 1
+# This code is taken from https://github.com/sh1r0/caffe-android-lib
+USE_HDF5 ?= 1
 USE_OPENCV ?= 1
 
 ifeq ($(USE_LEVELDB), 1)
@@ -191,6 +193,10 @@ endif
 ifeq ($(USE_LMDB), 1)
 	LIBRARIES += lmdb
 endif
+# This code is taken from https://github.com/sh1r0/caffe-android-lib
+ifeq ($(USE_HDF5), 1)
+	LIBRARIES += hdf5_hl hdf5
+endif
 ifeq ($(USE_OPENCV), 1)
 	LIBRARIES += opencv_core opencv_highgui opencv_imgproc
 
@@ -347,6 +353,10 @@ ifeq ($(ALLOW_LMDB_NOLOCK), 1)
 	COMMON_FLAGS += -DALLOW_LMDB_NOLOCK
 endif
 endif
+# This code is taken from https://github.com/sh1r0/caffe-android-lib
+ifeq ($(USE_HDF5), 1)
+	COMMON_FLAGS += -DUSE_HDF5
+endif
 
 # CPU-only configuration
 ifeq ($(CPU_ONLY), 1)
@@ -577,7 +587,7 @@ $(STATIC_NAME): $(OBJS) | $(LIB_BUILD_DIR)
 	@ echo AR -o $@
 	$(Q)ar rcs $@ $(OBJS)
 
-$(BUILD_DIR)/%.o: %.cpp | $(ALL_BUILD_DIRS)
+$(BUILD_DIR)/%.o: %.cpp $(PROTO_GEN_HEADER) | $(ALL_BUILD_DIRS)
 	@ echo CXX $<
 	$(Q)$(CXX) $< $(CXXFLAGS) -c -o $@ 2> $@.$(WARNS_EXT) \
 		|| (cat $@.$(WARNS_EXT); exit 1)
@@ -641,7 +651,7 @@ $(PROTO_BUILD_DIR)/%.pb.cc $(PROTO_BUILD_DIR)/%.pb.h : \
 $(PY_PROTO_BUILD_DIR)/%_pb2.py : $(PROTO_SRC_DIR)/%.proto \
 		$(PY_PROTO_INIT) | $(PY_PROTO_BUILD_DIR)
 	@ echo PROTOC \(python\) $<
-	$(Q)protoc --proto_path=$(PROTO_SRC_DIR) --python_out=$(PY_PROTO_BUILD_DIR) $<
+	$(Q)protoc --proto_path=src --python_out=python $<
 
 $(PY_PROTO_INIT): | $(PY_PROTO_BUILD_DIR)
 	touch $(PY_PROTO_INIT)
@@ -694,6 +704,6 @@ $(DISTRIBUTE_DIR): all py | $(DISTRIBUTE_SUBDIRS)
 	install -m 644 $(DYNAMIC_NAME) $(DISTRIBUTE_DIR)/lib
 	cd $(DISTRIBUTE_DIR)/lib; rm -f $(DYNAMIC_NAME_SHORT);   ln -s $(DYNAMIC_VERSIONED_NAME_SHORT) $(DYNAMIC_NAME_SHORT)
 	# add python - it's not the standard way, indeed...
-	cp -r python $(DISTRIBUTE_DIR)/python
+	cp -r python $(DISTRIBUTE_DIR)/
 
 -include $(DEPS)
diff --git a/Makefile.config.example b/Makefile.config.example
index d552b38a97c..24ca632783a 100644
--- a/Makefile.config.example
+++ b/Makefile.config.example
@@ -11,6 +11,8 @@
 # USE_OPENCV := 0
 # USE_LEVELDB := 0
 # USE_LMDB := 0
+# This code is taken from https://github.com/sh1r0/caffe-android-lib
+# USE_HDF5 := 0
 
 # uncomment to allow MDB_NOLOCK when reading LMDB files (only if necessary)
 #	You should not set this flag if you will be reading LMDBs with any
@@ -33,6 +35,7 @@ CUDA_DIR := /usr/local/cuda
 # CUDA architecture setting: going with all of them.
 # For CUDA < 6.0, comment the *_50 through *_61 lines for compatibility.
 # For CUDA < 8.0, comment the *_60 and *_61 lines for compatibility.
+# For CUDA >= 9.0, comment the *_20 and *_21 lines for compatibility.
 CUDA_ARCH := -gencode arch=compute_20,code=sm_20 \
 		-gencode arch=compute_20,code=sm_21 \
 		-gencode arch=compute_30,code=sm_30 \
diff --git a/README.md b/README.md
index 44b9e62c157..3705c55a0a4 100644
--- a/README.md
+++ b/README.md
@@ -4,17 +4,25 @@
 [![License](https://img.shields.io/badge/license-BSD-blue.svg)](LICENSE)
 
 Caffe is a deep learning framework made with expression, speed, and modularity in mind.
-It is developed by the Berkeley Vision and Learning Center ([BVLC](http://bvlc.eecs.berkeley.edu)) and community contributors.
+It is developed by Berkeley AI Research ([BAIR](http://bair.berkeley.edu))/The Berkeley Vision and Learning Center (BVLC) and community contributors.
 
 Check out the [project site](http://caffe.berkeleyvision.org) for all the details like
 
 - [DIY Deep Learning for Vision with Caffe](https://docs.google.com/presentation/d/1UeKXVgRvvxg9OUdh_UiC5G71UMscNPlvArsWER41PsU/edit#slide=id.p)
 - [Tutorial Documentation](http://caffe.berkeleyvision.org/tutorial/)
-- [BVLC reference models](http://caffe.berkeleyvision.org/model_zoo.html) and the [community model zoo](https://github.com/BVLC/caffe/wiki/Model-Zoo)
+- [BAIR reference models](http://caffe.berkeleyvision.org/model_zoo.html) and the [community model zoo](https://github.com/BVLC/caffe/wiki/Model-Zoo)
 - [Installation instructions](http://caffe.berkeleyvision.org/installation.html)
 
 and step-by-step examples.
 
+## Custom distributions
+
+ - [Intel Caffe](https://github.com/BVLC/caffe/tree/intel) (Optimized for CPU and support for multi-node), in particular Intel® Xeon processors.
+- [OpenCL Caffe](https://github.com/BVLC/caffe/tree/opencl) e.g. for AMD or Intel devices.
+- [Windows Caffe](https://github.com/BVLC/caffe/tree/windows)
+
+## Community
+
 [![Join the chat at https://gitter.im/BVLC/caffe](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/BVLC/caffe?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
 
 Please join the [caffe-users group](https://groups.google.com/forum/#!forum/caffe-users) or [gitter chat](https://gitter.im/BVLC/caffe) to ask questions and talk about methods and models.
@@ -25,7 +33,7 @@ Happy brewing!
 ## License and Citation
 
 Caffe is released under the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE).
-The BVLC reference models are released for unrestricted use.
+The BAIR/BVLC reference models are released for unrestricted use.
 
 Please cite Caffe in your publications if it helps your research:
 
diff --git a/cmake/ConfigGen.cmake b/cmake/ConfigGen.cmake
index ad91f542104..69889c243b2 100644
--- a/cmake/ConfigGen.cmake
+++ b/cmake/ConfigGen.cmake
@@ -24,6 +24,18 @@ function(caffe_generate_export_configs)
     set(HAVE_CUDA FALSE)
   endif()
 
+  set(HDF5_IMPORTED OFF)
+  foreach(_lib ${HDF5_LIBRARIES} ${HDF5_HL_LIBRARIES})
+    if(TARGET ${_lib})
+      set(HDF5_IMPORTED ON)
+    endif()
+  endforeach()
+
+  # This code is taken from https://github.com/sh1r0/caffe-android-lib
+  if(USE_HDF5)
+    list(APPEND Caffe_DEFINITIONS -DUSE_HDF5)
+  endif()
+
   if(NOT HAVE_CUDNN)
     set(HAVE_CUDNN FALSE)
   endif()
@@ -33,7 +45,7 @@ function(caffe_generate_export_configs)
   configure_file("cmake/Templates/CaffeConfig.cmake.in" "${PROJECT_BINARY_DIR}/CaffeConfig.cmake" @ONLY)
 
   # Add targets to the build-tree export set
-  export(TARGETS caffe proto FILE "${PROJECT_BINARY_DIR}/CaffeTargets.cmake")
+  export(TARGETS caffe caffeproto FILE "${PROJECT_BINARY_DIR}/CaffeTargets.cmake")
   export(PACKAGE Caffe)
 
   # ---[ Configure install-tree CaffeConfig.cmake file ]---
diff --git a/cmake/Cuda.cmake b/cmake/Cuda.cmake
index b2b19e8b669..e03feabffcb 100644
--- a/cmake/Cuda.cmake
+++ b/cmake/Cuda.cmake
@@ -109,6 +109,12 @@ function(caffe_select_nvcc_arch_flags out_variable)
   set(__nvcc_flags "")
   set(__nvcc_archs_readable "")
 
+  string(COMPARE LESS "${CUDA_VERSION}" "9.0" iscudaolderthan90)
+  if(NOT iscudaolderthan90)
+    string(REPLACE "21(20)" "" __cuda_arch_bin "${__cuda_arch_bin}")
+    string(REPLACE "20" "" __cuda_arch_bin "${__cuda_arch_bin}")
+  endif()
+
   # Tell NVCC to add binaries for the specified GPUs
   foreach(__arch ${__cuda_arch_bin})
     if(__arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
@@ -232,7 +238,7 @@ endfunction()
 ################################################################################################
 
 find_package(CUDA 5.5 QUIET)
-find_cuda_helper_libs(curand)  # cmake 2.8.7 compartibility which doesn't search for curand
+find_cuda_helper_libs(curand)  # cmake 2.8.7 compatibility which doesn't search for curand
 
 if(NOT CUDA_FOUND)
   return()
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 02c81525bce..ca2e3ad9e5e 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -5,7 +5,7 @@ set(Caffe_DEFINITIONS "")
 set(Caffe_COMPILE_OPTIONS "")
 
 # ---[ Boost
-find_package(Boost 1.46 REQUIRED COMPONENTS system thread filesystem)
+find_package(Boost 1.54 REQUIRED COMPONENTS system thread filesystem)
 list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${Boost_INCLUDE_DIRS})
 list(APPEND Caffe_LINKER_LIBS PUBLIC ${Boost_LIBRARIES})
 
@@ -47,6 +47,14 @@ find_package(HDF5 COMPONENTS HL REQUIRED)
 list(APPEND Caffe_INCLUDE_DIRS PUBLIC ${HDF5_INCLUDE_DIRS})
 list(APPEND Caffe_LINKER_LIBS PUBLIC ${HDF5_LIBRARIES} ${HDF5_HL_LIBRARIES})
 
+# This code is taken from https://github.com/sh1r0/caffe-android-lib
+if(USE_HDF5)
+  find_package(HDF5 COMPONENTS HL REQUIRED)
+  include_directories(SYSTEM ${HDF5_INCLUDE_DIRS} ${HDF5_HL_INCLUDE_DIR})
+  list(APPEND Caffe_LINKER_LIBS ${HDF5_LIBRARIES} ${HDF5_HL_LIBRARIES})
+  add_definitions(-DUSE_HDF5)
+endif()
+
 # ---[ LMDB
 if(USE_LMDB)
   find_package(LMDB REQUIRED)
diff --git a/cmake/Misc.cmake b/cmake/Misc.cmake
index 9dd2609b36a..fcb246472f0 100644
--- a/cmake/Misc.cmake
+++ b/cmake/Misc.cmake
@@ -32,9 +32,10 @@ endif()
 set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE CACHE BOOLEAN "Use link paths for shared library rpath")
 set(CMAKE_MACOSX_RPATH TRUE)
 
-list(FIND CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES ${CMAKE_INSTALL_PREFIX}/lib __is_systtem_dir)
+list(FIND CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES
+     ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR} __is_systtem_dir)
 if(${__is_systtem_dir} STREQUAL -1)
-  set(CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib)
+  set(CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR})
 endif()
 
 # ---[ Funny target
diff --git a/cmake/Modules/FindAtlas.cmake b/cmake/Modules/FindAtlas.cmake
index 9c665a47bd5..7ffa6393bbc 100644
--- a/cmake/Modules/FindAtlas.cmake
+++ b/cmake/Modules/FindAtlas.cmake
@@ -28,7 +28,7 @@ find_path(Atlas_CLAPACK_INCLUDE_DIR NAMES clapack.h PATHS ${Atlas_INCLUDE_SEARCH
 
 find_library(Atlas_CBLAS_LIBRARY NAMES  ptcblas_r ptcblas cblas_r cblas       PATHS ${Atlas_LIB_SEARCH_PATHS})
 find_library(Atlas_BLAS_LIBRARY NAMES   atlas_r   atlas                       PATHS ${Atlas_LIB_SEARCH_PATHS})
-find_library(Atlas_LAPACK_LIBRARY NAMES lapack alapack_r alapack lapack_atlas PATHS ${Atlas_LIB_SEARCH_PATHS})
+find_library(Atlas_LAPACK_LIBRARY NAMES lapack alapack_r alapack lapack_atlas atllapack PATHS ${Atlas_LIB_SEARCH_PATHS})
 
 set(LOOKED_FOR
   Atlas_CBLAS_INCLUDE_DIR
@@ -47,6 +47,6 @@ if(ATLAS_FOUND)
   set(Atlas_LIBRARIES ${Atlas_LAPACK_LIBRARY} ${Atlas_CBLAS_LIBRARY} ${Atlas_BLAS_LIBRARY})
   mark_as_advanced(${LOOKED_FOR})
 
-  message(STATUS "Found Atlas (include: ${Atlas_CBLAS_INCLUDE_DIR}, library: ${Atlas_BLAS_LIBRARY})")
+  message(STATUS "Found Atlas (include: ${Atlas_CBLAS_INCLUDE_DIR} library: ${Atlas_BLAS_LIBRARY} lapack: ${Atlas_LAPACK_LIBRARY}")
 endif(ATLAS_FOUND)
 
diff --git a/cmake/Modules/FindMKL.cmake b/cmake/Modules/FindMKL.cmake
index 5ab93b2d6b6..ef0c3bf1c64 100644
--- a/cmake/Modules/FindMKL.cmake
+++ b/cmake/Modules/FindMKL.cmake
@@ -9,7 +9,7 @@
 # This module defines the following variables:
 #
 #   MKL_FOUND            : True mkl is found
-#   MKL_INCLUDE_DIR      : unclude directory
+#   MKL_INCLUDE_DIR      : include directory
 #   MKL_LIBRARIES        : the libraries to link against.
 
 
diff --git a/cmake/Modules/FindvecLib.cmake b/cmake/Modules/FindvecLib.cmake
index 8eaab59473c..4d44e613a00 100644
--- a/cmake/Modules/FindvecLib.cmake
+++ b/cmake/Modules/FindvecLib.cmake
@@ -12,11 +12,12 @@ endif()
 
 set(__veclib_include_suffix "Frameworks/vecLib.framework/Versions/Current/Headers")
 
+exec_program(xcode-select ARGS -print-path OUTPUT_VARIABLE CMAKE_XCODE_DEVELOPER_DIR)
 find_path(vecLib_INCLUDE_DIR vecLib.h
           DOC "vecLib include directory"
           PATHS /System/Library/Frameworks/Accelerate.framework/Versions/Current/${__veclib_include_suffix}
                 /System/Library/${__veclib_include_suffix}
-                /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/
+                ${CMAKE_XCODE_DEVELOPER_DIR}/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/
           NO_DEFAULT_PATH)
 
 include(FindPackageHandleStandardArgs)
diff --git a/cmake/ProtoBuf.cmake b/cmake/ProtoBuf.cmake
index 8005b448707..72ea3230c50 100644
--- a/cmake/ProtoBuf.cmake
+++ b/cmake/ProtoBuf.cmake
@@ -78,7 +78,7 @@ function(caffe_protobuf_generate_cpp_py output_dir srcs_var hdrs_var python_var)
              "${output_dir}/${fil_we}_pb2.py"
       COMMAND ${CMAKE_COMMAND} -E make_directory "${output_dir}"
       COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --cpp_out    ${output_dir} ${_protoc_include} ${abs_fil}
-      COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${output_dir} ${_protoc_include} ${abs_fil}
+      COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${PROJECT_BINARY_DIR}/include --proto_path ${PROJECT_SOURCE_DIR}/src ${_protoc_include} ${abs_fil}
       DEPENDS ${abs_fil}
       COMMENT "Running C++/Python protocol buffer compiler on ${fil}" VERBATIM )
   endforeach()
diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake
index ed8c25268db..40b8c2f2966 100644
--- a/cmake/Summary.cmake
+++ b/cmake/Summary.cmake
@@ -119,6 +119,8 @@ function(caffe_print_configuration_summary)
   caffe_status("  USE_LMDB          :   ${USE_LMDB}")
   caffe_status("  USE_NCCL          :   ${USE_NCCL}")
   caffe_status("  ALLOW_LMDB_NOLOCK :   ${ALLOW_LMDB_NOLOCK}")
+  # This code is taken from https://github.com/sh1r0/caffe-android-lib
+  caffe_status("  USE_HDF5          :   ${USE_HDF5}")
   caffe_status("")
   caffe_status("Dependencies:")
   caffe_status("  BLAS              : " APPLE THEN "Yes (vecLib)" ELSE "Yes (${BLAS})")
diff --git a/cmake/Templates/caffe_config.h.in b/cmake/Templates/caffe_config.h.in
index 45465b98305..2080c63df36 100644
--- a/cmake/Templates/caffe_config.h.in
+++ b/cmake/Templates/caffe_config.h.in
@@ -4,16 +4,9 @@
 /* Binaries directory */
 #define BINARY_FOLDER "${PROJECT_BINARY_DIR}"
 
+/* This is an absolute path so that we can run test from any build
+ * directory */
+#define ABS_TEST_DATA_DIR "${PROJECT_SOURCE_DIR}/src/caffe/test/test_data/"
+
 /* Test device */
 #define CUDA_TEST_DEVICE ${CUDA_TEST_DEVICE}
-
-/* Temporary (TODO: remove) */
-#if 1
-  #define CMAKE_SOURCE_DIR SOURCE_FOLDER "/src/"
-  #define EXAMPLES_SOURCE_DIR BINARY_FOLDER "/examples/"
-  #define CMAKE_EXT ".gen.cmake"
-#else
-  #define CMAKE_SOURCE_DIR "src/"
-  #define EXAMPLES_SOURCE_DIR "examples/"
-  #define CMAKE_EXT ""
-#endif
diff --git a/cmake/Uninstall.cmake.in b/cmake/Uninstall.cmake.in
new file mode 100644
index 00000000000..bb8e2964e46
--- /dev/null
+++ b/cmake/Uninstall.cmake.in
@@ -0,0 +1,26 @@
+if(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
+  message(FATAL_ERROR "Cannot find install manifest: @CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
+endif(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
+
+if (NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set (CMAKE_INSTALL_PREFIX "@CMAKE_INSTALL_PREFIX@")
+endif ()
+ message(${CMAKE_INSTALL_PREFIX})
+
+file(READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files)
+string(REGEX REPLACE "\n" ";" files "${files}")
+foreach(file ${files})
+  message(STATUS "Uninstalling $ENV{DESTDIR}${file}")
+  if(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}")
+    exec_program(
+      "@CMAKE_COMMAND@" ARGS "-E remove \"$ENV{DESTDIR}${file}\""
+      OUTPUT_VARIABLE rm_out
+      RETURN_VALUE rm_retval
+      )
+    if(NOT "${rm_retval}" STREQUAL 0)
+      message(FATAL_ERROR "Problem when removing $ENV{DESTDIR}${file}")
+    endif(NOT "${rm_retval}" STREQUAL 0)
+  else(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}")
+    message(STATUS "File $ENV{DESTDIR}${file} does not exist.")
+  endif(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}")
+endforeach(file)
\ No newline at end of file
diff --git a/docker/cpu/Dockerfile b/docker/cpu/Dockerfile
index af6c03c6589..67e2e61bd57 100644
--- a/docker/cpu/Dockerfile
+++ b/docker/cpu/Dockerfile
@@ -28,7 +28,8 @@ ENV CAFFE_ROOT=/opt/caffe
 WORKDIR $CAFFE_ROOT
 
 # FIXME: use ARG instead of ENV once DockerHub supports this
-ENV CLONE_TAG=rc4
+# https://github.com/docker/hub-feedback/issues/460
+ENV CLONE_TAG=1.0
 
 RUN git clone -b ${CLONE_TAG} --depth 1 https://github.com/BVLC/caffe.git . && \
     pip install --upgrade pip && \
diff --git a/docker/gpu/Dockerfile b/docker/gpu/Dockerfile
index 0785b10f1e7..dcdbdf326fb 100644
--- a/docker/gpu/Dockerfile
+++ b/docker/gpu/Dockerfile
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04
+FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04
 LABEL maintainer caffe-maint@googlegroups.com
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -28,7 +28,8 @@ ENV CAFFE_ROOT=/opt/caffe
 WORKDIR $CAFFE_ROOT
 
 # FIXME: use ARG instead of ENV once DockerHub supports this
-ENV CLONE_TAG=rc4
+# https://github.com/docker/hub-feedback/issues/460
+ENV CLONE_TAG=1.0
 
 RUN git clone -b ${CLONE_TAG} --depth 1 https://github.com/BVLC/caffe.git . && \
     pip install --upgrade pip && \
diff --git a/docs/_layouts/default.html b/docs/_layouts/default.html
index b8efe60bc3b..3799e95afde 100644
--- a/docs/_layouts/default.html
+++ b/docs/_layouts/default.html
@@ -36,7 +36,7 @@
       <header>
         <h1 class="header"><a href="/">Caffe</a></h1>
         <p class="header">
-          Deep learning framework by the <a class="header name" href="http://bvlc.eecs.berkeley.edu/">BVLC</a>
+          Deep learning framework by <a class="header name" href="http://bair.berkeley.edu/">BAIR</a>
         </p>
         <p class="header">
           Created by
diff --git a/docs/development.md b/docs/development.md
index 107c2c3b281..36cd399512e 100644
--- a/docs/development.md
+++ b/docs/development.md
@@ -4,7 +4,7 @@ title: Developing and Contributing
 # Development and Contributing
 
 Caffe is developed with active participation of the community.<br>
-The [BVLC](http://bvlc.eecs.berkeley.edu/) brewers welcome all contributions!
+The [BAIR](http://bair.berkeley.edu/)/BVLC brewers welcome all contributions!
 
 The exact details of contributions are recorded by versioning and cited in our [acknowledgements](http://caffe.berkeleyvision.org/#acknowledgements).
 This method is impartial and always up-to-date.
@@ -37,7 +37,7 @@ We absolutely appreciate any contribution to this effort!
 
 The `master` branch receives all new development including community contributions.
 We try to keep it in a reliable state, but it is the bleeding edge, and things do get broken every now and then.
-BVLC maintainers will periodically make releases by marking stable checkpoints as tags and maintenance branches. [Past releases](https://github.com/BVLC/caffe/releases) are catalogued online.
+BAIR maintainers will periodically make releases by marking stable checkpoints as tags and maintenance branches. [Past releases](https://github.com/BVLC/caffe/releases) are catalogued online.
 
 #### Issues & Pull Request Protocol
 
@@ -116,5 +116,5 @@ To get a list of all options `googletest` provides, simply pass the `--help` fla
 
 - **Run `make lint` to check C++ code.**
 - Wrap lines at 80 chars.
-- Follow [Google C++ style](http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml) and [Google python style](http://google-styleguide.googlecode.com/svn/trunk/pyguide.html) + [PEP 8](http://legacy.python.org/dev/peps/pep-0008/).
+- Follow [Google C++ style](https://google.github.io/styleguide/cppguide.html) and [Google python style](https://google.github.io/styleguide/pyguide.html) + [PEP 8](http://legacy.python.org/dev/peps/pep-0008/).
 - Remember that “a foolish consistency is the hobgoblin of little minds,” so use your best judgement to write the clearest code for your particular case.
diff --git a/docs/index.md b/docs/index.md
index 932b3b58d1d..b633f7cfddc 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -5,7 +5,7 @@ title: Deep Learning Framework
 # Caffe
 
 Caffe is a deep learning framework made with expression, speed, and modularity in mind.
-It is developed by the Berkeley Vision and Learning Center ([BVLC](http://bvlc.eecs.berkeley.edu)) and by community contributors.
+It is developed by Berkeley AI Research ([BAIR](http://bair.berkeley.edu)) and by community contributors.
 [Yangqing Jia](http://daggerfs.com) created the project during his PhD at UC Berkeley.
 Caffe is released under the [BSD 2-Clause license](https://github.com/BVLC/caffe/blob/master/LICENSE).
 
@@ -23,21 +23,20 @@ Thanks to these contributors the framework tracks the state-of-the-art in both c
 
 **Speed** makes Caffe perfect for research experiments and industry deployment.
 Caffe can process **over 60M images per day** with a single NVIDIA K40 GPU\*.
-That's 1 ms/image for inference and 4 ms/image for learning.
-We believe that Caffe is the fastest convnet implementation available.
+That's 1 ms/image for inference and 4 ms/image for learning and more recent library versions and hardware are faster still.
+We believe that Caffe is among the fastest convnet implementations available.
 
 **Community**: Caffe already powers academic research projects, startup prototypes, and even large-scale industrial applications in vision, speech, and multimedia.
 Join our community of brewers on the [caffe-users group](https://groups.google.com/forum/#!forum/caffe-users) and [Github](https://github.com/BVLC/caffe/).
 
 <p class="footnote" markdown="1">
-\* With the ILSVRC2012-winning [SuperVision](http://www.image-net.org/challenges/LSVRC/2012/supervision.pdf) model and caching IO.
-Consult performance [details](/performance_hardware.html).
+\* With the ILSVRC2012-winning [SuperVision](http://www.image-net.org/challenges/LSVRC/2012/supervision.pdf) model and prefetching IO.
 </p>
 
 ## Documentation
 
-- [DIY Deep Learning for Vision with Caffe](https://docs.google.com/presentation/d/1UeKXVgRvvxg9OUdh_UiC5G71UMscNPlvArsWER41PsU/edit#slide=id.p)<br>
-Tutorial presentation.
+- [DIY Deep Learning for Vision with Caffe](https://docs.google.com/presentation/d/1UeKXVgRvvxg9OUdh_UiC5G71UMscNPlvArsWER41PsU/edit#slide=id.p) and [Caffe in a Day](https://docs.google.com/presentation/d/1HxGdeq8MPktHaPb-rlmYYQ723iWzq9ur6Gjo71YiG0Y/edit#slide=id.gc2fcdcce7_216_0)<br>
+Tutorial presentation of the framework and a full-day crash course.
 - [Tutorial Documentation](/tutorial)<br>
 Practical guide and framework reference.
 - [arXiv / ACM MM '14 paper](http://arxiv.org/abs/1408.5093)<br>
@@ -45,18 +44,13 @@ A 4-page report for the ACM Multimedia Open Source competition (arXiv:1408.5093v
 - [Installation instructions](/installation.html)<br>
 Tested on Ubuntu, Red Hat, OS X.
 * [Model Zoo](/model_zoo.html)<br>
-BVLC suggests a standard distribution format for Caffe models, and provides trained models.
+BAIR suggests a standard distribution format for Caffe models, and provides trained models.
 * [Developing & Contributing](/development.html)<br>
 Guidelines for development and contributing to Caffe.
 * [API Documentation](/doxygen/annotated.html)<br>
 Developer documentation automagically generated from code comments.
-
-### Examples
-
-{% assign examples = site.pages | where:'category','example' | sort: 'priority' %}
-{% for page in examples %}
-- <div><a href="{{page.url}}">{{page.title}}</a><br>{{page.description}}</div>
-{% endfor %}
+* [Benchmarking](https://docs.google.com/spreadsheets/d/1Yp4rqHpT7mKxOPbpzYeUfEFLnELDAgxSSBQKp5uKDGQ/edit#gid=0)<br>
+Comparison of inference and learning for different networks and GPUs.
 
 ### Notebook Examples
 
@@ -65,6 +59,13 @@ Developer documentation automagically generated from code comments.
 - <div><a href="http://nbviewer.ipython.org/github/BVLC/caffe/blob/master/{{page.original_path}}">{{page.title}}</a><br>{{page.description}}</div>
 {% endfor %}
 
+### Command Line Examples
+
+{% assign examples = site.pages | where:'category','example' | sort: 'priority' %}
+{% for page in examples %}
+- <div><a href="{{page.url}}">{{page.title}}</a><br>{{page.description}}</div>
+{% endfor %}
+
 ## Citing Caffe
 
 Please cite Caffe in your publications if it helps your research:
@@ -76,8 +77,7 @@ Please cite Caffe in your publications if it helps your research:
       Year = {2014}
     }
 
-If you do publish a paper where Caffe helped your research, we encourage you to update the [publications wiki](https://github.com/BVLC/caffe/wiki/Publications).
-Citations are also tracked automatically by [Google Scholar](http://scholar.google.com/scholar?oi=bibs&hl=en&cites=17333247995453974016).
+If you do publish a paper where Caffe helped your research, we encourage you to cite the framework for tracking by [Google Scholar](https://scholar.google.com/citations?view_op=view_citation&hl=en&citation_for_view=-ltRSM0AAAAJ:u5HHmVD_uO8C).
 
 ## Contacting Us
 
@@ -85,17 +85,12 @@ Join the [caffe-users group](https://groups.google.com/forum/#!forum/caffe-users
 
 Framework development discussions and thorough bug reports are collected on [Issues](https://github.com/BVLC/caffe/issues).
 
-Contact [caffe-dev](mailto:caffe-dev@googlegroups.com) if you have a confidential proposal for the framework *and the ability to act on it*.
-Requests for features, explanations, or personal help will be ignored; post to [caffe-users](https://groups.google.com/forum/#!forum/caffe-users) instead.
-
-The core Caffe developers offer [consulting services](mailto:caffe-coldpress@googlegroups.com) for appropriate projects.
-
 ## Acknowledgements
 
-The BVLC Caffe developers would like to thank NVIDIA for GPU donation, A9 and Amazon Web Services for a research grant in support of Caffe development and reproducible research in deep learning, and BVLC PI [Trevor Darrell](http://www.eecs.berkeley.edu/~trevor/) for guidance.
+The BAIR Caffe developers would like to thank NVIDIA for GPU donation, A9 and Amazon Web Services for a research grant in support of Caffe development and reproducible research in deep learning, and BAIR PI [Trevor Darrell](http://www.eecs.berkeley.edu/~trevor/) for guidance.
 
-The BVLC members who have contributed to Caffe are (alphabetical by first name):
-[Eric Tzeng](https://github.com/erictzeng), [Evan Shelhamer](http://imaginarynumber.net/), [Jeff Donahue](http://jeffdonahue.com/), [Jon Long](https://github.com/longjon), [Ross Girshick](http://www.cs.berkeley.edu/~rbg/), [Sergey Karayev](http://sergeykarayev.com/), [Sergio Guadarrama](http://www.eecs.berkeley.edu/~sguada/), and [Yangqing Jia](http://daggerfs.com/).
+The BAIR members who have contributed to Caffe are (alphabetical by first name):
+[Carl Doersch](http://www.carldoersch.com/), [Eric Tzeng](https://github.com/erictzeng), [Evan Shelhamer](http://imaginarynumber.net/), [Jeff Donahue](http://jeffdonahue.com/), [Jon Long](https://github.com/longjon), [Philipp Krähenbühl](http://www.philkr.net/), [Ronghang Hu](http://ronghanghu.com/), [Ross Girshick](http://www.cs.berkeley.edu/~rbg/), [Sergey Karayev](http://sergeykarayev.com/), [Sergio Guadarrama](http://www.eecs.berkeley.edu/~sguada/), [Takuya Narihira](https://github.com/tnarihi), and [Yangqing Jia](http://daggerfs.com/).
 
 The open-source community plays an important and growing role in Caffe's development.
 Check out the Github [project pulse](https://github.com/BVLC/caffe/pulse) for recent activity and the [contributors](https://github.com/BVLC/caffe/graphs/contributors) for the full list.
@@ -103,4 +98,4 @@ Check out the Github [project pulse](https://github.com/BVLC/caffe/pulse) for re
 We sincerely appreciate your interest and contributions!
 If you'd like to contribute, please read the [developing & contributing](development.html) guide.
 
-Yangqing would like to give a personal thanks to the NVIDIA Academic program for providing GPUs, [Oriol Vinyals](http://www1.icsi.berkeley.edu/~vinyals/) for discussions along the journey, and BVLC PI [Trevor Darrell](http://www.eecs.berkeley.edu/~trevor/) for advice.
+Yangqing would like to give a personal thanks to the NVIDIA Academic program for providing GPUs, [Oriol Vinyals](http://www1.icsi.berkeley.edu/~vinyals/) for discussions along the journey, and BAIR PI [Trevor Darrell](http://www.eecs.berkeley.edu/~trevor/) for advice.
diff --git a/docs/install_apt.md b/docs/install_apt.md
index bc1566b0be9..e361a92da3c 100644
--- a/docs/install_apt.md
+++ b/docs/install_apt.md
@@ -4,17 +4,50 @@ title: "Installation: Ubuntu"
 
 # Ubuntu Installation
 
+### For Ubuntu (>= 17.04)
+
+**Installing pre-compiled Caffe**
+
+Everything including caffe itself is packaged in 17.04 and higher versions.
+To install pre-compiled Caffe package, just do it by
+
+    sudo apt install caffe-cpu
+
+for CPU-only version, or
+
+    sudo apt install caffe-cuda
+
+for CUDA version. Note, the cuda version may break if your NVIDIA driver
+and CUDA toolkit are not installed by APT.
+
+[Package status of CPU-only version](https://launchpad.net/ubuntu/+source/caffe)
+
+[Package status of CUDA version](https://launchpad.net/ubuntu/+source/caffe-contrib)
+
+**Installing Caffe from source**
+
+We may install the dependencies by merely one line
+
+    sudo apt build-dep caffe-cpu        # dependencies for CPU-only version
+    sudo apt build-dep caffe-cuda       # dependencies for CUDA version
+
+It requires a `deb-src` line in your `sources.list`.
+Continue with [compilation](installation.html#compilation).
+
+### For Ubuntu (\< 17.04)
+
 **General dependencies**
 
     sudo apt-get install libprotobuf-dev libleveldb-dev libsnappy-dev libopencv-dev libhdf5-serial-dev protobuf-compiler
     sudo apt-get install --no-install-recommends libboost-all-dev
+    sudo apt-get install libgflags-dev libgoogle-glog-dev liblmdb-dev
 
 **CUDA**: Install by `apt-get` or the NVIDIA `.run` package.
 The NVIDIA package tends to follow more recent library and driver versions, but the installation is more manual.
 If installing from packages, install the library and latest driver separately; the driver bundled with the library is usually out-of-date.
 This can be skipped for CPU-only installation.
 
-**BLAS**: install ATLAS by `sudo apt-get install libatlas-base-dev` or install OpenBLAS or MKL for better CPU performance.
+**BLAS**: install ATLAS by `sudo apt-get install libatlas-base-dev` or install OpenBLAS by `sudo apt-get install libopenblas-dev` or MKL for better CPU performance.
 
 **Python** (optional): if you use the default Python you will need to `sudo apt-get install` the `python-dev` package to have the Python headers for building the pycaffe interface.
 
@@ -22,12 +55,6 @@ This can be skipped for CPU-only installation.
 
 CUDA 8 is required on Ubuntu 16.04.
 
-**Remaining dependencies, 14.04**
-
-Everything is packaged in 14.04.
-
-    sudo apt-get install libgflags-dev libgoogle-glog-dev liblmdb-dev
-
 **Remaining dependencies, 12.04**
 
 These dependencies need manual installation in 12.04.
diff --git a/docs/install_apt_debian.md b/docs/install_apt_debian.md
index 65fe70924e1..0a6a3b962e5 100644
--- a/docs/install_apt_debian.md
+++ b/docs/install_apt_debian.md
@@ -8,24 +8,28 @@ Caffe packages are available for several Debian versions, as shown in the
 following chart:
 
 ```
-Your Distro     |  CPU_ONLY  |  CUDA  |     Alias
+Your Distro     |  CPU_ONLY  |  CUDA  | Codename
 ----------------+------------+--------+-------------------
-Debian/stable   |     ✘      |   ✘    | Debian Jessie
-Debian/testing  |     ✔      |   ✔    | Debian Stretch/Sid
-Debian/unstable |     ✔      |   ✔    | Debian Sid
+Debian/oldstable|     ✘      |   ✘    | Jessie (8.0)
+Debian/stable   |     ✔      |   ✔    | Stretch (9.0)
+Debian/testing  |     ✔      |   ✔    | Buster
+Debian/unstable |     ✔      |   ✔    | Buster
 ```
 
 * `✘ ` You should take a look at [Ubuntu installation instruction](install_apt.html).
 
 * `✔ ` You can install caffe with a single command line following this guide.
 
-Last update: 2017-02-01
+* [Package status of CPU-only version](https://tracker.debian.org/pkg/caffe)
+
+* [Package status of CUDA version](https://tracker.debian.org/pkg/caffe-contrib)
+
+Last update: 2017-07-08
 
 ## Binary installation with APT
 
-Apart from the installation methods based on source, Debian/unstable
-and Debian/testing users can install pre-compiled Caffe packages from
-the official archive.
+Apart from the installation methods based on source, Debian users can install
+pre-compiled Caffe packages from the official archive with APT.
 
 Make sure that your `/etc/apt/sources.list` contains `contrib` and `non-free`
 sections if you want to install the CUDA version, for instance:
@@ -44,7 +48,8 @@ $ caffe                                              # command line interface wo
 $ python3 -c 'import caffe; print(caffe.__path__)'   # python3 interface working
 ```
 
-These Caffe packages should work for you out of box.
+These Caffe packages should work for you out of box. However, the CUDA version
+may break if your NVIDIA driver and CUDA toolkit are not installed with APT.
 
 #### Customizing caffe packages
 
@@ -96,18 +101,22 @@ Note, this requires a `deb-src` entry in your `/etc/apt/sources.list`.
 Some users may find their favorate compiler doesn't work with CUDA.
 
 ```
-CXX compiler |  CUDA 7.5  |  CUDA 8.0  |
--------------+------------+------------+-
-GCC-7        |     ?      |     ?      |
-GCC-6        |     ✘      |     ✘      |
-GCC-5        |     ✔ [1]  |     ✔      |
-CLANG-4.0    |     ?      |     ?      |
-CLANG-3.9    |     ✘      |     ✘      |
-CLANG-3.8    |     ?      |     ✔      |
+CXX compiler |  CUDA 7.5  |  CUDA 8.0  |  CUDA 9.0  |
+-------------+------------+------------+------------+
+GCC-8        |     ?      |     ?      |     ?      |
+GCC-7        |     ?      |     ?      |     ?      |
+GCC-6        |     ✘      |     ✘      |     ✔      |
+GCC-5        |     ✔ [1]  |     ✔      |     ✔      |
+-------------+------------+------------+------------+
+CLANG-4.0    |     ?      |     ?      |     ?      |
+CLANG-3.9    |     ✘      |     ✘      |     ✔      |
+CLANG-3.8    |     ?      |     ✔      |     ✔      |
 ```
 
 `[1]` CUDA 7.5 's `host_config.h` must be patched before working with GCC-5.
 
+`[2]` CUDA 9.0: https://devblogs.nvidia.com/parallelforall/cuda-9-features-revealed/
+
 BTW, please forget the GCC-4.X series, since its `libstdc++` ABI is not compatible with GCC-5's.
 You may encounter failure linking GCC-4.X object files against GCC-5 libraries.
 (See https://wiki.debian.org/GCC5 )
@@ -152,10 +161,3 @@ and hack the packaging scripts, then build your customized package.
 $ sudo apt install caffe-doc
 $ dpkg -L caffe-doc
 ```
-
-* Where can I find the Debian package status?
-
-```
-https://tracker.debian.org/pkg/caffe          (for the CPU_ONLY version)
-https://tracker.debian.org/pkg/caffe-contrib  (for the CUDA version)
-```
diff --git a/docs/installation.md b/docs/installation.md
index 2e558027678..c482285320a 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -17,7 +17,7 @@ The official Makefile and `Makefile.config` build are complemented by a [communi
 - [RHEL / CentOS / Fedora installation](install_yum.html)
 - [Windows](https://github.com/BVLC/caffe/tree/windows) *see the Windows branch led by Guillaume Dumont*
 - [OpenCL](https://github.com/BVLC/caffe/tree/opencl) *see the OpenCL branch led by Fabian Tschopp*
-- [AWS AMI](https://github.com/bitfusionio/amis/tree/master/awsmrkt-bfboost-ubuntu14-cuda75-caffe) *pre-configured for AWS*
+- [AWS AMI](https://aws.amazon.com/marketplace/pp/B01M0AXXQB) *official deep learning amazon machine image from AWS*
 
 **Overview**:
 
@@ -42,14 +42,14 @@ Optional dependencies:
 
 * [OpenCV](http://opencv.org/) >= 2.4 including 3.0
 * IO libraries: `lmdb`, `leveldb` (note: leveldb requires `snappy`)
-* cuDNN for GPU acceleration (v5)
+* cuDNN for GPU acceleration (v7)
 
 Pycaffe and Matcaffe interfaces have their own natural needs.
 
 * For Python Caffe:  `Python 2.7` or `Python 3.3+`, `numpy (>= 1.7)`, boost-provided `boost.python`
 * For MATLAB Caffe: MATLAB with the `mex` compiler.
 
-**cuDNN Caffe**: for fastest operation Caffe is accelerated by drop-in integration of [NVIDIA cuDNN](https://developer.nvidia.com/cudnn). To speed up your Caffe models, install cuDNN then uncomment the `USE_CUDNN := 1` flag in `Makefile.config` when installing Caffe. Acceleration is automatic. The current version is cuDNN v5; older versions are supported in older Caffe.
+**cuDNN Caffe**: for fastest operation Caffe is accelerated by drop-in integration of [NVIDIA cuDNN](https://developer.nvidia.com/cudnn). To speed up your Caffe models, install cuDNN then uncomment the `USE_CUDNN := 1` flag in `Makefile.config` when installing Caffe. Acceleration is automatic. The current version is cuDNN v7; older versions are supported in older Caffe.
 
 **CPU-only Caffe**: for cold-brewed CPU-only Caffe uncomment the `CPU_ONLY := 1` flag in `Makefile.config` to configure and build Caffe without CUDA. This is helpful for cloud or cluster deployment.
 
@@ -80,7 +80,7 @@ The main requirements are `numpy` and `boost.python` (provided by boost). `panda
 
 You can install the dependencies with
 
-    for req in $(cat requirements.txt); do pip install $req; done
+    pip install -r requirements.txt
 
 but we suggest first installing the [Anaconda](https://store.continuum.io/cshop/anaconda/) Python distribution, which provides most of the necessary packages, as well as the `hdf5` library dependency.
 
diff --git a/docs/model_zoo.md b/docs/model_zoo.md
index 06dc0a49ec7..3f77e82572c 100644
--- a/docs/model_zoo.md
+++ b/docs/model_zoo.md
@@ -3,7 +3,7 @@ title: Model Zoo
 ---
 # Caffe Model Zoo
 
-Lots of researchers and engineers have made Caffe models for different tasks with all kinds of architectures and data.
+Lots of researchers and engineers have made Caffe models for different tasks with all kinds of architectures and data: check out the [model zoo](https://github.com/BVLC/caffe/wiki/Model-Zoo)!
 These models are learned and applied for problems ranging from simple regression, to large-scale visual classification, to Siamese networks for image similarity, to speech and robotics applications.
 
 To help share these models, we introduce the model zoo framework:
@@ -14,17 +14,17 @@ To help share these models, we introduce the model zoo framework:
 
 ## Where to get trained models
 
-First of all, we bundle BVLC-trained models for unrestricted, out of the box use.
+First of all, we bundle BAIR-trained models for unrestricted, out of the box use.
 <br>
-See the [BVLC model license](#bvlc-model-license) for details.
+See the [BAIR model license](#bair-model-license) for details.
 Each one of these can be downloaded by running `scripts/download_model_binary.py <dirname>` where `<dirname>` is specified below:
 
-- **BVLC Reference CaffeNet** in `models/bvlc_reference_caffenet`: AlexNet trained on ILSVRC 2012, with a minor variation from the version as described in [ImageNet classification with deep convolutional neural networks](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks) by Krizhevsky et al. in NIPS 2012. (Trained by Jeff Donahue @jeffdonahue)
-- **BVLC AlexNet** in `models/bvlc_alexnet`: AlexNet trained on ILSVRC 2012, almost exactly as described in [ImageNet classification with deep convolutional neural networks](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks) by Krizhevsky et al. in NIPS 2012. (Trained by Evan Shelhamer @shelhamer)
-- **BVLC Reference R-CNN ILSVRC-2013** in `models/bvlc_reference_rcnn_ilsvrc13`: pure Caffe implementation of [R-CNN](https://github.com/rbgirshick/rcnn) as described by Girshick et al. in CVPR 2014. (Trained by Ross Girshick @rbgirshick)
-- **BVLC GoogLeNet** in `models/bvlc_googlenet`: GoogLeNet trained on ILSVRC 2012, almost exactly as described in [Going Deeper with Convolutions](http://arxiv.org/abs/1409.4842) by Szegedy et al. in ILSVRC 2014. (Trained by Sergio Guadarrama @sguada)
+- **BAIR Reference CaffeNet** in `models/bvlc_reference_caffenet`: AlexNet trained on ILSVRC 2012, with a minor variation from the version as described in [ImageNet classification with deep convolutional neural networks](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks) by Krizhevsky et al. in NIPS 2012. (Trained by Jeff Donahue @jeffdonahue)
+- **BAIR AlexNet** in `models/bvlc_alexnet`: AlexNet trained on ILSVRC 2012, almost exactly as described in [ImageNet classification with deep convolutional neural networks](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks) by Krizhevsky et al. in NIPS 2012. (Trained by Evan Shelhamer @shelhamer)
+- **BAIR Reference R-CNN ILSVRC-2013** in `models/bvlc_reference_rcnn_ilsvrc13`: pure Caffe implementation of [R-CNN](https://github.com/rbgirshick/rcnn) as described by Girshick et al. in CVPR 2014. (Trained by Ross Girshick @rbgirshick)
+- **BAIR GoogLeNet** in `models/bvlc_googlenet`: GoogLeNet trained on ILSVRC 2012, almost exactly as described in [Going Deeper with Convolutions](http://arxiv.org/abs/1409.4842) by Szegedy et al. in ILSVRC 2014. (Trained by Sergio Guadarrama @sguada)
 
-**Community models** made by Caffe users are posted to a publicly editable [wiki page](https://github.com/BVLC/caffe/wiki/Model-Zoo).
+**Community models** made by Caffe users are posted to a publicly editable [model zoo wiki page](https://github.com/BVLC/caffe/wiki/Model-Zoo).
 These models are subject to conditions of their respective authors such as citation and license.
 Thank you for sharing your models!
 
@@ -42,6 +42,8 @@ A caffe model is distributed as a directory containing:
     - License information.
 - [optional] Other helpful scripts.
 
+This simple format can be handled through bundled scripts or manually if need be.
+
 ### Hosting model info
 
 Github Gist is a good format for model info distribution because it can contain multiple files, is versionable, and has in-browser syntax highlighting and markdown rendering.
@@ -55,14 +57,14 @@ Downloading model info is done just as easily with `scripts/download_model_from_
 ### Hosting trained models
 
 It is up to the user where to host the `.caffemodel` file.
-We host our BVLC-provided models on our own server.
+We host our BAIR-provided models on our own server.
 Dropbox also works fine (tip: make sure that `?dl=1` is appended to the end of the URL).
 
 `scripts/download_model_binary.py <dirname>` downloads the `.caffemodel` from the URL specified in the `<dirname>/readme.md` frontmatter and confirms SHA1.
 
-## BVLC model license
+## BAIR model license
 
-The Caffe models bundled by the BVLC are released for unrestricted use.
+The Caffe models bundled by the BAIR are released for unrestricted use.
 
 These models are trained on data from the [ImageNet project](http://www.image-net.org/) and training data includes internet photos that may be subject to copyright.
 
diff --git a/docs/multigpu.md b/docs/multigpu.md
index d91acef980d..e04ebb0b7c8 100644
--- a/docs/multigpu.md
+++ b/docs/multigpu.md
@@ -13,7 +13,7 @@ The GPUs to be used for training can be set with the "-gpu" flag on the command
 # Hardware Configuration Assumptions
 
 The current implementation uses a tree reduction strategy.  e.g. if there are 4 GPUs in the system, 0:1, 2:3 will exchange gradients, then 0:2 (top of the tree) will exchange gradients, 0 will calculate
-updated model, 0\-\>2, and then 0\-\>1, 2\-\>3. 
+updated model, 0\-\>2, and then 0\-\>1, 2\-\>3.
 
 For best performance, P2P DMA access between devices is needed. Without P2P access, for example crossing PCIe root complex, data is copied through host and effective exchange bandwidth is greatly reduced.
 
@@ -23,4 +23,4 @@ Current implementation has a "soft" assumption that the devices being used are h
 
 # Scaling Performance
 
-Performance is **heavily** dependent on the PCIe topology of the system, the configuration of the neural network you are training, and the speed of each of the layers.  Systems like the DIGITS DevBox have an optimized PCIe topology (X99-E WS chipset).  In general, scaling on 2 GPUs tends to be ~1.8X on average for networks like AlexNet, CaffeNet, VGG, GoogleNet.  4 GPUs begins to have falloff in scaling.  Generally with "weak scaling" where the batchsize increases with the number of GPUs you will see 3.5x scaling or so.  With "strong scaling", the system can become communication bound, especially with layer performance optimizations like those in [cuDNNv3](http://nvidia.com/cudnn), and you will likely see closer to mid 2.x scaling in performance.  Networks that have heavy computation compared to the number of parameters tend to have the best scaling performance.
\ No newline at end of file
+Performance is **heavily** dependent on the PCIe topology of the system, the configuration of the neural network you are training, and the speed of each of the layers.  Systems like the DIGITS DevBox have an optimized PCIe topology (X99-E WS chipset).  In general, scaling on 2 GPUs tends to be ~1.8X on average for networks like AlexNet, CaffeNet, VGG, GoogleNet.  4 GPUs begins to have falloff in scaling.  Generally with "weak scaling" where the batchsize increases with the number of GPUs you will see 3.5x scaling or so.  With "strong scaling", the system can become communication bound, especially with layer performance optimizations like those in [cuDNNv3](http://nvidia.com/cudnn), and you will likely see closer to mid 2.x scaling in performance.  Networks that have heavy computation compared to the number of parameters tend to have the best scaling performance.
diff --git a/docs/performance_hardware.md b/docs/performance_hardware.md
deleted file mode 100644
index cdd4b361dea..00000000000
--- a/docs/performance_hardware.md
+++ /dev/null
@@ -1,73 +0,0 @@
----
-title: Performance and Hardware Configuration
----
-
-# Performance and Hardware Configuration
-
-To measure performance on different NVIDIA GPUs we use CaffeNet, the Caffe reference ImageNet model.
-
-For training, each time point is 20 iterations/minibatches of 256 images for 5,120 images total. For testing, a 50,000 image validation set is classified.
-
-**Acknowledgements**: BVLC members are very grateful to NVIDIA for providing several GPUs to conduct this research.
-
-## NVIDIA K40
-
-Performance is best with ECC off and boost clock enabled. While ECC makes a negligible difference in speed, disabling it frees ~1 GB of GPU memory.
-
-Best settings with ECC off and maximum clock speed in standard Caffe:
-
-* Training is 26.5 secs / 20 iterations (5,120 images)
-* Testing is 100 secs / validation set (50,000 images)
-
-Best settings with Caffe + [cuDNN acceleration](http://nvidia.com/cudnn):
-
-* Training is 19.2 secs / 20 iterations (5,120 images)
-* Testing is 60.7 secs / validation set (50,000 images)
-
-Other settings:
-
-* ECC on, max speed: training 26.7 secs / 20 iterations, test 101 secs / validation set
-* ECC on, default speed: training 31 secs / 20 iterations, test 117 secs / validation set
-* ECC off, default speed: training 31 secs / 20 iterations, test 118 secs / validation set
-
-### K40 configuration tips
-
-For maximum K40 performance, turn off ECC and boost the clock speed (at your own risk).
-
-To turn off ECC, do
-
-    sudo nvidia-smi -i 0 --ecc-config=0    # repeat with -i x for each GPU ID
-
-then reboot.
-
-Set the "persistence" mode of the GPU settings by
-
-    sudo nvidia-smi -pm 1
-
-and then set the clock speed with
-
-    sudo nvidia-smi -i 0 -ac 3004,875    # repeat with -i x for each GPU ID
-
-but note that this configuration resets across driver reloading / rebooting. Include these commands in a boot script to initialize these settings. For a simple fix, add these commands to `/etc/rc.local` (on Ubuntu).
-
-## NVIDIA Titan
-
-Training: 26.26 secs / 20 iterations (5,120 images).
-Testing: 100 secs / validation set (50,000 images).
-
-cuDNN Training: 20.25 secs / 20 iterations (5,120 images).
-cuDNN Testing: 66.3 secs / validation set (50,000 images).
-
-
-## NVIDIA K20
-
-Training: 36.0 secs / 20 iterations (5,120 images).
-Testing: 133 secs / validation set (50,000 images).
-
-## NVIDIA GTX 770
-
-Training: 33.0 secs / 20 iterations (5,120 images).
-Testing: 129 secs / validation set (50,000 images).
-
-cuDNN Training: 24.3 secs / 20 iterations (5,120 images).
-cuDNN Testing: 104 secs / validation set (50,000 images).
diff --git a/docs/tutorial/interfaces.md b/docs/tutorial/interfaces.md
index d7ff378239d..2578af5d4de 100644
--- a/docs/tutorial/interfaces.md
+++ b/docs/tutorial/interfaces.md
@@ -91,7 +91,7 @@ In MatCaffe, you can
 * Run for a certain number of iterations and give back control to Matlab
 * Intermingle arbitrary Matlab code with gradient steps
 
-An ILSVRC image classification demo is in caffe/matlab/demo/classification_demo.m (you need to download BVLC CaffeNet from [Model Zoo](http://caffe.berkeleyvision.org/model_zoo.html) to run it).
+An ILSVRC image classification demo is in caffe/matlab/demo/classification_demo.m (you need to download BAIR CaffeNet from [Model Zoo](http://caffe.berkeleyvision.org/model_zoo.html) to run it).
 
 ### Build MatCaffe
 
@@ -114,7 +114,7 @@ You can save your Matlab search PATH by running `savepath` so that you don't hav
 
 MatCaffe is very similar to PyCaffe in usage.
 
-Examples below shows detailed usages and assumes you have downloaded BVLC CaffeNet from [Model Zoo](http://caffe.berkeleyvision.org/model_zoo.html) and started `matlab` from caffe root folder.
+Examples below shows detailed usages and assumes you have downloaded BAIR CaffeNet from [Model Zoo](http://caffe.berkeleyvision.org/model_zoo.html) and started `matlab` from caffe root folder.
 
     model = './models/bvlc_reference_caffenet/deploy.prototxt';
     weights = './models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel';
@@ -129,8 +129,8 @@ Use CPU:
 
 Use GPU and specify its gpu_id:
 
-    caffe.set_mode_gpu();
     caffe.set_device(gpu_id);
+    caffe.set_mode_gpu();
 
 #### Create a network and access its layers and blobs
 
diff --git a/docs/tutorial/layers.md b/docs/tutorial/layers.md
index 2faacc5836d..5036d4fd7c0 100644
--- a/docs/tutorial/layers.md
+++ b/docs/tutorial/layers.md
@@ -87,12 +87,13 @@ Layers:
 * [ELU](layers/elu.html) - exponential linear rectification.
 * [Sigmoid](layers/sigmoid.html)
 * [TanH](layers/tanh.html)
-* [Absolute Value](layers/abs.html)
+* [Absolute Value](layers/absval.html)
 * [Power](layers/power.html) - f(x) = (shift + scale * x) ^ power.
 * [Exp](layers/exp.html) - f(x) = base ^ (shift + scale * x).
 * [Log](layers/log.html) - f(x) = log(x).
 * [BNLL](layers/bnll.html) - f(x) = log(1 + exp(x)).
 * [Threshold](layers/threshold.html) - performs step function at user defined threshold.
+* [Clip](layers/clip.html) - clips a blob between a fixed minimum and maximum value.
 * [Bias](layers/bias.html) - adds a bias to a blob that can either be learned or fixed.
 * [Scale](layers/scale.html) - scales a blob by an amount that can either be learned or fixed.
 
diff --git a/docs/tutorial/layers/clip.md b/docs/tutorial/layers/clip.md
new file mode 100644
index 00000000000..d6a20f5f826
--- /dev/null
+++ b/docs/tutorial/layers/clip.md
@@ -0,0 +1,20 @@
+---
+title: Clip Layer
+---
+
+# Clip Layer
+
+* Layer type: `Clip`
+* [Doxygen Documentation](http://caffe.berkeleyvision.org/doxygen/classcaffe_1_1ClipLayer.html)
+* Header: [`./include/caffe/layers/clip_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/clip_layer.hpp)
+* CPU implementation: [`./src/caffe/layers/clip_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/clip_layer.cpp)
+* CUDA GPU implementation: [`./src/caffe/layers/clip_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/clip_layer.cu)
+
+## Parameters
+
+* Parameters (`ClipParameter clip_param`)
+* From [`./src/caffe/proto/caffe.proto`](https://github.com/BVLC/caffe/blob/master/src/caffe/proto/caffe.proto):
+
+￼{% highlight Protobuf %}
+￼{% include proto/ClipParameter.txt %}
+{% endhighlight %}
diff --git a/docs/tutorial/layers/lrn.md b/docs/tutorial/layers/lrn.md
index 2fbef734663..f5e4829279d 100644
--- a/docs/tutorial/layers/lrn.md
+++ b/docs/tutorial/layers/lrn.md
@@ -14,7 +14,7 @@ title: Local Response Normalization (LRN)
         - `local_size` [default 5]: the number of channels to sum over (for cross channel LRN) or the side length of the square region to sum over (for within channel LRN)
         - `alpha` [default 1]: the scaling parameter (see below)
         - `beta` [default 5]: the exponent (see below)
-        - `norm_region` [default `ACROSS_CHANNELS`]: whether to sum over adjacent channels (`ACROSS_CHANNELS`) or nearby spatial locaitons (`WITHIN_CHANNEL`)
+        - `norm_region` [default `ACROSS_CHANNELS`]: whether to sum over adjacent channels (`ACROSS_CHANNELS`) or nearby spatial locations (`WITHIN_CHANNEL`)
 
 The local response normalization layer performs a kind of "lateral inhibition" by normalizing over local input regions. In `ACROSS_CHANNELS` mode, the local regions extend across nearby channels, but have no spatial extent (i.e., they have shape `local_size x 1 x 1`). In `WITHIN_CHANNEL` mode, the local regions extend spatially, but are in separate channels (i.e., they have shape `1 x local_size x local_size`). Each input value is divided by $$(1 + (\alpha/n) \sum_i x_i^2)^\beta$$, where $$n$$ is the size of each local region, and the sum is taken over the region centered at that value (zero padding is added where necessary).
 
diff --git a/docs/tutorial/layers/sigmoid.md b/docs/tutorial/layers/sigmoid.md
index 505318352c9..f18ac4b84ec 100644
--- a/docs/tutorial/layers/sigmoid.md
+++ b/docs/tutorial/layers/sigmoid.md
@@ -9,6 +9,16 @@ title: Sigmoid Layer
 * Header: [`./include/caffe/layers/sigmoid_layer.hpp`](https://github.com/BVLC/caffe/blob/master/include/caffe/layers/sigmoid_layer.hpp)
 * CPU implementation: [`./src/caffe/layers/sigmoid_layer.cpp`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/sigmoid_layer.cpp)
 * CUDA GPU implementation: [`./src/caffe/layers/sigmoid_layer.cu`](https://github.com/BVLC/caffe/blob/master/src/caffe/layers/sigmoid_layer.cu)
+* Example (from [`./examples/mnist/mnist_autoencoder.prototxt`](https://github.com/BVLC/caffe/blob/master/examples/mnist/mnist_autoencoder.prototxt)):
+
+      layer {
+        name: "encode1neuron"
+        bottom: "encode1"
+        top: "encode1neuron"
+        type: "Sigmoid"
+      }
+
+The `Sigmoid` layer computes `sigmoid(x)` for each element `x` in the bottom blob.
 
 ## Parameters
 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index a59e0df36b0..43bbcb83789 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -19,7 +19,8 @@ foreach(source_file ${examples_srcs})
   caffe_set_solution_folder(${name} examples)
 
   # install
-  install(TARGETS ${name} DESTINATION bin)
+  install(TARGETS ${name} DESTINATION ${CMAKE_INSTALL_BINDIR})
+
 
   if(UNIX OR APPLE)
     # Funny command to make tutorials work
diff --git a/examples/brewing-logreg.ipynb b/examples/brewing-logreg.ipynb
index c053b73b39f..0f87185a35b 100644
--- a/examples/brewing-logreg.ipynb
+++ b/examples/brewing-logreg.ipynb
@@ -73,12 +73,12 @@
     ")\n",
     "\n",
     "# Split into train and test\n",
-    "X, Xt, y, yt = sklearn.cross_validation.train_test_split(X, y)\n",
+    "X, Xt, y, yt = sklearn.model_selection.train_test_split(X, y)\n",
     "\n",
     "# Visualize sample of the data\n",
     "ind = np.random.permutation(X.shape[0])[:1000]\n",
     "df = pd.DataFrame(X[ind])\n",
-    "_ = pd.scatter_matrix(df, figsize=(9, 9), diagonal='kde', marker='o', s=40, alpha=.4, c=y[ind])"
+    "_ = pd.plotting.scatter_matrix(df, figsize=(9, 9), diagonal='kde', marker='o', s=40, alpha=.4, c=y[ind])"
    ]
   },
   {
@@ -111,7 +111,7 @@
     "%%timeit\n",
     "# Train and test the scikit-learn SGD logistic regression.\n",
     "clf = sklearn.linear_model.SGDClassifier(\n",
-    "    loss='log', n_iter=1000, penalty='l2', alpha=5e-4, class_weight='auto')\n",
+    "    loss='log', n_iter=1000, penalty='l2', alpha=5e-4, class_weight='balanced')\n",
     "\n",
     "clf.fit(X, y)\n",
     "yt_pred = clf.predict(Xt)\n",
diff --git a/examples/cifar10/cifar10_quick_solver.prototxt b/examples/cifar10/cifar10_quick_solver.prototxt
index 5de276f722f..14b4401ba16 100644
--- a/examples/cifar10/cifar10_quick_solver.prototxt
+++ b/examples/cifar10/cifar10_quick_solver.prototxt
@@ -20,7 +20,6 @@ display: 100
 max_iter: 4000
 # snapshot intermediate results
 snapshot: 4000
-snapshot_format: HDF5
 snapshot_prefix: "examples/cifar10/cifar10_quick"
 # solver mode: CPU or GPU
 solver_mode: GPU
diff --git a/examples/cifar10/train_full.sh b/examples/cifar10/train_full.sh
index 06ecc2dccb0..fe46e60d795 100755
--- a/examples/cifar10/train_full.sh
+++ b/examples/cifar10/train_full.sh
@@ -9,9 +9,9 @@ $TOOLS/caffe train \
 # reduce learning rate by factor of 10
 $TOOLS/caffe train \
     --solver=examples/cifar10/cifar10_full_solver_lr1.prototxt \
-    --snapshot=examples/cifar10/cifar10_full_iter_60000.solverstate.h5 $@
+    --snapshot=examples/cifar10/cifar10_full_iter_60000.solverstate $@
 
 # reduce learning rate by factor of 10
 $TOOLS/caffe train \
     --solver=examples/cifar10/cifar10_full_solver_lr2.prototxt \
-    --snapshot=examples/cifar10/cifar10_full_iter_65000.solverstate.h5 $@
+    --snapshot=examples/cifar10/cifar10_full_iter_65000.solverstate $@
diff --git a/examples/cifar10/train_quick.sh b/examples/cifar10/train_quick.sh
index d2b875340ee..257479e0d77 100755
--- a/examples/cifar10/train_quick.sh
+++ b/examples/cifar10/train_quick.sh
@@ -9,4 +9,4 @@ $TOOLS/caffe train \
 # reduce learning rate by factor of 10 after 8 epochs
 $TOOLS/caffe train \
   --solver=examples/cifar10/cifar10_quick_solver_lr1.prototxt \
-  --snapshot=examples/cifar10/cifar10_quick_iter_4000.solverstate.h5 $@
+  --snapshot=examples/cifar10/cifar10_quick_iter_4000.solverstate $@
diff --git a/examples/finetune_flickr_style/readme.md b/examples/finetune_flickr_style/readme.md
index 188dedf1b9a..dacfd01c8e1 100644
--- a/examples/finetune_flickr_style/readme.md
+++ b/examples/finetune_flickr_style/readme.md
@@ -9,7 +9,7 @@ priority: 5
 # Fine-tuning CaffeNet for Style Recognition on "Flickr Style" Data
 
 Fine-tuning takes an already learned model, adapts the architecture, and resumes training from the already learned model weights.
-Let's fine-tune the BVLC-distributed CaffeNet model on a different dataset, [Flickr Style](http://sergeykarayev.com/files/1311.3715v3.pdf), to predict image style instead of object category.
+Let's fine-tune the BAIR-distributed CaffeNet model on a different dataset, [Flickr Style](http://sergeykarayev.com/files/1311.3715v3.pdf), to predict image style instead of object category.
 
 ## Explanation
 
diff --git a/examples/web_demo/readme.md b/examples/web_demo/readme.md
index fe74b9ef7d3..e50c4f101ee 100644
--- a/examples/web_demo/readme.md
+++ b/examples/web_demo/readme.md
@@ -11,7 +11,7 @@ priority: 10
 ## Requirements
 
 The demo server requires Python with some dependencies.
-To make sure you have the dependencies, please run `pip install -r examples/web_demo/requirements.txt`, and also make sure that you've compiled the Python Caffe interface and that it is on your `PYTHONPATH` (see [installation instructions](/installation.html)).
+To make sure you have the dependencies, please run `pip install -r examples/web_demo/requirements.txt`, and also make sure that you've compiled the Python Caffe interface and that it is on your `PYTHONPATH` (see [installation instructions](http://caffe.berkeleyvision.org/installation.html)).
 
 Make sure that you have obtained the Reference CaffeNet Model and the ImageNet Auxiliary Data:
 
diff --git a/include/caffe/filler.hpp b/include/caffe/filler.hpp
index dad9ad46b3b..a44773619dc 100644
--- a/include/caffe/filler.hpp
+++ b/include/caffe/filler.hpp
@@ -108,9 +108,9 @@ class PositiveUnitballFiller : public Filler<Dtype> {
     caffe_rng_uniform<Dtype>(blob->count(), 0, 1, blob->mutable_cpu_data());
     // We expect the filler to not be called very frequently, so we will
     // just use a simple implementation
-    int dim = blob->count() / blob->num();
+    int dim = blob->count() / blob->shape(0);
     CHECK(dim);
-    for (int i = 0; i < blob->num(); ++i) {
+    for (int i = 0; i < blob->shape(0); ++i) {
       Dtype sum = 0;
       for (int j = 0; j < dim; ++j) {
         sum += data[i * dim + j];
@@ -147,8 +147,11 @@ class XavierFiller : public Filler<Dtype> {
       : Filler<Dtype>(param) {}
   virtual void Fill(Blob<Dtype>* blob) {
     CHECK(blob->count());
-    int fan_in = blob->count() / blob->num();
-    int fan_out = blob->count() / blob->channels();
+    int fan_in = blob->count() / blob->shape(0);
+    // Compatibility with ND blobs
+    int fan_out = blob->num_axes() > 1 ?
+                  blob->count() / blob->shape(1) :
+                  blob->count();
     Dtype n = fan_in;  // default to fan_in
     if (this->filler_param_.variance_norm() ==
         FillerParameter_VarianceNorm_AVERAGE) {
@@ -189,8 +192,11 @@ class MSRAFiller : public Filler<Dtype> {
       : Filler<Dtype>(param) {}
   virtual void Fill(Blob<Dtype>* blob) {
     CHECK(blob->count());
-    int fan_in = blob->count() / blob->num();
-    int fan_out = blob->count() / blob->channels();
+    int fan_in = blob->count() / blob->shape(0);
+    // Compatibility with ND blobs
+    int fan_out = blob->num_axes() > 1 ?
+                  blob->count() / blob->shape(1) :
+                  blob->count();
     Dtype n = fan_in;  // default to fan_in
     if (this->filler_param_.variance_norm() ==
         FillerParameter_VarianceNorm_AVERAGE) {
@@ -250,10 +256,10 @@ class BilinearFiller : public Filler<Dtype> {
     CHECK_EQ(blob->width(), blob->height()) << "Filter must be square";
     Dtype* data = blob->mutable_cpu_data();
     int f = ceil(blob->width() / 2.);
-    float c = (2 * f - 1 - f % 2) / (2. * f);
+    Dtype c = (blob->width() - 1) / (2. * f);
     for (int i = 0; i < blob->count(); ++i) {
-      float x = i % blob->width();
-      float y = (i / blob->width()) % blob->height();
+      Dtype x = i % blob->width();
+      Dtype y = (i / blob->width()) % blob->height();
       data[i] = (1 - fabs(x / f - c)) * (1 - fabs(y / f - c));
     }
     CHECK_EQ(this->filler_param_.sparse(), -1)
diff --git a/include/caffe/layers/accuracy_layer.hpp b/include/caffe/layers/accuracy_layer.hpp
index a9ad3225149..dd2247b9e4d 100644
--- a/include/caffe/layers/accuracy_layer.hpp
+++ b/include/caffe/layers/accuracy_layer.hpp
@@ -68,6 +68,8 @@ class AccuracyLayer : public Layer<Dtype> {
    */
   virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
 
 
   /// @brief Not implemented -- AccuracyLayer cannot be used as a loss.
@@ -77,6 +79,8 @@ class AccuracyLayer : public Layer<Dtype> {
       if (propagate_down[i]) { NOT_IMPLEMENTED; }
     }
   }
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
   int label_axis_, outer_num_, inner_num_;
 
diff --git a/include/caffe/layers/base_data_layer.hpp b/include/caffe/layers/base_data_layer.hpp
index 21d3ada50d0..c8b6998c8f2 100644
--- a/include/caffe/layers/base_data_layer.hpp
+++ b/include/caffe/layers/base_data_layer.hpp
@@ -26,8 +26,6 @@ class BaseDataLayer : public Layer<Dtype> {
   // This method may not be overridden except by the BasePrefetchingDataLayer.
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
-  // Data layers should be shared by multiple solvers in parallel
-  virtual inline bool ShareInParallel() const { return true; }
   virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {}
   // Data layers have no bottoms, so reshaping is trivial.
diff --git a/include/caffe/layers/clip_layer.hpp b/include/caffe/layers/clip_layer.hpp
new file mode 100644
index 00000000000..2788193e3ec
--- /dev/null
+++ b/include/caffe/layers/clip_layer.hpp
@@ -0,0 +1,75 @@
+#ifndef CAFFE_CLIP_LAYER_HPP_
+#define CAFFE_CLIP_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Clip: @f$ y = \max(min, \min(max, x)) @f$.
+ */
+template <typename Dtype>
+class ClipLayer : public NeuronLayer<Dtype> {
+ public:
+  /**
+   * @param param provides ClipParameter clip_param,
+   *     with ClipLayer options:
+   *   - min
+   *   - max
+   */
+  explicit ClipLayer(const LayerParameter& param)
+      : NeuronLayer<Dtype>(param) {}
+
+  virtual inline const char* type() const { return "Clip"; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the computed outputs @f$
+   *        y = \max(min, \min(max, x))
+   *      @f$
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the clipped inputs.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *      respect to the outputs
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+   *      with respect to computed outputs @f$ y @f$
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$; Backward fills their diff with
+   *      gradients @f$
+   *        \frac{\partial E}{\partial x} = \left\{
+   *        \begin{array}{lr}
+   *            0 & \mathrm{if} \; x < min \vee x > max \\
+   *            \frac{\partial E}{\partial y} & \mathrm{if} \; x \ge min \wedge x \le max
+   *        \end{array} \right.
+   *      @f$
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_CLIP_LAYER_HPP_
diff --git a/include/caffe/layers/crop_layer.hpp b/include/caffe/layers/crop_layer.hpp
index c4fda1220c3..5219fa5cb5f 100644
--- a/include/caffe/layers/crop_layer.hpp
+++ b/include/caffe/layers/crop_layer.hpp
@@ -41,13 +41,15 @@ class CropLayer : public Layer<Dtype> {
   virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
-  vector<int> offsets;
+  Blob<int> offsets;
+  Blob<int> src_strides_;
+  Blob<int> dest_strides_;
 
  private:
   // Recursive copy function.
   void crop_copy(const vector<Blob<Dtype>*>& bottom,
                const vector<Blob<Dtype>*>& top,
-               const vector<int>& offsets,
+               const int* offsets,
                vector<int> indices,
                int cur_dim,
                const Dtype* src_data,
diff --git a/include/caffe/layers/cudnn_deconv_layer.hpp b/include/caffe/layers/cudnn_deconv_layer.hpp
new file mode 100644
index 00000000000..12799e5b8ef
--- /dev/null
+++ b/include/caffe/layers/cudnn_deconv_layer.hpp
@@ -0,0 +1,68 @@
+#ifndef CAFFE_CUDNN_DECONV_LAYER_HPP_
+#define CAFFE_CUDNN_DECONV_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/deconv_layer.hpp"
+
+namespace caffe {
+
+#ifdef USE_CUDNN
+/*
+ * @brief cuDNN implementation of DeConvolutionLayer.
+ *        Fallback to DeConvolutionLayer for CPU mode.
+ *
+ * cuDNN accelerates deconvolution through forward kernels for filtering and
+ * bias plus backward kernels for the gradient w.r.t. the filters, biases, and
+ * inputs. Caffe + cuDNN further speeds up the computation through forward
+ * parallelism across groups and backward parallelism across gradients.
+*/
+template <typename Dtype>
+class CuDNNDeconvolutionLayer : public DeconvolutionLayer<Dtype> {
+ public:
+  explicit CuDNNDeconvolutionLayer(const LayerParameter& param)
+    : DeconvolutionLayer<Dtype>(param), handles_setup_(false) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+                          const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+                       const vector<Blob<Dtype>*>& top);
+  virtual ~CuDNNDeconvolutionLayer();
+
+ protected:
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+                           const vector<Blob<Dtype>*>& top);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+                            const vector<bool>& propagate_down,
+                            const vector<Blob<Dtype>*>& bottom);
+
+  bool handles_setup_;
+  cudnnHandle_t* handle_;
+  cudaStream_t*  stream_;
+
+  // algorithms for forward and backwards convolutions
+  cudnnConvolutionFwdAlgo_t *fwd_algo_;
+  cudnnConvolutionBwdFilterAlgo_t *bwd_filter_algo_;
+  cudnnConvolutionBwdDataAlgo_t *bwd_data_algo_;
+
+  vector<cudnnTensorDescriptor_t> bottom_descs_, top_descs_;
+  cudnnTensorDescriptor_t bias_desc_;
+  cudnnFilterDescriptor_t filter_desc_;
+  vector<cudnnConvolutionDescriptor_t> conv_descs_;
+  int bottom_offset_, top_offset_, bias_offset_;
+
+  size_t *workspace_fwd_sizes_;
+  size_t *workspace_bwd_data_sizes_;
+  size_t *workspace_bwd_filter_sizes_;
+  size_t workspaceSizeInBytes;  // size of underlying storage
+  void *workspaceData;  // underlying storage
+  void **workspace;  // aliases into workspaceData
+};
+#endif
+
+}  // namespace caffe
+
+#endif  // CAFFE_CUDNN_DECONV_LAYER_HPP_
diff --git a/include/caffe/layers/data_layer.hpp b/include/caffe/layers/data_layer.hpp
index dec58180976..667a4ae43a5 100644
--- a/include/caffe/layers/data_layer.hpp
+++ b/include/caffe/layers/data_layer.hpp
@@ -20,8 +20,6 @@ class DataLayer : public BasePrefetchingDataLayer<Dtype> {
   virtual ~DataLayer();
   virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
-  // DataLayer uses DataReader instead for sharing for parallelism
-  virtual inline bool ShareInParallel() const { return false; }
   virtual inline const char* type() const { return "Data"; }
   virtual inline int ExactNumBottomBlobs() const { return 0; }
   virtual inline int MinTopBlobs() const { return 1; }
diff --git a/include/caffe/layers/dummy_data_layer.hpp b/include/caffe/layers/dummy_data_layer.hpp
index 4180f1d01e4..13a63d47ec4 100644
--- a/include/caffe/layers/dummy_data_layer.hpp
+++ b/include/caffe/layers/dummy_data_layer.hpp
@@ -22,8 +22,6 @@ class DummyDataLayer : public Layer<Dtype> {
       : Layer<Dtype>(param) {}
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
-  // Data layers should be shared by multiple solvers in parallel
-  virtual inline bool ShareInParallel() const { return true; }
   // Data layers have no bottoms, so reshaping is trivial.
   virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {}
diff --git a/include/caffe/layers/euclidean_loss_layer.hpp b/include/caffe/layers/euclidean_loss_layer.hpp
index f564569e27a..24568c5496f 100644
--- a/include/caffe/layers/euclidean_loss_layer.hpp
+++ b/include/caffe/layers/euclidean_loss_layer.hpp
@@ -30,7 +30,7 @@ namespace caffe {
  * This can be used for least-squares regression tasks.  An InnerProductLayer
  * input to a EuclideanLossLayer exactly formulates a linear least squares
  * regression problem. With non-zero weight decay the problem becomes one of
- * ridge regression -- see src/caffe/test/test_sgd_solver.cpp for a concrete
+ * ridge regression -- see src/caffe/test/test_gradient_based_solver.cpp for a concrete
  * example wherein we check that the gradients computed for a Net with exactly
  * this structure match hand-computed gradient formulas for ridge regression.
  *
diff --git a/include/caffe/layers/hdf5_data_layer.hpp b/include/caffe/layers/hdf5_data_layer.hpp
index 650a3fb0c87..601b36c6b89 100644
--- a/include/caffe/layers/hdf5_data_layer.hpp
+++ b/include/caffe/layers/hdf5_data_layer.hpp
@@ -27,8 +27,6 @@ class HDF5DataLayer : public Layer<Dtype> {
   virtual ~HDF5DataLayer();
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
-  // Data layers should be shared by multiple solvers in parallel
-  virtual inline bool ShareInParallel() const { return true; }
   // Data layers have no bottoms, so reshaping is trivial.
   virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {}
diff --git a/include/caffe/layers/hdf5_output_layer.hpp b/include/caffe/layers/hdf5_output_layer.hpp
index 487d08fc06c..061e279d7a0 100644
--- a/include/caffe/layers/hdf5_output_layer.hpp
+++ b/include/caffe/layers/hdf5_output_layer.hpp
@@ -28,8 +28,6 @@ class HDF5OutputLayer : public Layer<Dtype> {
   virtual ~HDF5OutputLayer();
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
-  // Data layers should be shared by multiple solvers in parallel
-  virtual inline bool ShareInParallel() const { return true; }
   // Data layers have no bottoms, so reshaping is trivial.
   virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {}
diff --git a/include/caffe/layers/infogain_loss_layer.hpp b/include/caffe/layers/infogain_loss_layer.hpp
index 633f339a28e..3b3caa27c38 100644
--- a/include/caffe/layers/infogain_loss_layer.hpp
+++ b/include/caffe/layers/infogain_loss_layer.hpp
@@ -8,24 +8,26 @@
 #include "caffe/proto/caffe.pb.h"
 
 #include "caffe/layers/loss_layer.hpp"
+#include "caffe/layers/softmax_layer.hpp"
 
 namespace caffe {
 
 /**
- * @brief A generalization of MultinomialLogisticLossLayer that takes an
+ * @brief A generalization of SoftmaxWithLossLayer that takes an
  *        "information gain" (infogain) matrix specifying the "value" of all label
  *        pairs.
  *
- * Equivalent to the MultinomialLogisticLossLayer if the infogain matrix is the
+ * Equivalent to the SoftmaxWithLossLayer if the infogain matrix is the
  * identity.
  *
  * @param bottom input Blob vector (length 2-3)
  *   -# @f$ (N \times C \times H \times W) @f$
- *      the predictions @f$ \hat{p} @f$, a Blob with values in
- *      @f$ [0, 1] @f$ indicating the predicted probability of each of the
- *      @f$ K = CHW @f$ classes.  Each prediction vector @f$ \hat{p}_n @f$
- *      should sum to 1 as in a probability distribution: @f$
- *      \forall n \sum\limits_{k=1}^K \hat{p}_{nk} = 1 @f$.
+ *      the predictions @f$ x @f$, a Blob with values in
+ *      @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of
+ *      the @f$ K = CHW @f$ classes. This layer maps these scores to a
+ *      probability distribution over classes using the softmax function
+ *      @f$ \hat{p}_{nk} = \exp(x_{nk}) /
+ *      \left[\sum_{k'} \exp(x_{nk'})\right] @f$ (see SoftmaxLayer).
  *   -# @f$ (N \times 1 \times 1 \times 1) @f$
  *      the labels @f$ l @f$, an integer-valued Blob with values
  *      @f$ l_n \in [0, 1, 2, ..., K - 1] @f$
@@ -34,7 +36,7 @@ namespace caffe {
  *      (\b optional) the infogain matrix @f$ H @f$.  This must be provided as
  *      the third bottom blob input if not provided as the infogain_mat in the
  *      InfogainLossParameter. If @f$ H = I @f$, this layer is equivalent to the
- *      MultinomialLogisticLossLayer.
+ *      SoftmaxWithLossLayer.
  * @param top output Blob vector (length 1)
  *   -# @f$ (1 \times 1 \times 1 \times 1) @f$
  *      the computed infogain multinomial logistic loss: @f$ E =
@@ -60,6 +62,12 @@ class InfogainLossLayer : public LossLayer<Dtype> {
   virtual inline int MinBottomBlobs() const { return 2; }
   virtual inline int MaxBottomBlobs() const { return 3; }
 
+  // InfogainLossLayer computes softmax prob internally.
+  // optional second "top" outputs the softmax prob
+  virtual inline int ExactNumTopBlobs() const { return -1; }
+  virtual inline int MinTopBlobs() const { return 1; }
+  virtual inline int MaxTopBlobs() const { return 2; }
+
   virtual inline const char* type() const { return "InfogainLoss"; }
 
  protected:
@@ -91,8 +99,8 @@ class InfogainLossLayer : public LossLayer<Dtype> {
    *      infogain matrix, if provided as bottom[2])
    * @param bottom input Blob vector (length 2-3)
    *   -# @f$ (N \times C \times H \times W) @f$
-   *      the predictions @f$ \hat{p} @f$; Backward computes diff
-   *      @f$ \frac{\partial E}{\partial \hat{p}} @f$
+   *      the predictions @f$ x @f$; Backward computes diff
+   *      @f$ \frac{\partial E}{\partial x} @f$
    *   -# @f$ (N \times 1 \times 1 \times 1) @f$
    *      the labels -- ignored as we can't compute their error gradients
    *   -# @f$ (1 \times 1 \times K \times K) @f$
@@ -102,7 +110,35 @@ class InfogainLossLayer : public LossLayer<Dtype> {
   virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
 
+  /// Read the normalization mode parameter and compute the normalizer based
+  /// on the blob size.  If normalization_mode is VALID, the count of valid
+  /// outputs will be read from valid_count, unless it is -1 in which case
+  /// all outputs are assumed to be valid.
+  virtual Dtype get_normalizer(
+      LossParameter_NormalizationMode normalization_mode, int valid_count);
+  /// fill sum_rows_H_ according to matrix H
+  virtual void sum_rows_of_H(const Blob<Dtype>* H);
+
+  /// The internal SoftmaxLayer used to map predictions to a distribution.
+  shared_ptr<Layer<Dtype> > softmax_layer_;
+  /// prob stores the output probability predictions from the SoftmaxLayer.
+  Blob<Dtype> prob_;
+  /// bottom vector holder used in call to the underlying SoftmaxLayer::Forward
+  vector<Blob<Dtype>*> softmax_bottom_vec_;
+  /// top vector holder used in call to the underlying SoftmaxLayer::Forward
+  vector<Blob<Dtype>*> softmax_top_vec_;
+
   Blob<Dtype> infogain_;
+  Blob<Dtype> sum_rows_H_;  // cache the row sums of H.
+
+  /// Whether to ignore instances with a certain label.
+  bool has_ignore_label_;
+  /// The label indicating that an instance should be ignored.
+  int ignore_label_;
+  /// How to normalize the output loss.
+  LossParameter_NormalizationMode normalization_;
+
+  int infogain_axis_, outer_num_, inner_num_, num_labels_;
 };
 
 }  // namespace caffe
diff --git a/include/caffe/layers/input_layer.hpp b/include/caffe/layers/input_layer.hpp
index f4472678c69..0ffdc724894 100644
--- a/include/caffe/layers/input_layer.hpp
+++ b/include/caffe/layers/input_layer.hpp
@@ -22,8 +22,6 @@ class InputLayer : public Layer<Dtype> {
       : Layer<Dtype>(param) {}
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
-  // Data layers should be shared by multiple solvers in parallel
-  virtual inline bool ShareInParallel() const { return true; }
   // Data layers have no bottoms, so reshaping is trivial.
   virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {}
diff --git a/include/caffe/layers/pooling_layer.hpp b/include/caffe/layers/pooling_layer.hpp
index f4d6803ba8e..38a432832cf 100644
--- a/include/caffe/layers/pooling_layer.hpp
+++ b/include/caffe/layers/pooling_layer.hpp
@@ -51,6 +51,7 @@ class PoolingLayer : public Layer<Dtype> {
   int height_, width_;
   int pooled_height_, pooled_width_;
   bool global_pooling_;
+  PoolingParameter_RoundMode round_mode_;
   Blob<Dtype> rand_idx_;
   Blob<int> max_idx_;
 };
diff --git a/include/caffe/layers/python_layer.hpp b/include/caffe/layers/python_layer.hpp
index 10c4bfd0250..1407d9217aa 100644
--- a/include/caffe/layers/python_layer.hpp
+++ b/include/caffe/layers/python_layer.hpp
@@ -34,10 +34,6 @@ class PythonLayer : public Layer<Dtype> {
     self_.attr("reshape")(bottom, top);
   }
 
-  virtual inline bool ShareInParallel() const {
-    return this->layer_param_.python_param().share_in_parallel();
-  }
-
   virtual inline const char* type() const { return "Python"; }
 
  protected:
diff --git a/include/caffe/layers/swish_layer.hpp b/include/caffe/layers/swish_layer.hpp
new file mode 100644
index 00000000000..d538ff6de82
--- /dev/null
+++ b/include/caffe/layers/swish_layer.hpp
@@ -0,0 +1,96 @@
+#ifndef CAFFE_SWISH_LAYER_HPP_
+#define CAFFE_SWISH_LAYER_HPP_
+
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/neuron_layer.hpp"
+#include "caffe/layers/sigmoid_layer.hpp"
+
+namespace caffe {
+
+/**
+ * @brief Swish non-linearity @f$ y = x \sigma (\beta x) @f$.
+ *        A novel activation function that tends to work better than ReLU [1].
+ *
+ * [1] Prajit Ramachandran, Barret Zoph, Quoc V. Le. "Searching for
+ *     Activation Functions". arXiv preprint arXiv:1710.05941v2 (2017).
+ */
+template <typename Dtype>
+class SwishLayer : public NeuronLayer<Dtype> {
+ public:
+  /**
+   * @param param provides SwishParameter swish_param,
+   *     with SwishLayer options:
+   *   - beta (\b optional, default 1).
+   *     the value @f$ \beta @f$ in the @f$ y = x \sigma (\beta x) @f$.
+   */
+  explicit SwishLayer(const LayerParameter& param)
+      : NeuronLayer<Dtype>(param),
+        sigmoid_layer_(new SigmoidLayer<Dtype>(param)),
+        sigmoid_input_(new Blob<Dtype>()),
+        sigmoid_output_(new Blob<Dtype>()) {}
+  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  virtual inline const char* type() const { return "Swish"; }
+
+ protected:
+  /**
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$
+   * @param top output Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the computed outputs @f$
+   *        y = x \sigma (\beta x)
+   *      @f$.
+   */
+  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top);
+
+  /**
+   * @brief Computes the error gradient w.r.t. the sigmoid inputs.
+   *
+   * @param top output Blob vector (length 1), providing the error gradient with
+   *      respect to the outputs
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
+   *      with respect to computed outputs @f$ y @f$
+   * @param propagate_down see Layer::Backward.
+   * @param bottom input Blob vector (length 1)
+   *   -# @f$ (N \times C \times H \times W) @f$
+   *      the inputs @f$ x @f$; Backward fills their diff with
+   *      gradients @f$
+   *        \frac{\partial E}{\partial x}
+   *            = \frac{\partial E}{\partial y}(\beta y +
+   *              \sigma (\beta x)(1 - \beta y))
+   *      @f$ if propagate_down[0]
+   */
+  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
+
+  /// The internal SigmoidLayer
+  shared_ptr<SigmoidLayer<Dtype> > sigmoid_layer_;
+  /// sigmoid_input_ stores the input of the SigmoidLayer.
+  shared_ptr<Blob<Dtype> > sigmoid_input_;
+  /// sigmoid_output_ stores the output of the SigmoidLayer.
+  shared_ptr<Blob<Dtype> > sigmoid_output_;
+  /// bottom vector holder to call the underlying SigmoidLayer::Forward
+  vector<Blob<Dtype>*> sigmoid_bottom_vec_;
+  /// top vector holder to call the underlying SigmoidLayer::Forward
+  vector<Blob<Dtype>*> sigmoid_top_vec_;
+};
+
+}  // namespace caffe
+
+#endif  // CAFFE_SWISH_LAYER_HPP_
diff --git a/include/caffe/layers/window_data_layer.hpp b/include/caffe/layers/window_data_layer.hpp
index 35f41b80e63..b9b66b7cf1d 100644
--- a/include/caffe/layers/window_data_layer.hpp
+++ b/include/caffe/layers/window_data_layer.hpp
@@ -16,7 +16,8 @@ namespace caffe {
 
 /**
  * @brief Provides data to the Net from windows of images files, specified
- *        by a window data file.
+ *        by a window data file. This layer is *DEPRECATED* and only kept for
+ *        archival purposes for use by the original R-CNN.
  *
  * TODO(dox): thorough documentation for Forward and proto params.
  */
diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp
index d3c9306e9cf..143d5d28883 100644
--- a/include/caffe/net.hpp
+++ b/include/caffe/net.hpp
@@ -111,9 +111,9 @@ class Net {
    *        another Net.
    */
   void CopyTrainedLayersFrom(const NetParameter& param);
-  void CopyTrainedLayersFrom(const string trained_filename);
-  void CopyTrainedLayersFromBinaryProto(const string trained_filename);
-  void CopyTrainedLayersFromHDF5(const string trained_filename);
+  void CopyTrainedLayersFrom(const string& trained_filename);
+  void CopyTrainedLayersFromBinaryProto(const string& trained_filename);
+  void CopyTrainedLayersFromHDF5(const string& trained_filename);
   /// @brief Writes the net to a proto.
   void ToProto(NetParameter* param, bool write_diff = false) const;
   /// @brief Writes the net to an HDF5 file.
diff --git a/include/caffe/sgd_solvers.hpp b/include/caffe/sgd_solvers.hpp
index 1fc52d87137..925ff78331e 100644
--- a/include/caffe/sgd_solvers.hpp
+++ b/include/caffe/sgd_solvers.hpp
@@ -23,10 +23,11 @@ class SGDSolver : public Solver<Dtype> {
 
   const vector<shared_ptr<Blob<Dtype> > >& history() { return history_; }
 
+  virtual void ApplyUpdate();
+  Dtype GetLearningRate();
+
  protected:
   void PreSolve();
-  Dtype GetLearningRate();
-  virtual void ApplyUpdate();
   virtual void Normalize(int param_id);
   virtual void Regularize(int param_id);
   virtual void ComputeUpdateValue(int param_id, Dtype rate);
diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp
index a28d8cb897e..7a0d7777f2d 100644
--- a/include/caffe/solver.hpp
+++ b/include/caffe/solver.hpp
@@ -55,7 +55,7 @@ class Solver {
   // The main entry of the solver function. In default, iter will be zero. Pass
   // in a non-zero iter number to resume training for a pre-trained net.
   virtual void Solve(const char* resume_file = NULL);
-  inline void Solve(const string resume_file) { Solve(resume_file.c_str()); }
+  inline void Solve(const string& resume_file) { Solve(resume_file.c_str()); }
   void Step(int iters);
   // The Restore method simply dispatches to one of the
   // RestoreSolverStateFrom___ protected methods. You should implement these
@@ -94,10 +94,11 @@ class Solver {
    */
   virtual inline const char* type() const { return ""; }
 
- protected:
   // Make and apply the update value for the current iteration.
   virtual void ApplyUpdate() = 0;
-  string SnapshotFilename(const string extension);
+
+ protected:
+  string SnapshotFilename(const string& extension);
   string SnapshotToBinaryProto();
   string SnapshotToHDF5();
   // The test routine
diff --git a/include/caffe/syncedmem.hpp b/include/caffe/syncedmem.hpp
index 317ce29a257..8d650a34a8e 100644
--- a/include/caffe/syncedmem.hpp
+++ b/include/caffe/syncedmem.hpp
@@ -66,8 +66,8 @@ class SyncedMemory {
   void* mutable_cpu_data();
   void* mutable_gpu_data();
   enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED };
-  SyncedHead head() { return head_; }
-  size_t size() { return size_; }
+  SyncedHead head() const { return head_; }
+  size_t size() const { return size_; }
 
 #ifndef CPU_ONLY
   void async_gpu_push(const cudaStream_t& stream);
diff --git a/include/caffe/test/test_caffe_main.hpp b/include/caffe/test/test_caffe_main.hpp
index fc156091476..294f7e5011a 100644
--- a/include/caffe/test/test_caffe_main.hpp
+++ b/include/caffe/test/test_caffe_main.hpp
@@ -18,9 +18,8 @@ using std::endl;
   #include "caffe_config.h"
 #else
   #define CUDA_TEST_DEVICE -1
-  #define CMAKE_SOURCE_DIR "src/"
   #define EXAMPLES_SOURCE_DIR "examples/"
-  #define CMAKE_EXT ""
+  #define ABS_TEST_DATA_DIR "src/caffe/test/test_data"
 #endif
 
 int main(int argc, char** argv);
diff --git a/include/caffe/util/cudnn.hpp b/include/caffe/util/cudnn.hpp
index a7d8dbbad4c..cd3f93f6e28 100644
--- a/include/caffe/util/cudnn.hpp
+++ b/include/caffe/util/cudnn.hpp
@@ -41,6 +41,16 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
       return "CUDNN_STATUS_NOT_SUPPORTED";
     case CUDNN_STATUS_LICENSE_ERROR:
       return "CUDNN_STATUS_LICENSE_ERROR";
+#if CUDNN_VERSION_MIN(6, 0, 0)
+    case CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING:
+      return "CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING";
+#endif
+#if CUDNN_VERSION_MIN(7, 0, 0)
+    case CUDNN_STATUS_RUNTIME_IN_PROGRESS:
+      return "CUDNN_STATUS_RUNTIME_IN_PROGRESS";
+    case CUDNN_STATUS_RUNTIME_FP_OVERFLOW:
+      return "CUDNN_STATUS_RUNTIME_FP_OVERFLOW";
+#endif
   }
   return "Unknown cudnn status";
 }
@@ -109,8 +119,14 @@ template <typename Dtype>
 inline void setConvolutionDesc(cudnnConvolutionDescriptor_t* conv,
     cudnnTensorDescriptor_t bottom, cudnnFilterDescriptor_t filter,
     int pad_h, int pad_w, int stride_h, int stride_w) {
+#if CUDNN_VERSION_MIN(6, 0, 0)
   CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv,
+      pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION,
+      dataType<Dtype>::type));
+#else
+    CUDNN_CHECK(cudnnSetConvolution2dDescriptor(*conv,
       pad_h, pad_w, stride_h, stride_w, 1, 1, CUDNN_CROSS_CORRELATION));
+#endif
 }
 
 template <typename Dtype>
diff --git a/include/caffe/util/hdf5.hpp b/include/caffe/util/hdf5.hpp
index 71549c1cc02..dbd8bb6c5e4 100644
--- a/include/caffe/util/hdf5.hpp
+++ b/include/caffe/util/hdf5.hpp
@@ -1,3 +1,4 @@
+#ifdef USE_HDF5
 #ifndef CAFFE_UTIL_HDF5_H_
 #define CAFFE_UTIL_HDF5_H_
 
@@ -37,3 +38,4 @@ string hdf5_get_name_by_idx(hid_t loc_id, int idx);
 }  // namespace caffe
 
 #endif   // CAFFE_UTIL_HDF5_H_
+#endif   // USE_HDF5
diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
index 51068fe2b80..e549120a933 100644
--- a/include/caffe/util/math_functions.hpp
+++ b/include/caffe/util/math_functions.hpp
@@ -52,6 +52,9 @@ void caffe_scal(const int N, const Dtype alpha, Dtype *X);
 template <typename Dtype>
 void caffe_sqr(const int N, const Dtype* a, Dtype* y);
 
+template <typename Dtype>
+void caffe_sqrt(const int N, const Dtype* a, Dtype* y);
+
 template <typename Dtype>
 void caffe_add(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 
@@ -128,16 +131,16 @@ inline int8_t caffe_sign(Dtype val) {
   }
 
 // output is 1 for the positives, 0 for zero, and -1 for the negatives
-DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign<Dtype>(x[i]));
+DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign<Dtype>(x[i]))
 
 // This returns a nonzero value if the input has its sign bit set.
 // The name sngbit is meant to avoid conflicts with std::signbit in the macro.
 // The extra parens are needed because CUDA < 6.5 defines signbit as a macro,
 // and we don't want that to expand here when CUDA headers are also included.
 DEFINE_CAFFE_CPU_UNARY_FUNC(sgnbit, \
-    y[i] = static_cast<bool>((std::signbit)(x[i])));
+    y[i] = static_cast<bool>((std::signbit)(x[i])))
 
-DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i]));
+DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i]))
 
 template <typename Dtype>
 void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y);
@@ -214,6 +217,9 @@ void caffe_gpu_log(const int n, const Dtype* a, Dtype* y);
 template <typename Dtype>
 void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, Dtype* y);
 
+template <typename Dtype>
+void caffe_gpu_sqrt(const int n, const Dtype* a, Dtype* y);
+
 // caffe_gpu_rng_uniform with two arguments generates integers in the range
 // [0, UINT_MAX].
 void caffe_gpu_rng_uniform(const int n, unsigned int* r);
diff --git a/include/caffe/util/mkl_alternate.hpp b/include/caffe/util/mkl_alternate.hpp
index 95df0f93b5e..8c2294c7c86 100644
--- a/include/caffe/util/mkl_alternate.hpp
+++ b/include/caffe/util/mkl_alternate.hpp
@@ -36,10 +36,11 @@ extern "C" {
     v##name<double>(n, a, y); \
   }
 
-DEFINE_VSL_UNARY_FUNC(Sqr, y[i] = a[i] * a[i]);
-DEFINE_VSL_UNARY_FUNC(Exp, y[i] = exp(a[i]));
-DEFINE_VSL_UNARY_FUNC(Ln, y[i] = log(a[i]));
-DEFINE_VSL_UNARY_FUNC(Abs, y[i] = fabs(a[i]));
+DEFINE_VSL_UNARY_FUNC(Sqr, y[i] = a[i] * a[i])
+DEFINE_VSL_UNARY_FUNC(Sqrt, y[i] = sqrt(a[i]))
+DEFINE_VSL_UNARY_FUNC(Exp, y[i] = exp(a[i]))
+DEFINE_VSL_UNARY_FUNC(Ln, y[i] = log(a[i]))
+DEFINE_VSL_UNARY_FUNC(Abs, y[i] = fabs(a[i]))
 
 // A simple way to define the vsl unary functions with singular parameter b.
 // The operation should be in the form e.g. y[i] = pow(a[i], b)
@@ -58,7 +59,7 @@ DEFINE_VSL_UNARY_FUNC(Abs, y[i] = fabs(a[i]));
     v##name<double>(n, a, b, y); \
   }
 
-DEFINE_VSL_UNARY_FUNC_WITH_PARAM(Powx, y[i] = pow(a[i], b));
+DEFINE_VSL_UNARY_FUNC_WITH_PARAM(Powx, y[i] = pow(a[i], b))
 
 // A simple way to define the vsl binary functions. The operation should
 // be in the form e.g. y[i] = a[i] + b[i]
@@ -77,10 +78,10 @@ DEFINE_VSL_UNARY_FUNC_WITH_PARAM(Powx, y[i] = pow(a[i], b));
     v##name<double>(n, a, b, y); \
   }
 
-DEFINE_VSL_BINARY_FUNC(Add, y[i] = a[i] + b[i]);
-DEFINE_VSL_BINARY_FUNC(Sub, y[i] = a[i] - b[i]);
-DEFINE_VSL_BINARY_FUNC(Mul, y[i] = a[i] * b[i]);
-DEFINE_VSL_BINARY_FUNC(Div, y[i] = a[i] / b[i]);
+DEFINE_VSL_BINARY_FUNC(Add, y[i] = a[i] + b[i])
+DEFINE_VSL_BINARY_FUNC(Sub, y[i] = a[i] - b[i])
+DEFINE_VSL_BINARY_FUNC(Mul, y[i] = a[i] * b[i])
+DEFINE_VSL_BINARY_FUNC(Div, y[i] = a[i] / b[i])
 
 // In addition, MKL comes with an additional function axpby that is not present
 // in standard blas. We will simply use a two-step (inefficient, of course) way
diff --git a/include/caffe/util/signal_handler.h b/include/caffe/util/signal_handler.h
index fb84c65bd2e..5246332581e 100644
--- a/include/caffe/util/signal_handler.h
+++ b/include/caffe/util/signal_handler.h
@@ -8,7 +8,7 @@ namespace caffe {
 
 class SignalHandler {
  public:
-  // Contructor. Specify what action to take when a signal is received.
+  // Constructor. Specify what action to take when a signal is received.
   SignalHandler(SolverAction::Enum SIGINT_action,
                 SolverAction::Enum SIGHUP_action);
   ~SignalHandler();
diff --git a/matlab/+caffe/Net.m b/matlab/+caffe/Net.m
index 349e060eb22..bb99ec89049 100644
--- a/matlab/+caffe/Net.m
+++ b/matlab/+caffe/Net.m
@@ -69,7 +69,9 @@
       self.blob_names = self.attributes.blob_names;
     end
     function delete (self)
-      caffe_('delete_net', self.hNet_self);
+      if ~isempty(self.hNet_self)
+        caffe_('delete_net', self.hNet_self);
+      end
     end
     function layer = layers(self, layer_name)
       CHECK(ischar(layer_name), 'layer_name must be a string');
diff --git a/models/bvlc_alexnet/readme.md b/models/bvlc_alexnet/readme.md
index 008d690f7f4..a83e3d4e27c 100644
--- a/models/bvlc_alexnet/readme.md
+++ b/models/bvlc_alexnet/readme.md
@@ -1,5 +1,5 @@
 ---
-name: BVLC AlexNet Model
+name: BAIR/BVLC AlexNet Model
 caffemodel: bvlc_alexnet.caffemodel
 caffemodel_url: http://dl.caffe.berkeleyvision.org/bvlc_alexnet.caffemodel
 license: unrestricted
diff --git a/models/bvlc_googlenet/readme.md b/models/bvlc_googlenet/readme.md
index 061b6d74530..ef04db62ab2 100644
--- a/models/bvlc_googlenet/readme.md
+++ b/models/bvlc_googlenet/readme.md
@@ -1,5 +1,5 @@
 ---
-name: BVLC GoogleNet Model
+name: BAIR/BVLC GoogleNet Model
 caffemodel: bvlc_googlenet.caffemodel
 caffemodel_url: http://dl.caffe.berkeleyvision.org/bvlc_googlenet.caffemodel
 license: unrestricted
diff --git a/models/bvlc_reference_caffenet/readme.md b/models/bvlc_reference_caffenet/readme.md
index 671e47a5056..5352e536a07 100644
--- a/models/bvlc_reference_caffenet/readme.md
+++ b/models/bvlc_reference_caffenet/readme.md
@@ -1,5 +1,5 @@
 ---
-name: BVLC CaffeNet Model
+name: BAIR/BVLC CaffeNet Model
 caffemodel: bvlc_reference_caffenet.caffemodel
 caffemodel_url: http://dl.caffe.berkeleyvision.org/bvlc_reference_caffenet.caffemodel
 license: unrestricted
diff --git a/models/bvlc_reference_rcnn_ilsvrc13/readme.md b/models/bvlc_reference_rcnn_ilsvrc13/readme.md
index 9a11a24d8f8..12543b2bd2c 100644
--- a/models/bvlc_reference_rcnn_ilsvrc13/readme.md
+++ b/models/bvlc_reference_rcnn_ilsvrc13/readme.md
@@ -1,5 +1,5 @@
 ---
-name: BVLC Reference RCNN ILSVRC13 Model
+name: BAIR/BVLC Reference RCNN ILSVRC13 Model
 caffemodel: bvlc_reference_rcnn_ilsvrc13.caffemodel
 caffemodel_url: http://dl.caffe.berkeleyvision.org/bvlc_reference_rcnn_ilsvrc13.caffemodel
 license: unrestricted
diff --git a/python/caffe/__init__.py b/python/caffe/__init__.py
index 43a0c49be63..776945eec88 100644
--- a/python/caffe/__init__.py
+++ b/python/caffe/__init__.py
@@ -1,5 +1,5 @@
 from .pycaffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, RMSPropSolver, AdaDeltaSolver, AdamSolver, NCCL, Timer
-from ._caffe import init_log, log, set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed, solver_count, set_solver_count, solver_rank, set_solver_rank, set_multiprocess, Layer, get_solver
+from ._caffe import init_log, log, set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed, solver_count, set_solver_count, solver_rank, set_solver_rank, set_multiprocess, has_nccl
 from ._caffe import __version__
 from .proto.caffe_pb2 import TRAIN, TEST
 from .classifier import Classifier
diff --git a/python/caffe/_caffe.cpp b/python/caffe/_caffe.cpp
index be011699098..82bf21e6e16 100644
--- a/python/caffe/_caffe.cpp
+++ b/python/caffe/_caffe.cpp
@@ -51,14 +51,18 @@ const int NPY_DTYPE = NPY_FLOAT32;
 void set_mode_cpu() { Caffe::set_mode(Caffe::CPU); }
 void set_mode_gpu() { Caffe::set_mode(Caffe::GPU); }
 
-void InitLog(int level) {
-  FLAGS_logtostderr = 1;
-  FLAGS_minloglevel = level;
+void InitLog() {
   ::google::InitGoogleLogging("");
   ::google::InstallFailureSignalHandler();
 }
-void InitLogInfo() {
-  InitLog(google::INFO);
+void InitLogLevel(int level) {
+  FLAGS_minloglevel = level;
+  InitLog();
+}
+void InitLogLevelPipe(int level, bool stderr) {
+  FLAGS_minloglevel = level;
+  FLAGS_logtostderr = stderr;
+  InitLog();
 }
 void Log(const string& s) {
   LOG(INFO) << s;
@@ -298,6 +302,10 @@ void Solver_add_nccl(Solver<Dtype>* solver
 #endif
 }
 
+void share_weights(Solver<Dtype>* solver, Net<Dtype>* net) {
+  net->ShareTrainedLayersWith(solver->net().get());
+}
+
 template<typename Dtype>
 class NetCallback: public Net<Dtype>::Callback {
  public:
@@ -339,6 +347,35 @@ class NCCL {
 };
 #endif
 
+bool HasNCCL() {
+#ifdef USE_NCCL
+  return true;
+#else
+  return false;
+#endif
+}
+
+#ifdef USE_NCCL
+bp::object NCCL_New_Uid() {
+  std::string uid = NCCL<Dtype>::new_uid();
+#if PY_MAJOR_VERSION >= 3
+  // Convert std::string to bytes so that Python does not
+  // try to decode the string using the current locale.
+
+  // Since boost 1.53 boost.python will convert str and bytes
+  // to std::string but will convert std::string to str. Here we
+  // force a bytes object to be returned. When this object
+  // is passed back to the NCCL constructor boost.python will
+  // correctly convert the bytes to std::string automatically
+  PyObject* py_uid = PyBytes_FromString(uid.c_str());
+  return bp::object(bp::handle<>(py_uid));
+#else
+  // automatic conversion is correct for python 2.
+  return bp::object(uid);
+#endif
+}
+#endif
+
 BOOST_PYTHON_MEMBER_FUNCTION_OVERLOADS(SolveOverloads, Solve, 0, 1);
 
 BOOST_PYTHON_MODULE(_caffe) {
@@ -349,8 +386,10 @@ BOOST_PYTHON_MODULE(_caffe) {
 
   // Caffe utility functions
   bp::def("init_log", &InitLog);
-  bp::def("init_log", &InitLogInfo);
+  bp::def("init_log", &InitLogLevel);
+  bp::def("init_log", &InitLogLevelPipe);
   bp::def("log", &Log);
+  bp::def("has_nccl", &HasNCCL);
   bp::def("set_mode_cpu", &set_mode_cpu);
   bp::def("set_mode_gpu", &set_mode_gpu);
   bp::def("set_random_seed", &set_random_seed);
@@ -377,7 +416,7 @@ BOOST_PYTHON_MODULE(_caffe) {
     .def("reshape", &Net<Dtype>::Reshape)
     .def("clear_param_diffs", &Net<Dtype>::ClearParamDiffs)
     // The cast is to select a particular overload.
-    .def("copy_from", static_cast<void (Net<Dtype>::*)(const string)>(
+    .def("copy_from", static_cast<void (Net<Dtype>::*)(const string&)>(
         &Net<Dtype>::CopyTrainedLayersFrom))
     .def("share_with", &Net<Dtype>::ShareTrainedLayersWith)
     .add_property("_blob_loss_weights", bp::make_function(
@@ -425,6 +464,14 @@ BOOST_PYTHON_MODULE(_caffe) {
     .add_property("count",    static_cast<int (Blob<Dtype>::*)() const>(
         &Blob<Dtype>::count))
     .def("reshape",           bp::raw_function(&Blob_Reshape))
+#ifndef CPU_ONLY
+    .add_property("_gpu_data_ptr",
+        reinterpret_cast<uintptr_t (Blob<Dtype>::*)()>(
+          &Blob<Dtype>::mutable_gpu_data))
+    .add_property("_gpu_diff_ptr",
+        reinterpret_cast<uintptr_t (Blob<Dtype>::*)()>(
+          &Blob<Dtype>::mutable_gpu_diff))
+#endif
     .add_property("data",     bp::make_function(&Blob<Dtype>::mutable_cpu_data,
           NdarrayCallPolicies()))
     .add_property("diff",     bp::make_function(&Blob<Dtype>::mutable_cpu_diff,
@@ -443,7 +490,9 @@ BOOST_PYTHON_MODULE(_caffe) {
   bp::class_<SolverParameter>("SolverParameter", bp::no_init)
     .add_property("max_iter", &SolverParameter::max_iter)
     .add_property("display", &SolverParameter::display)
-    .add_property("layer_wise_reduce", &SolverParameter::layer_wise_reduce);
+    .add_property("layer_wise_reduce", &SolverParameter::layer_wise_reduce)
+    .add_property("base_lr", &SolverParameter::base_lr,
+           &SolverParameter::set_base_lr);
   bp::class_<LayerParameter>("LayerParameter", bp::no_init);
 
   bp::class_<Solver<Dtype>, shared_ptr<Solver<Dtype> >, boost::noncopyable>(
@@ -459,26 +508,29 @@ BOOST_PYTHON_MODULE(_caffe) {
     .def("step", &Solver<Dtype>::Step)
     .def("restore", &Solver<Dtype>::Restore)
     .def("snapshot", &Solver<Dtype>::Snapshot)
+    .def("share_weights", &share_weights)
+    .def("apply_update", &Solver<Dtype>::ApplyUpdate)
     .add_property("param", bp::make_function(&Solver<Dtype>::param,
-              bp::return_value_policy<bp::copy_const_reference>()));
+              bp::return_internal_reference<>()));
   BP_REGISTER_SHARED_PTR_TO_PYTHON(Solver<Dtype>);
 
   bp::class_<SGDSolver<Dtype>, bp::bases<Solver<Dtype> >,
     shared_ptr<SGDSolver<Dtype> >, boost::noncopyable>(
-        "SGDSolver", bp::init<string>());
-  bp::class_<NesterovSolver<Dtype>, bp::bases<Solver<Dtype> >,
+        "SGDSolver", bp::init<string>())
+        .add_property("lr", &SGDSolver<Dtype>::GetLearningRate);
+  bp::class_<NesterovSolver<Dtype>, bp::bases<SGDSolver<Dtype> >,
     shared_ptr<NesterovSolver<Dtype> >, boost::noncopyable>(
         "NesterovSolver", bp::init<string>());
-  bp::class_<AdaGradSolver<Dtype>, bp::bases<Solver<Dtype> >,
+  bp::class_<AdaGradSolver<Dtype>, bp::bases<SGDSolver<Dtype> >,
     shared_ptr<AdaGradSolver<Dtype> >, boost::noncopyable>(
         "AdaGradSolver", bp::init<string>());
-  bp::class_<RMSPropSolver<Dtype>, bp::bases<Solver<Dtype> >,
+  bp::class_<RMSPropSolver<Dtype>, bp::bases<SGDSolver<Dtype> >,
     shared_ptr<RMSPropSolver<Dtype> >, boost::noncopyable>(
         "RMSPropSolver", bp::init<string>());
-  bp::class_<AdaDeltaSolver<Dtype>, bp::bases<Solver<Dtype> >,
+  bp::class_<AdaDeltaSolver<Dtype>, bp::bases<SGDSolver<Dtype> >,
     shared_ptr<AdaDeltaSolver<Dtype> >, boost::noncopyable>(
         "AdaDeltaSolver", bp::init<string>());
-  bp::class_<AdamSolver<Dtype>, bp::bases<Solver<Dtype> >,
+  bp::class_<AdamSolver<Dtype>, bp::bases<SGDSolver<Dtype> >,
     shared_ptr<AdamSolver<Dtype> >, boost::noncopyable>(
         "AdamSolver", bp::init<string>());
 
@@ -508,7 +560,7 @@ BOOST_PYTHON_MODULE(_caffe) {
     boost::noncopyable>("NCCL",
                         bp::init<shared_ptr<Solver<Dtype> >, const string&>())
 #ifdef USE_NCCL
-    .def("new_uid", &NCCL<Dtype>::new_uid).staticmethod("new_uid")
+    .def("new_uid", NCCL_New_Uid).staticmethod("new_uid")
     .def("bcast", &NCCL<Dtype>::Broadcast)
 #endif
     /* NOLINT_NEXT_LINE(whitespace/semicolon) */
diff --git a/python/caffe/classifier.py b/python/caffe/classifier.py
index ea29fed86f9..64d804be554 100644
--- a/python/caffe/classifier.py
+++ b/python/caffe/classifier.py
@@ -23,7 +23,7 @@ class Classifier(caffe.Net):
     def __init__(self, model_file, pretrained_file, image_dims=None,
                  mean=None, input_scale=None, raw_scale=None,
                  channel_swap=None):
-        caffe.Net.__init__(self, model_file, pretrained_file, caffe.TEST)
+        caffe.Net.__init__(self, model_file, caffe.TEST, weights=pretrained_file)
 
         # configure pre-processing
         in_ = self.inputs[0]
@@ -92,7 +92,7 @@ def predict(self, inputs, oversample=True):
 
         # For oversampling, average predictions across crops.
         if oversample:
-            predictions = predictions.reshape((len(predictions) / 10, 10, -1))
+            predictions = predictions.reshape((len(predictions) // 10, 10, -1))
             predictions = predictions.mean(1)
 
         return predictions
diff --git a/python/caffe/detector.py b/python/caffe/detector.py
index ef1f91730bf..ceee5d36f4c 100644
--- a/python/caffe/detector.py
+++ b/python/caffe/detector.py
@@ -35,7 +35,7 @@ class Detector(caffe.Net):
     def __init__(self, model_file, pretrained_file, mean=None,
                  input_scale=None, raw_scale=None, channel_swap=None,
                  context_pad=None):
-        caffe.Net.__init__(self, model_file, pretrained_file, caffe.TEST)
+        caffe.Net.__init__(self, model_file, caffe.TEST, weights=pretrained_file)
 
         # configure pre-processing
         in_ = self.inputs[0]
diff --git a/python/caffe/draw.py b/python/caffe/draw.py
index e4fd7aacce7..0061f490bd9 100644
--- a/python/caffe/draw.py
+++ b/python/caffe/draw.py
@@ -59,18 +59,60 @@ def get_edge_label(layer):
     return edge_label
 
 
-def get_layer_label(layer, rankdir):
+def get_layer_lr_mult(layer):
+    """Get the learning rate multipliers.
+
+    Get the learning rate multipliers for the given layer. Assumes a
+    Convolution/Deconvolution/InnerProduct layer.
+
+    Parameters
+    ----------
+    layer : caffe_pb2.LayerParameter
+        A Convolution, Deconvolution, or InnerProduct layer.
+
+    Returns
+    -------
+    learning_rates : tuple of floats
+        the learning rate multipliers for the weights and biases.
+    """
+    if layer.type not in ['Convolution', 'Deconvolution', 'InnerProduct']:
+        raise ValueError("%s layers do not have a "
+                         "learning rate multiplier" % layer.type)
+
+    if not hasattr(layer, 'param'):
+        return (1.0, 1.0)
+
+    params = getattr(layer, 'param')
+
+    if len(params) == 0:
+        return (1.0, 1.0)
+
+    if len(params) == 1:
+        lrm0 = getattr(params[0],'lr_mult', 1.0)
+        return (lrm0, 1.0)
+
+    if len(params) == 2:
+        lrm0, lrm1 = [getattr(p,'lr_mult', 1.0) for p in params]
+        return (lrm0, lrm1)
+
+    raise ValueError("Could not parse the learning rate multiplier")
+
+
+def get_layer_label(layer, rankdir, display_lrm=False):
     """Define node label based on layer type.
 
     Parameters
     ----------
-    layer : ?
+    layer : caffe_pb2.LayerParameter
     rankdir : {'LR', 'TB', 'BT'}
         Direction of graph layout.
+    display_lrm : boolean, optional
+        If True include the learning rate multipliers in the label (default is
+        False).
 
     Returns
     -------
-    string :
+    node_label : string
         A label for the current layer
     """
 
@@ -81,36 +123,54 @@ def get_layer_label(layer, rankdir):
     else:
         # If graph orientation is horizontal, vertical space is free and
         # horizontal space is not; separate words with newlines
-        separator = '\\n'
-
-    if layer.type == 'Convolution' or layer.type == 'Deconvolution':
-        # Outer double quotes needed or else colon characters don't parse
-        # properly
-        node_label = '"%s%s(%s)%skernel size: %d%sstride: %d%spad: %d"' %\
-                     (layer.name,
-                      separator,
-                      layer.type,
-                      separator,
-                      layer.convolution_param.kernel_size[0] if len(layer.convolution_param.kernel_size._values) else 1,
-                      separator,
-                      layer.convolution_param.stride[0] if len(layer.convolution_param.stride._values) else 1,
-                      separator,
-                      layer.convolution_param.pad[0] if len(layer.convolution_param.pad._values) else 0)
-    elif layer.type == 'Pooling':
+        separator = r'\n'
+
+    # Initializes a list of descriptors that will be concatenated into the
+    # `node_label`
+    descriptors_list = []
+    # Add the layer's name
+    descriptors_list.append(layer.name)
+    # Add layer's type
+    if layer.type == 'Pooling':
         pooling_types_dict = get_pooling_types_dict()
-        node_label = '"%s%s(%s %s)%skernel size: %d%sstride: %d%spad: %d"' %\
-                     (layer.name,
-                      separator,
-                      pooling_types_dict[layer.pooling_param.pool],
-                      layer.type,
-                      separator,
-                      layer.pooling_param.kernel_size[0] if len(layer.pooling_param.kernel_size._values) else 1,
-                      separator,
-                      layer.pooling_param.stride[0] if len(layer.pooling_param.stride._values) else 1,
-                      separator,
-                      layer.pooling_param.pad[0] if len(layer.pooling_param.pad._values) else 0)
+        layer_type = '(%s %s)' % (layer.type,
+                                  pooling_types_dict[layer.pooling_param.pool])
     else:
-        node_label = '"%s%s(%s)"' % (layer.name, separator, layer.type)
+        layer_type = '(%s)' % layer.type
+    descriptors_list.append(layer_type)
+
+    # Describe parameters for spatial operation layers
+    if layer.type in ['Convolution', 'Deconvolution', 'Pooling']:
+        if layer.type == 'Pooling':
+            kernel_size = layer.pooling_param.kernel_size
+            stride = layer.pooling_param.stride
+            padding = layer.pooling_param.pad
+        else:
+            kernel_size = layer.convolution_param.kernel_size[0] if \
+                len(layer.convolution_param.kernel_size) else 1
+            stride = layer.convolution_param.stride[0] if \
+                len(layer.convolution_param.stride) else 1
+            padding = layer.convolution_param.pad[0] if \
+                len(layer.convolution_param.pad) else 0
+        spatial_descriptor = separator.join([
+            "kernel size: %d" % kernel_size,
+            "stride: %d" % stride,
+            "pad: %d" % padding,
+        ])
+        descriptors_list.append(spatial_descriptor)
+
+    # Add LR multiplier for learning layers
+    if display_lrm and layer.type in ['Convolution', 'Deconvolution', 'InnerProduct']:
+        lrm0, lrm1 = get_layer_lr_mult(layer)
+        if any([lrm0, lrm1]):
+            lr_mult = "lr mult: %.1f, %.1f" % (lrm0, lrm1)
+            descriptors_list.append(lr_mult)
+
+    # Concatenate the descriptors into one label
+    node_label = separator.join(descriptors_list)
+    # Outer double quotes needed or else colon characters don't parse
+    # properly
+    node_label = '"%s"' % node_label
     return node_label
 
 
@@ -127,7 +187,7 @@ def choose_color_by_layertype(layertype):
     return color
 
 
-def get_pydot_graph(caffe_net, rankdir, label_edges=True, phase=None):
+def get_pydot_graph(caffe_net, rankdir, label_edges=True, phase=None, display_lrm=False):
     """Create a data structure which represents the `caffe_net`.
 
     Parameters
@@ -140,6 +200,9 @@ def get_pydot_graph(caffe_net, rankdir, label_edges=True, phase=None):
     phase : {caffe_pb2.Phase.TRAIN, caffe_pb2.Phase.TEST, None} optional
         Include layers from this network phase.  If None, include all layers.
         (the default is None)
+    display_lrm : boolean, optional
+        If True display the learning rate multipliers when relevant (default is
+        False).
 
     Returns
     -------
@@ -164,7 +227,7 @@ def get_pydot_graph(caffe_net, rankdir, label_edges=True, phase=None):
             included = included and not layer_phase.phase == phase
           if not included:
             continue
-        node_label = get_layer_label(layer, rankdir)
+        node_label = get_layer_label(layer, rankdir, display_lrm=display_lrm)
         node_name = "%s_%s" % (layer.name, layer.type)
         if (len(layer.bottom) == 1 and len(layer.top) == 1 and
            layer.bottom[0] == layer.top[0]):
@@ -202,7 +265,7 @@ def get_pydot_graph(caffe_net, rankdir, label_edges=True, phase=None):
     return pydot_graph
 
 
-def draw_net(caffe_net, rankdir, ext='png', phase=None):
+def draw_net(caffe_net, rankdir, ext='png', phase=None, display_lrm=False):
     """Draws a caffe net and returns the image string encoded using the given
     extension.
 
@@ -214,16 +277,20 @@ def draw_net(caffe_net, rankdir, ext='png', phase=None):
     phase : {caffe_pb2.Phase.TRAIN, caffe_pb2.Phase.TEST, None} optional
         Include layers from this network phase.  If None, include all layers.
         (the default is None)
+    display_lrm : boolean, optional
+        If True display the learning rate multipliers for the learning layers
+        (default is False).
 
     Returns
     -------
     string :
         Postscript representation of the graph.
     """
-    return get_pydot_graph(caffe_net, rankdir, phase=phase).create(format=ext)
+    return get_pydot_graph(caffe_net, rankdir, phase=phase,
+                           display_lrm=display_lrm).create(format=ext)
 
 
-def draw_net_to_file(caffe_net, filename, rankdir='LR', phase=None):
+def draw_net_to_file(caffe_net, filename, rankdir='LR', phase=None, display_lrm=False):
     """Draws a caffe net, and saves it to file using the format given as the
     file extension. Use '.raw' to output raw text that you can manually feed
     to graphviz to draw graphs.
@@ -238,7 +305,10 @@ def draw_net_to_file(caffe_net, filename, rankdir='LR', phase=None):
     phase : {caffe_pb2.Phase.TRAIN, caffe_pb2.Phase.TEST, None} optional
         Include layers from this network phase.  If None, include all layers.
         (the default is None)
+    display_lrm : boolean, optional
+        If True display the learning rate multipliers for the learning layers
+        (default is False).
     """
     ext = filename[filename.rfind('.')+1:]
     with open(filename, 'wb') as fid:
-        fid.write(draw_net(caffe_net, rankdir, ext, phase))
+        fid.write(draw_net(caffe_net, rankdir, ext, phase, display_lrm))
diff --git a/python/caffe/io.py b/python/caffe/io.py
index e1759beb587..d0103d60892 100644
--- a/python/caffe/io.py
+++ b/python/caffe/io.py
@@ -75,7 +75,7 @@ def array_to_datum(arr, label=None):
     if arr.dtype == np.uint8:
         datum.data = arr.tostring()
     else:
-        datum.float_data.extend(arr.flat)
+        datum.float_data.extend(arr.astype(float).flat)
     if label is not None:
         datum.label = label
     return datum
@@ -186,13 +186,14 @@ def deprocess(self, in_, data):
 
     def set_transpose(self, in_, order):
         """
-        Set the input channel order for e.g. RGB to BGR conversion
-        as needed for the reference ImageNet model.
+        Set the order of dimensions, e.g. to convert OpenCV's HxWxC images
+        into CxHxW.
 
         Parameters
         ----------
-        in_ : which input to assign this channel order
+        in_ : which input to assign this dimension order
         order : the order to transpose the dimensions
+            for example (2,0,1) changes HxWxC into CxHxW and (1,2,0) reverts
         """
         self.__check_input(in_)
         if len(order) != len(self.inputs[in_]) - 1:
@@ -256,7 +257,12 @@ def set_mean(self, in_, mean):
             if len(ms) != 3:
                 raise ValueError('Mean shape invalid')
             if ms != self.inputs[in_][1:]:
-                raise ValueError('Mean shape incompatible with input shape.')
+                in_shape = self.inputs[in_][1:]
+                m_min, m_max = mean.min(), mean.max()
+                normal_mean = (mean - m_min) / (m_max - m_min)
+                mean = resize_image(normal_mean.transpose((1,2,0)),
+                        in_shape[1:]).transpose((2,0,1)) * \
+                        (m_max - m_min) + m_min
         self.mean[in_] = mean
 
     def set_input_scale(self, in_, scale):
@@ -323,7 +329,7 @@ def resize_image(im, new_dims, interp_order=1):
             # skimage is fast but only understands {1,3} channel images
             # in [0, 1].
             im_std = (im - im_min) / (im_max - im_min)
-            resized_std = resize(im_std, new_dims, order=interp_order)
+            resized_std = resize(im_std, new_dims, order=interp_order, mode='constant')
             resized_im = resized_std * (im_max - im_min) + im_min
         else:
             # the image is a constant -- avoid divide by 0
diff --git a/python/caffe/net_spec.py b/python/caffe/net_spec.py
index 5fb1f0b3fb1..20918f9b6bc 100644
--- a/python/caffe/net_spec.py
+++ b/python/caffe/net_spec.py
@@ -103,6 +103,10 @@ class Function(object):
 
     def __init__(self, type_name, inputs, params):
         self.type_name = type_name
+        for index, input in enumerate(inputs):
+            if not isinstance(input, Top):
+                raise TypeError('%s input %d is not a Top (type is %s)' %
+                                (type_name, index, type(input)))
         self.inputs = inputs
         self.params = params
         self.ntop = self.params.get('ntop', 1)
diff --git a/python/caffe/pycaffe.py b/python/caffe/pycaffe.py
index 63606591bb4..4a7b5a24c46 100644
--- a/python/caffe/pycaffe.py
+++ b/python/caffe/pycaffe.py
@@ -113,7 +113,7 @@ def _Net_forward(self, blobs=None, start=None, end=None, **kwargs):
 
     if end is not None:
         end_ind = list(self._layer_names).index(end)
-        outputs = set([end] + blobs)
+        outputs = set(self.top_names[end] + blobs)
     else:
         end_ind = len(self.layers) - 1
         outputs = set(self.outputs + blobs)
@@ -161,7 +161,7 @@ def _Net_backward(self, diffs=None, start=None, end=None, **kwargs):
 
     if end is not None:
         end_ind = list(self._layer_names).index(end)
-        outputs = set([end] + diffs)
+        outputs = set(self.bottom_names[end] + diffs)
     else:
         end_ind = 0
         outputs = set(self.inputs + diffs)
diff --git a/python/caffe/test/test_draw.py b/python/caffe/test/test_draw.py
new file mode 100644
index 00000000000..835bb5df010
--- /dev/null
+++ b/python/caffe/test/test_draw.py
@@ -0,0 +1,37 @@
+import os
+import unittest
+
+from google.protobuf import text_format
+
+import caffe.draw
+from caffe.proto import caffe_pb2
+
+def getFilenames():
+    """Yields files in the source tree which are Net prototxts."""
+    result = []
+
+    root_dir = os.path.abspath(os.path.join(
+        os.path.dirname(__file__), '..', '..', '..'))
+    assert os.path.exists(root_dir)
+
+    for dirname in ('models', 'examples'):
+        dirname = os.path.join(root_dir, dirname)
+        assert os.path.exists(dirname)
+        for cwd, _, filenames in os.walk(dirname):
+            for filename in filenames:
+                filename = os.path.join(cwd, filename)
+                if filename.endswith('.prototxt') and 'solver' not in filename:
+                    yield os.path.join(dirname, filename)
+
+
+class TestDraw(unittest.TestCase):
+    def test_draw_net(self):
+        for filename in getFilenames():
+            net = caffe_pb2.NetParameter()
+            with open(filename) as infile:
+                text_format.Merge(infile.read(), net)
+            caffe.draw.draw_net(net, 'LR')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/caffe/test/test_nccl.py b/python/caffe/test/test_nccl.py
new file mode 100644
index 00000000000..127a9337040
--- /dev/null
+++ b/python/caffe/test/test_nccl.py
@@ -0,0 +1,19 @@
+import sys
+import unittest
+
+import caffe
+
+
+class TestNCCL(unittest.TestCase):
+
+    def test_newuid(self):
+        """
+        Test that NCCL uids are of the proper type
+        according to python version
+        """
+        if caffe.has_nccl():
+            uid = caffe.NCCL.new_uid()
+            if sys.version_info.major >= 3:
+                self.assertTrue(isinstance(uid, bytes))
+            else:
+                self.assertTrue(isinstance(uid, str))
diff --git a/python/caffe/test/test_net.py b/python/caffe/test/test_net.py
index 24391cc50c4..ee1d38c39db 100644
--- a/python/caffe/test/test_net.py
+++ b/python/caffe/test/test_net.py
@@ -25,11 +25,11 @@ def simple_net_file(num_output):
         bias_filler { type: 'constant' value: 2 } }
         param { decay_mult: 1 } param { decay_mult: 0 }
         }
-    layer { type: 'InnerProduct' name: 'ip' bottom: 'conv' top: 'ip'
+    layer { type: 'InnerProduct' name: 'ip' bottom: 'conv' top: 'ip_blob'
       inner_product_param { num_output: """ + str(num_output) + """
         weight_filler { type: 'gaussian' std: 2.5 }
         bias_filler { type: 'constant' value: -3 } } }
-    layer { type: 'SoftmaxWithLoss' name: 'loss' bottom: 'ip' bottom: 'label'
+    layer { type: 'SoftmaxWithLoss' name: 'loss' bottom: 'ip_blob' bottom: 'label'
       top: 'loss' }""")
     f.close()
     return f.name
@@ -71,6 +71,43 @@ def test_forward_backward(self):
         self.net.forward()
         self.net.backward()
 
+    def test_forward_start_end(self):
+        conv_blob=self.net.blobs['conv']
+        ip_blob=self.net.blobs['ip_blob']
+        sample_data=np.random.uniform(size=conv_blob.data.shape)
+        sample_data=sample_data.astype(np.float32)
+        conv_blob.data[:]=sample_data
+        forward_blob=self.net.forward(start='ip',end='ip')
+        self.assertIn('ip_blob',forward_blob)
+
+        manual_forward=[]
+        for i in range(0,conv_blob.data.shape[0]):
+          dot=np.dot(self.net.params['ip'][0].data,
+                     conv_blob.data[i].reshape(-1))
+          manual_forward.append(dot+self.net.params['ip'][1].data)
+        manual_forward=np.array(manual_forward)
+
+        np.testing.assert_allclose(ip_blob.data,manual_forward,rtol=1e-3,atol=1e-5)
+
+    def test_backward_start_end(self):
+        conv_blob=self.net.blobs['conv']
+        ip_blob=self.net.blobs['ip_blob']
+        sample_data=np.random.uniform(size=ip_blob.data.shape)
+        sample_data=sample_data.astype(np.float32)
+        ip_blob.diff[:]=sample_data
+        backward_blob=self.net.backward(start='ip',end='ip')
+        self.assertIn('conv',backward_blob)
+
+        manual_backward=[]
+        for i in range(0,conv_blob.data.shape[0]):
+          dot=np.dot(self.net.params['ip'][0].data.transpose(),
+                     sample_data[i].reshape(-1))
+          manual_backward.append(dot)
+        manual_backward=np.array(manual_backward)
+        manual_backward=manual_backward.reshape(conv_blob.data.shape)
+
+        np.testing.assert_allclose(conv_blob.diff,manual_backward,rtol=1e-3,atol=1e-5)
+
     def test_clear_param_diffs(self):
         # Run a forward/backward step to have non-zero diffs
         self.net.forward()
@@ -90,13 +127,13 @@ def test_top_bottom_names(self):
         self.assertEqual(self.net.top_names,
                          OrderedDict([('data', ['data', 'label']),
                                       ('conv', ['conv']),
-                                      ('ip', ['ip']),
+                                      ('ip', ['ip_blob']),
                                       ('loss', ['loss'])]))
         self.assertEqual(self.net.bottom_names,
                          OrderedDict([('data', []),
                                       ('conv', ['data']),
                                       ('ip', ['conv']),
-                                      ('loss', ['ip', 'label'])]))
+                                      ('loss', ['ip_blob', 'label'])]))
 
     def test_save_and_read(self):
         f = tempfile.NamedTemporaryFile(mode='w+', delete=False)
diff --git a/python/caffe/test/test_net_spec.py b/python/caffe/test/test_net_spec.py
index fee3c0aaebe..ffe71bacb08 100644
--- a/python/caffe/test/test_net_spec.py
+++ b/python/caffe/test/test_net_spec.py
@@ -79,3 +79,11 @@ def test_zero_tops(self):
         net_proto = silent_net()
         net = self.load_net(net_proto)
         self.assertEqual(len(net.forward()), 0)
+
+    def test_type_error(self):
+        """Test that a TypeError is raised when a Function input isn't a Top."""
+        data = L.DummyData(ntop=2)  # data is a 2-tuple of Tops
+        r = r"^Silence input 0 is not a Top \(type is <(type|class) 'tuple'>\)$"
+        with self.assertRaisesRegexp(TypeError, r):
+            L.Silence(data, ntop=0)  # should raise: data is a tuple, not a Top
+        L.Silence(*data, ntop=0)  # shouldn't raise: each elt of data is a Top
diff --git a/python/caffe/test/test_solver.py b/python/caffe/test/test_solver.py
index f618fded8cd..50c9d5412d7 100644
--- a/python/caffe/test/test_solver.py
+++ b/python/caffe/test/test_solver.py
@@ -38,6 +38,17 @@ def test_solve(self):
         self.solver.solve()
         self.assertEqual(self.solver.iter, 100)
 
+    def test_apply_update(self):
+        net = self.solver.net
+        data = net.layers[1].blobs[0].data[...]
+        # Reset the weights of that layer to 0
+        data[...] = 0
+        net.layers[1].blobs[0].diff[...] = 1
+        # Apply the update, the initial learning rate should be 0.01
+        self.solver.apply_update()
+        # Check that the new weights are -0.01, with a precision of 1e-7
+        self.assertTrue((data - -0.01 * np.ones(data.shape)).max() < 1e-7)
+
     def test_net_memory(self):
         """Check that nets survive after the solver is destroyed."""
 
diff --git a/python/draw_net.py b/python/draw_net.py
index dfe70d26a71..23cae30aef2 100755
--- a/python/draw_net.py
+++ b/python/draw_net.py
@@ -33,6 +33,10 @@ def parse_args():
                               'TEST, or ALL.  If ALL, then all layers are drawn '
                               'regardless of phase.'),
                         default="ALL")
+    parser.add_argument('--display_lrm', action='store_true',
+                        help=('Use this flag to visualize the learning rate '
+                              'multiplier, when non-zero, for the learning '
+                              'layers (Convolution, Deconvolution, InnerProduct).'))
 
     args = parser.parse_args()
     return args
@@ -51,7 +55,7 @@ def main():
     elif args.phase != "ALL":
         raise ValueError("Unknown phase: " + args.phase)
     caffe.draw.draw_net_to_file(net, args.output_image_file, args.rankdir,
-                                phase)
+                                phase, args.display_lrm)
 
 
 if __name__ == '__main__':
diff --git a/python/train.py b/python/train.py
index 5897f5dcb90..14a38b8cef1 100644
--- a/python/train.py
+++ b/python/train.py
@@ -63,8 +63,8 @@ def show_time():
 
 
 def solve(proto, snapshot, gpus, timing, uid, rank):
-    caffe.set_mode_gpu()
     caffe.set_device(gpus[rank])
+    caffe.set_mode_gpu()
     caffe.set_solver_count(len(gpus))
     caffe.set_solver_rank(rank)
     caffe.set_multiprocess(True)
diff --git a/scripts/caffe b/scripts/caffe
new file mode 100644
index 00000000000..8a0b22af6ac
--- /dev/null
+++ b/scripts/caffe
@@ -0,0 +1,73 @@
+# bash completion for Caffe's command line utility       -*- shell-script -*-
+# COPYRIGHT (C) 2015,2016 Zhou Mo <cdluminate@gmail.com>
+# License: BSD-2-Clause
+# Originally appeard at https://github.com/BVLC/caffe/issues/3149
+
+# Updated for caffe (1.0.0~rc3+20160715-g42cd785)
+_caffe()
+{
+  local cur prev words cword
+  _init_completion -s || return
+
+  local prototxts='@(prototxt)'
+  local caffemodels='@(caffemodel,binaryproto)'
+  local solverstates='@(solverstate)'
+  local caffefiles='@(prototxt|caffemodel|solverstate)'
+
+  local flags='-gpu -iterations -model -snapshot -solver -weights -sighup_effect -sigint_effect -level -stage -phase'
+  
+  if [[ $cword -eq 1 ]]; then
+    COMPREPLY=( $( compgen -W 'train test time device_query' -- "$cur" ) )
+    return 0
+  fi
+  
+  if [[ $cword -eq 2 ]]; then
+    case ${words[1]} in
+    train|test|device_query|time)
+      COMPREPLY=( $( compgen -W "$flags" -- "$cur") )
+      return 0
+      ;;
+    *)
+      return 0
+      ;;
+    esac
+  fi
+
+  case $prev in
+  -gpu|-iterations|-version|-level|-stage)
+    return 0
+    ;;
+  -solver|-model)
+    _filedir $prototxts
+    return 0
+    ;;
+  -weights)
+    _filedir $caffemodels
+    return 0
+    ;;
+  -snapshot)
+    _filedir $solverstates
+    return 0
+    ;;
+  -sighup_effect|-sigint_effect)
+    COMPREPLY=( $( compgen -W 'snapshot stop none' -- "$cur") )
+    return 0
+    ;;
+  -phase)
+    COMPREPLY=( $( compgen -W 'TRAIN TEST' -- "$cur") )
+    return 0
+    ;;
+  *)
+    COMPREPLY=( $( compgen -W "$flags" -- "$cur") )
+    return 0
+    ;;
+  esac
+
+  # file completion on relevant files
+  _filedir "$caffefiles"
+
+  return 0
+}
+complete -F _caffe caffe
+
+# vim
diff --git a/scripts/cpp_lint.py b/scripts/cpp_lint.py
index 6ec4fb76e2c..fb44026718e 100755
--- a/scripts/cpp_lint.py
+++ b/scripts/cpp_lint.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python2
+#!/usr/bin/env python
 #
 # Copyright (c) 2009 Google Inc. All rights reserved.
 #
@@ -52,6 +52,10 @@
 import sys
 import unicodedata
 
+import six
+
+from six import iteritems, itervalues
+from six.moves import xrange
 
 _USAGE = """
 Syntax: cpp_lint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...]
@@ -207,7 +211,7 @@
   'whitespace/todo'
   ]
 
-# The default state of the category filter. This is overrided by the --filter=
+# The default state of the category filter. This is overridden by the --filter=
 # flag. By default all errors are on, so only add here categories that should be
 # off by default (i.e., categories that must be enabled by the --filter= flags).
 # All entries here should start with a '-' or '+', as in the --filter= flag.
@@ -756,7 +760,7 @@ def IncrementErrorCount(self, category):
 
   def PrintErrorCounts(self):
     """Print a summary of errors by category, and the total."""
-    for category, count in self.errors_by_category.iteritems():
+    for category, count in iteritems(self.errors_by_category):
       sys.stderr.write('Category \'%s\' errors found: %d\n' %
                        (category, count))
     sys.stderr.write('Total errors found: %d\n' % self.error_count)
@@ -3444,16 +3448,16 @@ def GetLineWidth(line):
     The width of the line in column positions, accounting for Unicode
     combining characters and wide characters.
   """
-  if isinstance(line, unicode):
-    width = 0
-    for uc in unicodedata.normalize('NFC', line):
-      if unicodedata.east_asian_width(uc) in ('W', 'F'):
-        width += 2
-      elif not unicodedata.combining(uc):
-        width += 1
-    return width
-  else:
-    return len(line)
+  if six.PY2:
+    if isinstance(line, unicode):
+      width = 0
+      for uc in unicodedata.normalize('NFC', line):
+        if unicodedata.east_asian_width(uc) in ('W', 'F'):
+          width += 2
+        elif not unicodedata.combining(uc):
+          width += 1
+      return width
+  return len(line)
 
 
 def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
@@ -3774,7 +3778,7 @@ def _GetTextInside(text, start_pattern):
 
   # Give opening punctuations to get the matching close-punctuations.
   matching_punctuation = {'(': ')', '{': '}', '[': ']'}
-  closing_punctuation = set(matching_punctuation.itervalues())
+  closing_punctuation = set(itervalues(matching_punctuation))
 
   # Find the position to start extracting text.
   match = re.search(start_pattern, text, re.M)
@@ -4851,10 +4855,11 @@ def main():
 
   # Change stderr to write with replacement characters so we don't die
   # if we try to print something containing non-ASCII characters.
-  sys.stderr = codecs.StreamReaderWriter(sys.stderr,
-                                         codecs.getreader('utf8'),
-                                         codecs.getwriter('utf8'),
-                                         'replace')
+  if six.PY2:
+    sys.stderr = codecs.StreamReaderWriter(sys.stderr,
+                                          codecs.getreader('utf8'),
+                                          codecs.getwriter('utf8'),
+                                          'replace')
 
   _cpplint_state.ResetErrorCounts()
   for filename in filenames:
diff --git a/scripts/travis/install-deps.sh b/scripts/travis/install-deps.sh
index 1900b16df54..abf9cf1ca70 100755
--- a/scripts/travis/install-deps.sh
+++ b/scripts/travis/install-deps.sh
@@ -8,6 +8,7 @@ source $BASEDIR/defaults.sh
 apt-get -y update
 apt-get install -y --no-install-recommends \
   build-essential \
+  graphviz \
   libboost-filesystem-dev \
   libboost-python-dev \
   libboost-system-dev \
@@ -31,6 +32,7 @@ if ! $WITH_PYTHON3 ; then
     python-dev \
     python-numpy \
     python-protobuf \
+    python-pydot \
     python-skimage
 else
   # Python3
@@ -104,7 +106,7 @@ if $WITH_CUDA ; then
   ln -s /usr/local/cuda-$CUDA_VERSION /usr/local/cuda
 
   if $WITH_CUDNN ; then
-    apt-get install -y --no-install-recommends libcudnn5-dev
+    apt-get install -y --no-install-recommends libcudnn7-dev
   fi
 fi
 
diff --git a/scripts/travis/install-python-deps.sh b/scripts/travis/install-python-deps.sh
index eeec302791f..910d35a93be 100755
--- a/scripts/travis/install-python-deps.sh
+++ b/scripts/travis/install-python-deps.sh
@@ -11,4 +11,5 @@ if ! $WITH_PYTHON3 ; then
 else
   # Python3
   pip install --pre protobuf==3.0.0b3
+  pip install pydot
 fi
diff --git a/src/caffe/CMakeLists.txt b/src/caffe/CMakeLists.txt
index 7b25a98aa2d..4a805568566 100644
--- a/src/caffe/CMakeLists.txt
+++ b/src/caffe/CMakeLists.txt
@@ -3,12 +3,12 @@ file(GLOB proto_files proto/*.proto)
 caffe_protobuf_generate_cpp_py(${proto_gen_folder} proto_srcs proto_hdrs proto_python ${proto_files})
 
 # include python files either to force generation
-add_library(proto STATIC ${proto_hdrs} ${proto_srcs} ${proto_python})
-caffe_default_properties(proto)
-target_link_libraries(proto PUBLIC ${PROTOBUF_LIBRARIES})
-target_include_directories(proto PUBLIC ${PROTOBUF_INCLUDE_DIR})
+add_library(caffeproto STATIC ${proto_hdrs} ${proto_srcs} ${proto_python})
+caffe_default_properties(caffeproto)
+target_link_libraries(caffeproto PUBLIC ${PROTOBUF_LIBRARIES})
+target_include_directories(caffeproto PUBLIC ${PROTOBUF_INCLUDE_DIR})
 
-list(INSERT Caffe_LINKER_LIBS 0 PUBLIC proto) # note, crucial to prepend!
+list(INSERT Caffe_LINKER_LIBS 0 PUBLIC caffeproto) # note, crucial to prepend!
 
 # --[ Caffe library
 
@@ -40,9 +40,9 @@ set_target_properties(caffe PROPERTIES
  add_subdirectory(test)
 
 # ---[ Install
-install(DIRECTORY ${Caffe_INCLUDE_DIR}/caffe DESTINATION include)
-install(FILES ${proto_hdrs} DESTINATION include/caffe/proto)
-install(TARGETS caffe proto EXPORT CaffeTargets DESTINATION lib)
+install(DIRECTORY ${Caffe_INCLUDE_DIR}/caffe DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+install(FILES ${proto_hdrs} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/caffe/proto)
+install(TARGETS caffe caffeproto EXPORT CaffeTargets DESTINATION ${CMAKE_INSTALL_LIBDIR})
 
 file(WRITE ${PROJECT_BINARY_DIR}/__init__.py)
 list(APPEND proto_python ${PROJECT_BINARY_DIR}/__init__.py)
diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp
index f14253a510e..d9984431ace 100644
--- a/src/caffe/layer_factory.cpp
+++ b/src/caffe/layer_factory.cpp
@@ -7,7 +7,9 @@
 
 #include "caffe/layer.hpp"
 #include "caffe/layer_factory.hpp"
+#include "caffe/layers/clip_layer.hpp"
 #include "caffe/layers/conv_layer.hpp"
+#include "caffe/layers/deconv_layer.hpp"
 #include "caffe/layers/lrn_layer.hpp"
 #include "caffe/layers/pooling_layer.hpp"
 #include "caffe/layers/relu_layer.hpp"
@@ -18,6 +20,7 @@
 
 #ifdef USE_CUDNN
 #include "caffe/layers/cudnn_conv_layer.hpp"
+#include "caffe/layers/cudnn_deconv_layer.hpp"
 #include "caffe/layers/cudnn_lcn_layer.hpp"
 #include "caffe/layers/cudnn_lrn_layer.hpp"
 #include "caffe/layers/cudnn_pooling_layer.hpp"
@@ -73,6 +76,45 @@ shared_ptr<Layer<Dtype> > GetConvolutionLayer(
 
 REGISTER_LAYER_CREATOR(Convolution, GetConvolutionLayer);
 
+// Get deconvolution layer according to engine.
+template <typename Dtype>
+shared_ptr<Layer<Dtype> > GetDeconvolutionLayer(const LayerParameter& param) {
+  ConvolutionParameter conv_param = param.convolution_param();
+  ConvolutionParameter_Engine engine = conv_param.engine();
+#ifdef USE_CUDNN
+  bool use_dilation = false;
+  for (int i = 0; i < conv_param.dilation_size(); ++i) {
+    if (conv_param.dilation(i) > 1) {
+      use_dilation = true;
+    }
+  }
+#endif
+  if (engine == ConvolutionParameter_Engine_DEFAULT) {
+    engine = ConvolutionParameter_Engine_CAFFE;
+#ifdef USE_CUDNN
+    if (!use_dilation) {
+      engine = ConvolutionParameter_Engine_CUDNN;
+    }
+#endif
+  }
+  if (engine == ConvolutionParameter_Engine_CAFFE) {
+    return shared_ptr<Layer<Dtype> >(new DeconvolutionLayer<Dtype>(param));
+#ifdef USE_CUDNN
+  } else if (engine == ConvolutionParameter_Engine_CUDNN) {
+    if (use_dilation) {
+      LOG(FATAL) << "CuDNN doesn't support the dilated deconvolution at Layer "
+                 << param.name();
+    }
+    return shared_ptr<Layer<Dtype> >(new CuDNNDeconvolutionLayer<Dtype>(param));
+#endif
+  } else {
+    LOG(FATAL) << "Layer " << param.name() << " has unknown engine.";
+    throw;  // Avoids missing return warning
+  }
+}
+
+REGISTER_LAYER_CREATOR(Deconvolution, GetDeconvolutionLayer);
+
 // Get pooling layer according to engine.
 template <typename Dtype>
 shared_ptr<Layer<Dtype> > GetPoolingLayer(const LayerParameter& param) {
diff --git a/src/caffe/layers/accuracy_layer.cpp b/src/caffe/layers/accuracy_layer.cpp
index 4eddbb5c850..b6d95b54a5d 100644
--- a/src/caffe/layers/accuracy_layer.cpp
+++ b/src/caffe/layers/accuracy_layer.cpp
@@ -52,8 +52,6 @@ void AccuracyLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   const Dtype* bottom_label = bottom[1]->cpu_data();
   const int dim = bottom[0]->count() / outer_num_;
   const int num_labels = bottom[0]->shape(label_axis_);
-  vector<Dtype> maxval(top_k_+1);
-  vector<int> max_id(top_k_+1);
   if (top.size() > 1) {
     caffe_set(nums_buffer_.count(), Dtype(0), nums_buffer_.mutable_cpu_data());
     caffe_set(top[1]->count(), Dtype(0), top[1]->mutable_cpu_data());
@@ -66,32 +64,29 @@ void AccuracyLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
       if (has_ignore_label_ && label_value == ignore_label_) {
         continue;
       }
-      if (top.size() > 1) ++nums_buffer_.mutable_cpu_data()[label_value];
       DCHECK_GE(label_value, 0);
       DCHECK_LT(label_value, num_labels);
+      if (top.size() > 1) ++nums_buffer_.mutable_cpu_data()[label_value];
+      const Dtype prob_of_true_class = bottom_data[i * dim
+                                                   + label_value * inner_num_
+                                                   + j];
+      int num_better_predictions = -1;  // true_class also counts as "better"
       // Top-k accuracy
-      std::vector<std::pair<Dtype, int> > bottom_data_vector;
-      for (int k = 0; k < num_labels; ++k) {
-        bottom_data_vector.push_back(std::make_pair(
-            bottom_data[i * dim + k * inner_num_ + j], k));
+      for (int k = 0; k < num_labels && num_better_predictions < top_k_; ++k) {
+        num_better_predictions +=
+          (bottom_data[i * dim + k * inner_num_ + j] >= prob_of_true_class);
       }
-      std::partial_sort(
-          bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_,
-          bottom_data_vector.end(), std::greater<std::pair<Dtype, int> >());
-      // check if true label is in top k predictions
-      for (int k = 0; k < top_k_; k++) {
-        if (bottom_data_vector[k].second == label_value) {
-          ++accuracy;
-          if (top.size() > 1) ++top[1]->mutable_cpu_data()[label_value];
-          break;
-        }
+      // check if there are less than top_k_ predictions
+      if (num_better_predictions < top_k_) {
+        ++accuracy;
+        if (top.size() > 1) ++top[1]->mutable_cpu_data()[label_value];
       }
       ++count;
     }
   }
 
   // LOG(INFO) << "Accuracy: " << accuracy;
-  top[0]->mutable_cpu_data()[0] = accuracy / count;
+  top[0]->mutable_cpu_data()[0] = (count == 0) ? 0 : (accuracy / count);
   if (top.size() > 1) {
     for (int i = 0; i < top[1]->count(); ++i) {
       top[1]->mutable_cpu_data()[i] =
@@ -102,6 +97,10 @@ void AccuracyLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   // Accuracy layer should not be used as a loss function.
 }
 
+#ifdef CPU_ONLY
+STUB_GPU(AccuracyLayer);
+#endif
+
 INSTANTIATE_CLASS(AccuracyLayer);
 REGISTER_LAYER_CLASS(Accuracy);
 
diff --git a/src/caffe/layers/accuracy_layer.cu b/src/caffe/layers/accuracy_layer.cu
new file mode 100644
index 00000000000..904aab422af
--- /dev/null
+++ b/src/caffe/layers/accuracy_layer.cu
@@ -0,0 +1,148 @@
+#include <vector>
+
+#include "caffe/layers/accuracy_layer.hpp"
+#include "caffe/util/math_functions.hpp"
+
+
+namespace caffe {
+
+template <typename Dtype>
+__global__ void AccuracyForwardGPU(const int nthreads,
+          const Dtype* bottom_data, const Dtype* label, Dtype* acc,
+          const int num, const int dim, const int spatial_dim,
+          const int num_labels, const int top_k,
+          const bool has_ignore_label_, const int ignore_label_,
+          Dtype* counts) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int n = index / spatial_dim;
+    const int s = index % spatial_dim;
+    const int label_value = static_cast<int>(label[n * spatial_dim + s]);
+    const Dtype prob_of_true_class = bottom_data[n * dim
+                                                 + label_value * spatial_dim
+                                                 + s];
+    int num_better_predictions = -1;  // true_class also counts as "better"
+    if (has_ignore_label_ && label_value == ignore_label_) {
+      acc[index] = 0;
+      counts[index] = 0;
+    } else {
+      for (int k = 0; k < num_labels & num_better_predictions < top_k; k++) {
+        num_better_predictions +=
+          (bottom_data[n * dim + k * spatial_dim + s] >= prob_of_true_class);
+      }
+      acc[index] = (num_better_predictions < top_k);
+      counts[index] = 1;
+    }
+  }
+}
+
+template <typename Dtype>
+__global__ void AccuracyForwardWithPerClassGPU(const int nthreads,
+          const Dtype* bottom_data, const Dtype* label,
+          Dtype* acc, Dtype* counts,
+          const int num, const int dim, const int spatial_dim,
+          const int num_labels, const int top_k,
+          const bool has_ignore_label_, const int ignore_label_) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int n = index / spatial_dim;
+    const int s = index % spatial_dim;
+    const int label_value = static_cast<int>(label[n * spatial_dim + s]);
+    const Dtype prob_of_true_class = bottom_data[n * dim
+                                                 + label_value * spatial_dim
+                                                 + s];
+    if (has_ignore_label_ && label_value == ignore_label_) {
+      // nothing to be done.
+    } else {
+      int num_better_predictions = -1;  // true_class also counts as "better"
+      for (int k = 0; k < num_labels & num_better_predictions < top_k; k++) {
+        num_better_predictions +=
+          (bottom_data[n * dim + k * spatial_dim + s] >= prob_of_true_class);
+      }
+      acc[label_value*nthreads + index] += (num_better_predictions < top_k);
+      counts[label_value*nthreads + index] = 1;
+    }
+  }
+}
+
+template <typename Dtype>
+void AccuracyLayer<Dtype>::Forward_gpu(
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  const Dtype* bottom_label = bottom[1]->gpu_data();
+  const int dim = bottom[0]->count() / outer_num_;
+  const int num_labels = bottom[0]->shape(label_axis_);
+  const int nthreads = outer_num_ * inner_num_;
+  // Since this memory is not used for anything, we use it here to avoid having
+  // to allocate new GPU memory to accumulate intermediate results.
+  Dtype* acc_data = bottom[0]->mutable_gpu_diff();
+  if (top.size() == 1) {
+    // simple case - report only global accuracy.
+
+    // Similarly, this memory is never used elsewhere, and thus we can use it
+    // to avoid having to allocate additional GPU memory.
+    Dtype* counts = bottom[1]->mutable_gpu_diff();
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    AccuracyForwardGPU<Dtype><<<CAFFE_GET_BLOCKS(nthreads),
+        CAFFE_CUDA_NUM_THREADS>>>(nthreads, bottom_data, bottom_label,
+        acc_data, outer_num_, dim, inner_num_, num_labels, top_k_,
+        has_ignore_label_, ignore_label_, counts);
+    Dtype acc;
+    caffe_gpu_asum(nthreads, acc_data, &acc);
+    Dtype valid_count;
+    caffe_gpu_asum(nthreads, counts, &valid_count);
+    if (valid_count > 0) {
+      top[0]->mutable_cpu_data()[0] = acc / valid_count;
+    } else {
+      top[0]->mutable_cpu_data()[0] = 0;
+    }
+  } else {
+    // need to report per-class accuracy as well
+
+    // allocate space for more detailed "counts"
+    nums_buffer_.ReshapeLike(*bottom[0]);
+    Dtype* counts = nums_buffer_.mutable_gpu_data();
+
+    caffe_gpu_set(bottom[0]->count(), Dtype(0), acc_data);
+    caffe_gpu_set(nums_buffer_.count(), Dtype(0), counts);
+
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    AccuracyForwardWithPerClassGPU<Dtype><<<CAFFE_GET_BLOCKS(nthreads),
+        CAFFE_CUDA_NUM_THREADS>>>(nthreads, bottom_data, bottom_label,
+        acc_data, counts, outer_num_, dim, inner_num_, num_labels, top_k_,
+        has_ignore_label_, ignore_label_);
+
+    // get the overall accuracy
+    Dtype acc;
+    caffe_gpu_asum(bottom[0]->count(), acc_data, &acc);
+    Dtype valid_count;
+    caffe_gpu_asum(nums_buffer_.count(), counts, &valid_count);
+    if (valid_count > 0) {
+      top[0]->mutable_cpu_data()[0] = acc / valid_count;
+    } else {
+      top[0]->mutable_cpu_data()[0] = 0;
+    }
+
+    // get per-class accuracy
+    Dtype* per_class_acc = top[1]->mutable_cpu_data();
+    for (int l = 0; l < num_labels; l++) {
+      caffe_gpu_asum(nthreads, acc_data + l*nthreads, per_class_acc+l);
+      caffe_gpu_asum(nthreads, counts + l*nthreads, &valid_count);
+      if (valid_count > 0) {
+        per_class_acc[l] /= valid_count;
+      } else {
+        per_class_acc[l] = 0;
+      }
+    }
+  }
+  // Clear scratch memory to prevent interfering with backward (see #6202).
+  caffe_gpu_set(bottom[0]->count(), Dtype(0), bottom[0]->mutable_gpu_diff());
+}
+
+
+template <typename Dtype>
+void AccuracyLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[1]) {  NOT_IMPLEMENTED;  }
+}
+
+INSTANTIATE_LAYER_GPU_FUNCS(AccuracyLayer);
+}  // namespace caffe
diff --git a/src/caffe/layers/base_conv_layer.cpp b/src/caffe/layers/base_conv_layer.cpp
index 4a4c68e009a..aff0a7548c7 100644
--- a/src/caffe/layers/base_conv_layer.cpp
+++ b/src/caffe/layers/base_conv_layer.cpp
@@ -19,7 +19,6 @@ void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
   const int num_axes = bottom[0]->num_axes();
   num_spatial_axes_ = num_axes - first_spatial_axis;
   CHECK_GE(num_spatial_axes_, 0);
-  vector<int> bottom_dim_blob_shape(1, num_spatial_axes_ + 1);
   vector<int> spatial_dim_blob_shape(1, std::max(num_spatial_axes_, 1));
   // Setup filter kernel dimensions (kernel_shape_).
   kernel_shape_.Reshape(spatial_dim_blob_shape);
@@ -194,7 +193,9 @@ void BaseConvolutionLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
   // TODO: generalize to handle inputs of different shapes.
   for (int bottom_id = 1; bottom_id < bottom.size(); ++bottom_id) {
     CHECK(bottom[0]->shape() == bottom[bottom_id]->shape())
-        << "All inputs must have the same shape.";
+        << "shape mismatch - bottom[0]: " << bottom[0]->shape_string()
+        << " vs. bottom[" << bottom_id << "]: "
+        << bottom[bottom_id]->shape_string();
   }
   // Shape the tops.
   bottom_shape_ = &bottom[0]->shape();
diff --git a/src/caffe/layers/batch_norm_layer.cpp b/src/caffe/layers/batch_norm_layer.cpp
index 0a08ed4cb07..c6a1d5b1b2c 100644
--- a/src/caffe/layers/batch_norm_layer.cpp
+++ b/src/caffe/layers/batch_norm_layer.cpp
@@ -124,8 +124,8 @@ void BatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 
   if (!use_global_stats_) {
     // compute variance using var(X) = E((X-EX)^2)
-    caffe_powx(top[0]->count(), top_data, Dtype(2),
-        temp_.mutable_cpu_data());  // (X-EX)^2
+    caffe_sqr<Dtype>(top[0]->count(), top_data,
+                     temp_.mutable_cpu_data());  // (X-EX)^2
     caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
         1. / (num * spatial_dim), temp_.cpu_data(),
         spatial_sum_multiplier_.cpu_data(), 0.,
@@ -148,7 +148,7 @@ void BatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
 
   // normalize variance
   caffe_add_scalar(variance_.count(), eps_, variance_.mutable_cpu_data());
-  caffe_powx(variance_.count(), variance_.cpu_data(), Dtype(0.5),
+  caffe_sqrt(variance_.count(), variance_.cpu_data(),
              variance_.mutable_cpu_data());
 
   // replicate variance to input size
diff --git a/src/caffe/layers/batch_norm_layer.cu b/src/caffe/layers/batch_norm_layer.cu
index c21713c81d9..a35e778e2f1 100644
--- a/src/caffe/layers/batch_norm_layer.cu
+++ b/src/caffe/layers/batch_norm_layer.cu
@@ -48,14 +48,14 @@ void BatchNormLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 
   if (!use_global_stats_) {
     // compute variance using var(X) = E((X-EX)^2)
-    caffe_gpu_powx(top[0]->count(), top_data, Dtype(2),
+    caffe_gpu_mul(top[0]->count(), top[0]->gpu_data(), top[0]->gpu_data(),
         temp_.mutable_gpu_data());  // (X-EX)^2
     caffe_gpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
         1. / (num * spatial_dim), temp_.gpu_data(),
         spatial_sum_multiplier_.gpu_data(), 0.,
         num_by_chans_.mutable_gpu_data());
-    caffe_gpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
-        num_by_chans_.gpu_data(), batch_sum_multiplier_.gpu_data(), 0.,
+    caffe_gpu_gemv<Dtype>(CblasTrans, num, channels_, Dtype(1.),
+        num_by_chans_.gpu_data(), batch_sum_multiplier_.gpu_data(), Dtype(0.),
         variance_.mutable_gpu_data());  // E((X_EX)^2)
 
     // compute and save moving average
@@ -72,7 +72,7 @@ void BatchNormLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 
   // normalize variance
   caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data());
-  caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5),
+  caffe_gpu_sqrt(variance_.count(), variance_.gpu_data(),
       variance_.mutable_gpu_data());
 
   // replicate variance to input size
diff --git a/src/caffe/layers/clip_layer.cpp b/src/caffe/layers/clip_layer.cpp
new file mode 100644
index 00000000000..9d9a59673c0
--- /dev/null
+++ b/src/caffe/layers/clip_layer.cpp
@@ -0,0 +1,51 @@
+#include <algorithm>
+#include <vector>
+
+#include "caffe/layers/clip_layer.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void ClipLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  const int count = bottom[0]->count();
+
+  Dtype min = this->layer_param_.clip_param().min();
+  Dtype max = this->layer_param_.clip_param().max();
+
+  for (int i = 0; i < count; ++i) {
+    top_data[i] = std::max(min, std::min(bottom_data[i], max));
+  }
+}
+
+template <typename Dtype>
+void ClipLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down,
+    const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[0]) {
+    const Dtype* bottom_data = bottom[0]->cpu_data();
+    const Dtype* top_diff = top[0]->cpu_diff();
+    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+    const int count = bottom[0]->count();
+
+    Dtype min = this->layer_param_.clip_param().min();
+    Dtype max = this->layer_param_.clip_param().max();
+
+    for (int i = 0; i < count; ++i) {
+      bottom_diff[i] = top_diff[i] * (
+              bottom_data[i] >= min && bottom_data[i] <= max);
+    }
+  }
+}
+
+
+#ifdef CPU_ONLY
+STUB_GPU(ClipLayer);
+#endif
+
+INSTANTIATE_CLASS(ClipLayer);
+REGISTER_LAYER_CLASS(Clip);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/clip_layer.cu b/src/caffe/layers/clip_layer.cu
new file mode 100644
index 00000000000..56f3be32d7d
--- /dev/null
+++ b/src/caffe/layers/clip_layer.cu
@@ -0,0 +1,67 @@
+#include <vector>
+
+#include "caffe/layers/clip_layer.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+__global__ void ClipForward(const int n, const float* in, float* out,
+    float p_min, float p_max) {
+  CUDA_KERNEL_LOOP(index, n) {
+    out[index] = fmaxf(p_min, fminf(in[index], p_max));
+  }
+}
+
+__global__ void ClipForward(const int n, const double* in, double* out,
+    double p_min, double p_max) {
+  CUDA_KERNEL_LOOP(index, n) {
+    out[index] = fmax(p_min, fmin(in[index], p_max));
+  }
+}
+
+template <typename Dtype>
+void ClipLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  const int count = bottom[0]->count();
+  Dtype p_min = this->layer_param_.clip_param().min();
+  Dtype p_max = this->layer_param_.clip_param().max();
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  ClipForward<<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+      count, bottom_data, top_data, p_min, p_max);
+  CUDA_POST_KERNEL_CHECK;
+}
+
+template <typename Dtype>
+__global__ void ClipBackward(const int n, const Dtype* in_diff,
+    const Dtype* in_data, Dtype* out_diff, Dtype p_min, Dtype p_max) {
+  CUDA_KERNEL_LOOP(index, n) {
+    out_diff[index] = in_diff[index] * (
+            in_data[index] >= p_min && in_data[index] <= p_max);
+  }
+}
+
+template <typename Dtype>
+void ClipLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down,
+    const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[0]) {
+    const Dtype* bottom_data = bottom[0]->gpu_data();
+    const Dtype* top_diff = top[0]->gpu_diff();
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    const int count = bottom[0]->count();
+    Dtype p_min = this->layer_param_.clip_param().min();
+    Dtype p_max = this->layer_param_.clip_param().max();
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    ClipBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+        count, top_diff, bottom_data, bottom_diff, p_min, p_max);
+    CUDA_POST_KERNEL_CHECK;
+  }
+}
+
+
+INSTANTIATE_LAYER_GPU_FUNCS(ClipLayer);
+
+
+}  // namespace caffe
diff --git a/src/caffe/layers/crop_layer.cpp b/src/caffe/layers/crop_layer.cpp
index ef8c177c4dd..65ea8f8b7d0 100644
--- a/src/caffe/layers/crop_layer.cpp
+++ b/src/caffe/layers/crop_layer.cpp
@@ -40,8 +40,10 @@ void CropLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
   const int start_axis = bottom[0]->CanonicalAxisIndex(param.axis());
 
   // Initialize offsets to 0 and the new shape to the current shape of the data.
-  offsets = vector<int>(input_dim, 0);
   vector<int> new_shape(bottom[0]->shape());
+  vector<int> offsets_shape(1, input_dim);
+  offsets.Reshape(offsets_shape);
+  int* offset_data = offsets.mutable_cpu_data();
 
   // Determine crop offsets and the new shape post-crop.
   for (int i = 0; i < input_dim; ++i) {
@@ -63,15 +65,22 @@ void CropLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
           << "size " << bottom[1]->shape(i) << " and offset " << crop_offset;
     }
     new_shape[i] = new_size;
-    offsets[i] = crop_offset;
+    offset_data[i] = crop_offset;
   }
   top[0]->Reshape(new_shape);
+  // Compute strides
+  src_strides_.Reshape(offsets_shape);
+  dest_strides_.Reshape(offsets_shape);
+  for (int i = 0; i < input_dim; ++i) {
+    src_strides_.mutable_cpu_data()[i] = bottom[0]->count(i + 1, input_dim);
+    dest_strides_.mutable_cpu_data()[i] = top[0]->count(i + 1, input_dim);
+  }
 }
 
 template <typename Dtype>
 void CropLayer<Dtype>::crop_copy(const vector<Blob<Dtype>*>& bottom,
              const vector<Blob<Dtype>*>& top,
-             const vector<int>& offsets,
+             const int* offsets,
              vector<int> indices,
              int cur_dim,
              const Dtype* src_data,
@@ -115,7 +124,8 @@ void CropLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   std::vector<int> indices(top[0]->num_axes(), 0);
   const Dtype* bottom_data = bottom[0]->cpu_data();
   Dtype* top_data = top[0]->mutable_cpu_data();
-  crop_copy(bottom, top, offsets, indices, 0, bottom_data, top_data, true);
+  crop_copy(bottom, top, offsets.cpu_data(), indices, 0, bottom_data, top_data,
+      true);
 }
 
 template <typename Dtype>
@@ -127,7 +137,8 @@ void CropLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
   if (propagate_down[0]) {
     caffe_set(bottom[0]->count(), static_cast<Dtype>(0), bottom_diff);
     std::vector<int> indices(top[0]->num_axes(), 0);
-    crop_copy(bottom, top, offsets, indices, 0, top_diff, bottom_diff, false);
+    crop_copy(bottom, top, offsets.cpu_data(), indices, 0, top_diff,
+        bottom_diff, false);
   }
 }
 
diff --git a/src/caffe/layers/crop_layer.cu b/src/caffe/layers/crop_layer.cu
index 677077cdd8b..4ece9cd1761 100644
--- a/src/caffe/layers/crop_layer.cu
+++ b/src/caffe/layers/crop_layer.cu
@@ -4,90 +4,63 @@
 
 namespace caffe {
 
-// Copy (one line per thread) from one array to another, with arbitrary
-// strides in the last two dimensions.
+__device__ int compute_uncropped_index(
+    int index,
+    const int ndims,
+    const int* src_strides,
+    const int* dest_strides,
+    const int* offsets) {
+  int dest_index = index;
+  int src_index = 0;
+  for (int i = 0; i < ndims; ++i) {
+      int coord = dest_index / dest_strides[i];
+      dest_index -= coord * dest_strides[i];
+      src_index += src_strides[i] * (coord + offsets[i]);
+  }
+  return src_index;
+}
+
 template <typename Dtype>
-__global__ void copy_kernel(const int n, const int height, const int width,
-    const int src_inner_stride,
-    const int dest_inner_stride,
+__global__ void crop_kernel_forward(const int nthreads,
+    const int ndims,
+    const int* src_strides,
+    const int* dest_strides,
+    const int* offsets,
     const Dtype* src, Dtype* dest) {
-  CUDA_KERNEL_LOOP(index, n) {
-    int src_start = index * src_inner_stride;
-    int dest_start = index * dest_inner_stride;
-    for (int i = 0; i < width; ++i) {
-      dest[dest_start + i] = src[src_start + i];
-    }
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int src_index = compute_uncropped_index(
+        index, ndims, src_strides, dest_strides, offsets);
+    dest[index] = src[src_index];
   }
 }
 
 template <typename Dtype>
-void CropLayer<Dtype>::crop_copy_gpu(const vector<Blob<Dtype>*>& bottom,
-             const vector<Blob<Dtype>*>& top,
-             const vector<int>& offsets,
-             vector<int> indices,
-             int cur_dim,
-             const Dtype* src_data,
-             Dtype* dest_data,
-             bool is_forward) {
-  if (cur_dim + 2 < top[0]->num_axes()) {
-    // We are not yet at the final dimension, call copy recursivley
-    for (int i = 0; i < top[0]->shape(cur_dim); ++i) {
-      indices[cur_dim] = i;
-      crop_copy_gpu(bottom, top, offsets, indices, cur_dim+1,
-                src_data, dest_data, is_forward);
-    }
-  } else {
-    // We are at the last two dimensions, which are stored continuously in
-    // memory. With (N,C,H,W)
-    //              (0,1,2,3) cur_dim   -> H
-    //                        cur_dim+1 -> W
-    const int lines = top[0]->shape(cur_dim);
-    const int height = top[0]->shape(cur_dim);
-    const int width = top[0]->shape(cur_dim+1);
-    std::vector<int> ind_off(cur_dim+2, 0);
-    for (int j = 0; j < cur_dim; ++j) {
-        ind_off[j] = indices[j] + offsets[j];
-    }
-    ind_off[cur_dim] = offsets[cur_dim];
-    ind_off[cur_dim+1] = offsets[cur_dim+1];
-    // Compute copy strides
-    const int src_inner_stride = bottom[0]->shape(cur_dim+1);
-    const int dest_inner_stride = top[0]->shape(cur_dim+1);
-
-    if (is_forward) {
-      const Dtype* bottom_data = bottom[0]->gpu_data() +
-          bottom[0]->offset(ind_off);
-      Dtype* top_data = top[0]->mutable_gpu_data() +
-          top[0]->offset(indices);
-      // NOLINT_NEXT_LINE(whitespace/operators)
-      copy_kernel<<<CAFFE_GET_BLOCKS(lines), CAFFE_CUDA_NUM_THREADS>>>(
-          lines, height, width,
-          src_inner_stride,
-          dest_inner_stride,
-          bottom_data, top_data);
-
-    } else {
-      const Dtype* top_diff = top[0]->gpu_diff() +
-          top[0]->offset(indices);
-      Dtype* bottom_diff = bottom[0]->mutable_gpu_diff() +
-          bottom[0]->offset(ind_off);
-      // NOLINT_NEXT_LINE(whitespace/operators)
-      copy_kernel<<<CAFFE_GET_BLOCKS(lines), CAFFE_CUDA_NUM_THREADS>>>(
-          lines, height, width,
-          dest_inner_stride,
-          src_inner_stride,
-          top_diff, bottom_diff);
-    }
+__global__ void crop_kernel_backward(const int nthreads,
+    const int ndims,
+    const int* src_strides,
+    const int* dest_strides,
+    const int* offsets,
+    Dtype* src, const Dtype* dest) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    int src_index = compute_uncropped_index(
+        index, ndims, src_strides, dest_strides, offsets);
+    src[src_index] = dest[index];
   }
 }
 
 template <typename Dtype>
 void CropLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
-  std::vector<int> indices(top[0]->num_axes(), 0);
   const Dtype* bottom_data = bottom[0]->gpu_data();
   Dtype* top_data = top[0]->mutable_gpu_data();
-  crop_copy_gpu(bottom, top, offsets, indices, 0, bottom_data, top_data, true);
+  int n = top[0]->count();
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  crop_kernel_forward<<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(n,
+      bottom[0]->num_axes(),
+      src_strides_.gpu_data(),
+      dest_strides_.gpu_data(),
+      offsets.gpu_data(),
+      bottom_data, top_data);
 }
 
 template <typename Dtype>
@@ -95,12 +68,17 @@ void CropLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
   const Dtype* top_diff = top[0]->gpu_diff();
   Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+  int n = top[0]->count();
 
   if (propagate_down[0]) {
     caffe_gpu_set(bottom[0]->count(), static_cast<Dtype>(0), bottom_diff);
-    std::vector<int> indices(top[0]->num_axes(), 0);
-    crop_copy_gpu(bottom, top, offsets, indices, 0, top_diff, bottom_diff,
-                  false);
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    crop_kernel_backward<<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS>>>(n,
+        bottom[0]->num_axes(),
+        src_strides_.gpu_data(),
+        dest_strides_.gpu_data(),
+        offsets.gpu_data(),
+        bottom_diff, top_diff);
   }
 }
 
diff --git a/src/caffe/layers/cudnn_conv_layer.cpp b/src/caffe/layers/cudnn_conv_layer.cpp
index 1987fb096b0..efc9e04e8c0 100644
--- a/src/caffe/layers/cudnn_conv_layer.cpp
+++ b/src/caffe/layers/cudnn_conv_layer.cpp
@@ -252,6 +252,7 @@ CuDNNConvolutionLayer<Dtype>::~CuDNNConvolutionLayer() {
   }
 
   cudaFree(workspaceData);
+  delete [] workspace;
   delete [] stream_;
   delete [] handle_;
   delete [] fwd_algo_;
diff --git a/src/caffe/layers/cudnn_deconv_layer.cpp b/src/caffe/layers/cudnn_deconv_layer.cpp
new file mode 100644
index 00000000000..260da5c1ee0
--- /dev/null
+++ b/src/caffe/layers/cudnn_deconv_layer.cpp
@@ -0,0 +1,327 @@
+#ifdef USE_CUDNN
+#include <algorithm>
+#include <vector>
+
+#include "caffe/layers/cudnn_deconv_layer.hpp"
+
+namespace caffe {
+
+// Set to three for the benefit of the backward pass, which
+// can use separate streams for calculating the gradient w.r.t.
+// bias, filter weights, and bottom data for each group independently
+#define CUDNN_STREAMS_PER_GROUP 3
+
+/**
+ * TODO(dox) explain cuDNN interface
+ */
+template <typename Dtype>
+void CuDNNDeconvolutionLayer<Dtype>::LayerSetUp(
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  DeconvolutionLayer<Dtype>::LayerSetUp(bottom, top);
+  // Initialize CUDA streams and cuDNN.
+  stream_         = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
+  handle_         = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
+
+  // Initialize algorithm arrays
+  fwd_algo_       = new cudnnConvolutionFwdAlgo_t[bottom.size()];
+  bwd_filter_algo_= new cudnnConvolutionBwdFilterAlgo_t[bottom.size()];
+  bwd_data_algo_  = new cudnnConvolutionBwdDataAlgo_t[bottom.size()];
+
+  // initialize size arrays
+  workspace_fwd_sizes_ = new size_t[bottom.size()];
+  workspace_bwd_filter_sizes_ = new size_t[bottom.size()];
+  workspace_bwd_data_sizes_ = new size_t[bottom.size()];
+
+  // workspace data
+  workspaceSizeInBytes = 0;
+  workspaceData = NULL;
+  workspace = new void*[this->group_ * CUDNN_STREAMS_PER_GROUP];
+
+  for (size_t i = 0; i < bottom.size(); ++i) {
+    // initialize all to default algorithms
+    fwd_algo_[i] = (cudnnConvolutionFwdAlgo_t)0;
+    bwd_filter_algo_[i] = (cudnnConvolutionBwdFilterAlgo_t)0;
+    bwd_data_algo_[i] = (cudnnConvolutionBwdDataAlgo_t)0;
+    // default algorithms don't require workspace
+    workspace_fwd_sizes_[i] = 0;
+    workspace_bwd_data_sizes_[i] = 0;
+    workspace_bwd_filter_sizes_[i] = 0;
+  }
+
+  for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
+    CUDA_CHECK(cudaStreamCreate(&stream_[g]));
+    CUDNN_CHECK(cudnnCreate(&handle_[g]));
+    CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g]));
+    workspace[g] = NULL;
+  }
+
+  // Set the indexing parameters.
+  bias_offset_ = (this->num_output_ / this->group_);
+
+  // Create filter descriptor.
+  const int* kernel_shape_data = this->kernel_shape_.cpu_data();
+  const int kernel_h = kernel_shape_data[0];
+  const int kernel_w = kernel_shape_data[1];
+  cudnn::createFilterDesc<Dtype>(&filter_desc_,
+                                 this->channels_ / this->group_,
+                                 this->num_output_ / this->group_,
+                                 kernel_h,
+                                 kernel_w);
+
+  // Create tensor descriptor(s) for data and corresponding convolution(s).
+  for (int i = 0; i < bottom.size(); i++) {
+    cudnnTensorDescriptor_t bottom_desc;
+    cudnn::createTensor4dDesc<Dtype>(&bottom_desc);
+    bottom_descs_.push_back(bottom_desc);
+    cudnnTensorDescriptor_t top_desc;
+    cudnn::createTensor4dDesc<Dtype>(&top_desc);
+    top_descs_.push_back(top_desc);
+    cudnnConvolutionDescriptor_t conv_desc;
+    cudnn::createConvolutionDesc<Dtype>(&conv_desc);
+    conv_descs_.push_back(conv_desc);
+  }
+
+  // Tensor descriptor for bias.
+  if (this->bias_term_) {
+    cudnn::createTensor4dDesc<Dtype>(&bias_desc_);
+  }
+
+  handles_setup_ = true;
+}
+
+template <typename Dtype>
+void CuDNNDeconvolutionLayer<Dtype>::Reshape(
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  DeconvolutionLayer<Dtype>::Reshape(bottom, top);
+  CHECK_EQ(2, this->num_spatial_axes_)
+      << "CuDNNDeconvolutionLayer input must have 2 spatial axes "
+      << "(e.g., height and width). "
+      << "Use 'engine: CAFFE' for general ND convolution.";
+  bottom_offset_ = this->bottom_dim_ / this->group_;
+  top_offset_ = this->top_dim_ / this->group_;
+  const int height = bottom[0]->shape(this->channel_axis_ + 1);
+  const int width = bottom[0]->shape(this->channel_axis_ + 2);
+  const int height_out = top[0]->shape(this->channel_axis_ + 1);
+  const int width_out = top[0]->shape(this->channel_axis_ + 2);
+  const int* pad_data = this->pad_.cpu_data();
+  const int pad_h = pad_data[0];
+  const int pad_w = pad_data[1];
+  const int* stride_data = this->stride_.cpu_data();
+  const int stride_h = stride_data[0];
+  const int stride_w = stride_data[1];
+
+  // Specify workspace limit for kernels directly until we have a
+  // planning strategy and a rewrite of Caffe's GPU memory mangagement
+  size_t workspace_limit_bytes = 8*1024*1024;
+
+  for (int i = 0; i < bottom.size(); i++) {
+    cudnn::setTensor4dDesc<Dtype>(&bottom_descs_[i],
+                                  this->num_,
+                                  this->channels_ / this->group_,
+                                  height,
+                                  width,
+                                  this->channels_ * height * width,
+                                  height * width,
+                                  width,
+                                  1);
+    cudnn::setTensor4dDesc<Dtype>(&top_descs_[i],
+                                  this->num_,
+                                  this->num_output_ / this->group_,
+                                  height_out,
+                                  width_out,
+                                  this->num_output_ * height_out * width_out,
+                                  height_out * width_out,
+                                  width_out,
+                                  1);
+    cudnn::setConvolutionDesc<Dtype>(&conv_descs_[i],
+                                     top_descs_[i],
+                                     filter_desc_,
+                                     pad_h,
+                                     pad_w,
+                                     stride_h,
+                                     stride_w);
+
+    // choose forward and backward algorithms + workspace(s)
+    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(
+        handle_[0],
+        top_descs_[i],
+        filter_desc_,
+        conv_descs_[i],
+        bottom_descs_[i],
+        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+        workspace_limit_bytes,
+        &fwd_algo_[i]));
+
+    // We have found that CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM is
+    // buggy. Thus, if this algo was chosen, choose winograd instead. If
+    // winograd is not supported or workspace is larger than threshold, choose
+    // implicit_gemm instead.
+    if (fwd_algo_[i] == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) {
+      size_t winograd_workspace_size;
+      cudnnStatus_t status = cudnnGetConvolutionForwardWorkspaceSize(
+          handle_[0],
+          top_descs_[i],
+          filter_desc_,
+          conv_descs_[i],
+          bottom_descs_[i],
+          CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
+          &winograd_workspace_size);
+      if (status != CUDNN_STATUS_SUCCESS ||
+          winograd_workspace_size >= workspace_limit_bytes) {
+        fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+      } else {
+        fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD;
+      }
+    }
+
+    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
+        handle_[0],
+        top_descs_[i],
+        filter_desc_,
+        conv_descs_[i],
+        bottom_descs_[i],
+        fwd_algo_[i],
+        &(workspace_fwd_sizes_[i])));
+
+    // choose backward algorithm for filter
+    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(
+        handle_[0],
+        top_descs_[i],
+        bottom_descs_[i],
+        conv_descs_[i],
+        filter_desc_,
+        CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+        workspace_limit_bytes,
+        &bwd_filter_algo_[i]));
+
+    // get workspace for backwards filter algorithm
+    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
+        handle_[0],
+        top_descs_[i],
+        bottom_descs_[i],
+        conv_descs_[i],
+        filter_desc_,
+        bwd_filter_algo_[i],
+        &workspace_bwd_filter_sizes_[i]));
+
+    // choose backward algo for data
+    CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(
+        handle_[0],
+        filter_desc_,
+        bottom_descs_[i],
+        conv_descs_[i],
+        top_descs_[i],
+        CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+        workspace_limit_bytes,
+        &bwd_data_algo_[i]));
+
+    // get workspace size
+    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
+        handle_[0],
+        filter_desc_,
+        bottom_descs_[i],
+        conv_descs_[i],
+        top_descs_[i],
+        bwd_data_algo_[i],
+        &workspace_bwd_data_sizes_[i]));
+  }
+
+  // reduce over all workspace sizes to get a maximum to allocate / reallocate
+  size_t total_workspace_fwd = 0;
+  size_t total_workspace_bwd_data = 0;
+  size_t total_workspace_bwd_filter = 0;
+
+  for (size_t i = 0; i < bottom.size(); i++) {
+    total_workspace_fwd        = std::max(total_workspace_fwd,
+                                     workspace_fwd_sizes_[i]);
+    total_workspace_bwd_data   = std::max(total_workspace_bwd_data,
+                                     workspace_bwd_data_sizes_[i]);
+    total_workspace_bwd_filter = std::max(total_workspace_bwd_filter,
+                                     workspace_bwd_filter_sizes_[i]);
+  }
+  // get max over all operations
+  size_t max_workspace = std::max(total_workspace_fwd,
+                             total_workspace_bwd_data);
+  max_workspace = std::max(max_workspace, total_workspace_bwd_filter);
+  // ensure all groups have enough workspace
+  size_t total_max_workspace = max_workspace *
+                               (this->group_ * CUDNN_STREAMS_PER_GROUP);
+
+  // this is the total amount of storage needed over all groups + streams
+  if (total_max_workspace > workspaceSizeInBytes) {
+    DLOG(INFO) << "Reallocating workspace storage: " << total_max_workspace;
+    workspaceSizeInBytes = total_max_workspace;
+
+    // free the existing workspace and allocate a new (larger) one
+    cudaFree(this->workspaceData);
+
+    cudaError_t err = cudaMalloc(&(this->workspaceData), workspaceSizeInBytes);
+    if (err != cudaSuccess) {
+      // force zero memory path
+      for (int i = 0; i < bottom.size(); i++) {
+        workspace_fwd_sizes_[i] = 0;
+        workspace_bwd_filter_sizes_[i] = 0;
+        workspace_bwd_data_sizes_[i] = 0;
+        fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING;
+        bwd_filter_algo_[i] = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
+        bwd_data_algo_[i] = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
+      }
+
+      // NULL out all workspace pointers
+      for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
+        workspace[g] = NULL;
+      }
+      // NULL out underlying data
+      workspaceData = NULL;
+      workspaceSizeInBytes = 0;
+    }
+
+    // if we succeed in the allocation, set pointer aliases for workspaces
+    for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
+      workspace[g] = reinterpret_cast<char *>(workspaceData) + g*max_workspace;
+    }
+  }
+
+  // Tensor descriptor for bias.
+  if (this->bias_term_) {
+    cudnn::setTensor4dDesc<Dtype>(
+        &bias_desc_, 1, this->num_output_ / this->group_, 1, 1);
+  }
+}
+
+template <typename Dtype>
+CuDNNDeconvolutionLayer<Dtype>::~CuDNNDeconvolutionLayer() {
+  // Check that handles have been setup before destroying.
+  if (!handles_setup_) { return; }
+
+  for (int i = 0; i < bottom_descs_.size(); i++) {
+    cudnnDestroyTensorDescriptor(bottom_descs_[i]);
+    cudnnDestroyTensorDescriptor(top_descs_[i]);
+    cudnnDestroyConvolutionDescriptor(conv_descs_[i]);
+  }
+  if (this->bias_term_) {
+    cudnnDestroyTensorDescriptor(bias_desc_);
+  }
+  cudnnDestroyFilterDescriptor(filter_desc_);
+
+  for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
+    cudaStreamDestroy(stream_[g]);
+    cudnnDestroy(handle_[g]);
+  }
+
+  cudaFree(workspaceData);
+  delete [] workspace;
+  delete [] stream_;
+  delete [] handle_;
+  delete [] fwd_algo_;
+  delete [] bwd_filter_algo_;
+  delete [] bwd_data_algo_;
+  delete [] workspace_fwd_sizes_;
+  delete [] workspace_bwd_data_sizes_;
+  delete [] workspace_bwd_filter_sizes_;
+}
+
+INSTANTIATE_CLASS(CuDNNDeconvolutionLayer);
+
+}   // namespace caffe
+#endif
diff --git a/src/caffe/layers/cudnn_deconv_layer.cu b/src/caffe/layers/cudnn_deconv_layer.cu
new file mode 100644
index 00000000000..eb1df32918f
--- /dev/null
+++ b/src/caffe/layers/cudnn_deconv_layer.cu
@@ -0,0 +1,138 @@
+#ifdef USE_CUDNN
+#include <vector>
+
+#include "caffe/layers/cudnn_deconv_layer.hpp"
+
+namespace caffe {
+
+__global__ void sync_deconv_groups() {}
+
+template <typename Dtype>
+void CuDNNDeconvolutionLayer<Dtype>::Forward_gpu(
+    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+  const Dtype* weight = this->blobs_[0]->gpu_data();
+  for (int i = 0; i < bottom.size(); ++i) {
+    const Dtype* bottom_data = bottom[i]->gpu_data();
+    Dtype* top_data = top[i]->mutable_gpu_data();
+
+    // Forward through cuDNN in parallel over groups.
+    for (int g = 0; g < this->group_; g++) {
+      // Filters.
+      CUDNN_CHECK(cudnnConvolutionBackwardData(
+          handle_[g],
+          cudnn::dataType<Dtype>::one,
+          filter_desc_,
+          weight + this->weight_offset_ * g,
+          bottom_descs_[i],
+          bottom_data + bottom_offset_ * g,
+          conv_descs_[i],
+          bwd_data_algo_[i],
+          workspace[g],
+          workspace_bwd_data_sizes_[i],
+          cudnn::dataType<Dtype>::zero,
+          top_descs_[i],
+          top_data + top_offset_ * g));
+
+      // Bias.
+      if (this->bias_term_) {
+        const Dtype* bias_data = this->blobs_[1]->gpu_data();
+        CUDNN_CHECK(cudnnAddTensor(handle_[g],
+                                   cudnn::dataType<Dtype>::one,
+                                   bias_desc_,
+                                   bias_data + bias_offset_ * g,
+                                   cudnn::dataType<Dtype>::one,
+                                   top_descs_[i],
+                                   top_data + top_offset_ * g));
+      }
+    }
+
+    // Synchronize the work across groups, each of which went into its own
+    // stream, by launching an empty kernel into the default (null) stream.
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    sync_deconv_groups<<<1, 1>>>();
+  }
+}
+
+template <typename Dtype>
+void CuDNNDeconvolutionLayer<Dtype>::Backward_gpu(
+    const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down,
+    const vector<Blob<Dtype>*>& bottom) {
+  const Dtype* weight = NULL;
+  Dtype* weight_diff = NULL;
+  if (this->param_propagate_down_[0]) {
+    weight = this->blobs_[0]->gpu_data();
+    weight_diff = this->blobs_[0]->mutable_gpu_diff();
+  }
+  Dtype* bias_diff = NULL;
+  if (this->bias_term_ && this->param_propagate_down_[1]) {
+    bias_diff = this->blobs_[1]->mutable_gpu_diff();
+  }
+  for (int i = 0; i < top.size(); ++i) {
+    const Dtype* top_diff = top[i]->gpu_diff();
+    // Backward through cuDNN in parallel over groups and gradients.
+    for (int g = 0; g < this->group_; g++) {
+      // Gradient w.r.t. bias.
+      if (this->bias_term_ && this->param_propagate_down_[1]) {
+        CUDNN_CHECK(cudnnConvolutionBackwardBias(handle_[0 * this->group_ + g],
+                                                 cudnn::dataType<Dtype>::one,
+                                                 top_descs_[i],
+                                                 top_diff + top_offset_ * g,
+                                                 cudnn::dataType<Dtype>::one,
+                                                 bias_desc_,
+                                                 bias_diff + bias_offset_ * g));
+      }
+
+      // Gradient w.r.t. weights.
+      if (this->param_propagate_down_[0]) {
+        const Dtype* bottom_data = bottom[i]->gpu_data();
+        CUDNN_CHECK(cudnnConvolutionBackwardFilter(
+            handle_[1 * this->group_ + g],
+            cudnn::dataType<Dtype>::one,
+            top_descs_[i],
+            top_diff + top_offset_ * g,
+            bottom_descs_[i],
+            bottom_data + bottom_offset_ * g,
+            conv_descs_[i],
+            bwd_filter_algo_[i],
+            workspace[1 * this->group_ + g],
+            workspace_bwd_filter_sizes_[i],
+            cudnn::dataType<Dtype>::one,
+            filter_desc_,
+            weight_diff + this->weight_offset_ * g));
+      }
+
+      // Gradient w.r.t. bottom data.
+      if (propagate_down[i]) {
+        if (weight == NULL) {
+          weight = this->blobs_[0]->gpu_data();
+        }
+        Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
+        CUDNN_CHECK(
+            cudnnConvolutionForward(handle_[2 * this->group_ + g],
+                                    cudnn::dataType<Dtype>::one,
+                                    top_descs_[i],
+                                    top_diff + top_offset_ * g,
+                                    filter_desc_,
+                                    weight + this->weight_offset_ * g,
+                                    conv_descs_[i],
+                                    fwd_algo_[i],
+                                    workspace[2 * this->group_ + g],
+                                    workspace_fwd_sizes_[i],
+                                    cudnn::dataType<Dtype>::zero,
+                                    bottom_descs_[i],
+                                    bottom_diff + bottom_offset_ * g));
+      }
+    }
+
+    // Synchronize the work across groups, each of which went into its own
+    // stream, by launching an empty kernel into the default (null) stream.
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    sync_deconv_groups<<<1, 1>>>();
+  }
+}
+
+INSTANTIATE_LAYER_GPU_FUNCS(CuDNNDeconvolutionLayer);
+
+}  // namespace caffe
+#endif
diff --git a/src/caffe/layers/cudnn_relu_layer.cpp b/src/caffe/layers/cudnn_relu_layer.cpp
index 795e0a9efb0..687c905763e 100644
--- a/src/caffe/layers/cudnn_relu_layer.cpp
+++ b/src/caffe/layers/cudnn_relu_layer.cpp
@@ -36,6 +36,7 @@ CuDNNReLULayer<Dtype>::~CuDNNReLULayer() {
 
   cudnnDestroyTensorDescriptor(this->bottom_desc_);
   cudnnDestroyTensorDescriptor(this->top_desc_);
+  cudnnDestroyActivationDescriptor(this->activ_desc_);
   cudnnDestroy(this->handle_);
 }
 
diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp
index 20a460fbdea..b86472b34be 100644
--- a/src/caffe/layers/deconv_layer.cpp
+++ b/src/caffe/layers/deconv_layer.cpp
@@ -79,6 +79,5 @@ STUB_GPU(DeconvolutionLayer);
 #endif
 
 INSTANTIATE_CLASS(DeconvolutionLayer);
-REGISTER_LAYER_CLASS(Deconvolution);
 
 }  // namespace caffe
diff --git a/src/caffe/layers/eltwise_layer.cpp b/src/caffe/layers/eltwise_layer.cpp
index 21256166bfa..3d82b0e1cbf 100644
--- a/src/caffe/layers/eltwise_layer.cpp
+++ b/src/caffe/layers/eltwise_layer.cpp
@@ -31,7 +31,9 @@ template <typename Dtype>
 void EltwiseLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
   for (int i = 1; i < bottom.size(); ++i) {
-    CHECK(bottom[i]->shape() == bottom[0]->shape());
+    CHECK(bottom[0]->shape() == bottom[i]->shape())
+        << "bottom[0]: " << bottom[0]->shape_string()
+        << ", bottom[" << i << "]: " << bottom[i]->shape_string();
   }
   top[0]->ReshapeLike(*bottom[0]);
   // If max operation, we will initialize the vector index part.
diff --git a/src/caffe/layers/embed_layer.cu b/src/caffe/layers/embed_layer.cu
index 6324a3a8937..3cf39fd9983 100644
--- a/src/caffe/layers/embed_layer.cu
+++ b/src/caffe/layers/embed_layer.cu
@@ -15,6 +15,11 @@ __global__ void EmbedForward(const int nthreads, const Dtype* bottom_data,
     const int n = top_index / N;
     const int d = top_index % N;
     const int index = static_cast<int>(bottom_data[n]);
+    #ifdef DEBUG
+        assert(index >= 0);
+        assert(index < K);
+        assert(static_cast<Dtype>(index) == bottom_data[n]);
+    #endif
     const int weight_index = index * N + d;
     top_data[top_index] = weight[weight_index];
   }
diff --git a/src/caffe/layers/hdf5_data_layer.cpp b/src/caffe/layers/hdf5_data_layer.cpp
index 00716a92b15..7668854cc1f 100644
--- a/src/caffe/layers/hdf5_data_layer.cpp
+++ b/src/caffe/layers/hdf5_data_layer.cpp
@@ -1,3 +1,4 @@
+#ifdef USE_HDF5
 /*
 TODO:
 - load file in a separate thread ("prefetch")
@@ -184,3 +185,4 @@ INSTANTIATE_CLASS(HDF5DataLayer);
 REGISTER_LAYER_CLASS(HDF5Data);
 
 }  // namespace caffe
+#endif  // USE_HDF5
diff --git a/src/caffe/layers/hdf5_data_layer.cu b/src/caffe/layers/hdf5_data_layer.cu
index 33eebd41dfc..70cd9f32f85 100644
--- a/src/caffe/layers/hdf5_data_layer.cu
+++ b/src/caffe/layers/hdf5_data_layer.cu
@@ -1,3 +1,4 @@
+#ifdef USE_HDF5
 /*
 TODO:
 - only load parts of the file, in accordance with a prototxt param "max_mem"
@@ -34,3 +35,4 @@ void HDF5DataLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
 INSTANTIATE_LAYER_GPU_FUNCS(HDF5DataLayer);
 
 }  // namespace caffe
+#endif  // USE_HDF5
diff --git a/src/caffe/layers/hdf5_output_layer.cpp b/src/caffe/layers/hdf5_output_layer.cpp
index f8f1edcd18e..28c453a20fd 100644
--- a/src/caffe/layers/hdf5_output_layer.cpp
+++ b/src/caffe/layers/hdf5_output_layer.cpp
@@ -1,3 +1,4 @@
+#ifdef USE_HDF5
 #include <vector>
 
 #include "hdf5.h"
@@ -72,3 +73,4 @@ INSTANTIATE_CLASS(HDF5OutputLayer);
 REGISTER_LAYER_CLASS(HDF5Output);
 
 }  // namespace caffe
+#endif  // USE_HDF5
diff --git a/src/caffe/layers/hdf5_output_layer.cu b/src/caffe/layers/hdf5_output_layer.cu
index c1685cd34a7..891aea03862 100644
--- a/src/caffe/layers/hdf5_output_layer.cu
+++ b/src/caffe/layers/hdf5_output_layer.cu
@@ -1,3 +1,4 @@
+#ifdef USE_HDF5
 #include <vector>
 
 #include "hdf5.h"
@@ -37,3 +38,4 @@ void HDF5OutputLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
 INSTANTIATE_LAYER_GPU_FUNCS(HDF5OutputLayer);
 
 }  // namespace caffe
+#endif  // USE_HDF5
diff --git a/src/caffe/layers/infogain_loss_layer.cpp b/src/caffe/layers/infogain_loss_layer.cpp
index 624d3118124..3c3f460ec34 100644
--- a/src/caffe/layers/infogain_loss_layer.cpp
+++ b/src/caffe/layers/infogain_loss_layer.cpp
@@ -3,7 +3,8 @@
 #include <vector>
 
 #include "caffe/layers/infogain_loss_layer.hpp"
-#include "caffe/util/io.hpp"
+#include "caffe/util/io.hpp"  // for bolb reading of matrix H
+#include "caffe/util/math_functions.hpp"
 
 namespace caffe {
 
@@ -11,6 +12,31 @@ template <typename Dtype>
 void InfogainLossLayer<Dtype>::LayerSetUp(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
   LossLayer<Dtype>::LayerSetUp(bottom, top);
+  // internal softmax layer
+  LayerParameter softmax_layer_param(this->layer_param_);
+  SoftmaxParameter* softmax_param = softmax_layer_param.mutable_softmax_param();
+  softmax_param->set_axis(this->layer_param_.infogain_loss_param().axis());
+  softmax_layer_param.set_type("Softmax");
+  softmax_layer_param.clear_loss_weight();
+  softmax_layer_param.add_loss_weight(1);
+  softmax_layer_ = LayerRegistry<Dtype>::CreateLayer(softmax_layer_param);
+  softmax_bottom_vec_.clear();
+  softmax_bottom_vec_.push_back(bottom[0]);
+  softmax_top_vec_.clear();
+  softmax_top_vec_.push_back(&prob_);
+  softmax_layer_->SetUp(softmax_bottom_vec_, softmax_top_vec_);
+
+  // ignore label
+  has_ignore_label_ =
+    this->layer_param_.loss_param().has_ignore_label();
+  if (has_ignore_label_) {
+    ignore_label_ = this->layer_param_.loss_param().ignore_label();
+  }
+  // normalization
+  CHECK(!this->layer_param_.loss_param().has_normalize())
+    << "normalize is deprecated. use \"normalization\"";
+  normalization_ = this->layer_param_.loss_param().normalization();
+  // matrix H
   if (bottom.size() < 3) {
     CHECK(this->layer_param_.infogain_loss_param().has_source())
         << "Infogain matrix source must be specified.";
@@ -25,28 +51,86 @@ template <typename Dtype>
 void InfogainLossLayer<Dtype>::Reshape(
     const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
   LossLayer<Dtype>::Reshape(bottom, top);
+  softmax_layer_->Reshape(softmax_bottom_vec_, softmax_top_vec_);
+  infogain_axis_ =
+    bottom[0]->CanonicalAxisIndex(
+      this->layer_param_.infogain_loss_param().axis());
+  outer_num_ = bottom[0]->count(0, infogain_axis_);
+  inner_num_ = bottom[0]->count(infogain_axis_ + 1);
+  CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count())
+      << "Number of labels must match number of predictions; "
+      << "e.g., if infogain axis == 1 and prediction shape is (N, C, H, W), "
+      << "label count (number of labels) must be N*H*W, "
+      << "with integer values in {0, 1, ..., C-1}.";
+  num_labels_ = bottom[0]->shape(infogain_axis_);
   Blob<Dtype>* infogain = NULL;
   if (bottom.size() < 3) {
     infogain = &infogain_;
   } else {
     infogain = bottom[2];
   }
-  CHECK_EQ(bottom[1]->channels(), 1);
-  CHECK_EQ(bottom[1]->height(), 1);
-  CHECK_EQ(bottom[1]->width(), 1);
-  const int num = bottom[0]->num();
-  const int dim = bottom[0]->count() / num;
-  CHECK_EQ(infogain->num(), 1);
-  CHECK_EQ(infogain->channels(), 1);
-  CHECK_EQ(infogain->height(), dim);
-  CHECK_EQ(infogain->width(), dim);
+  CHECK_EQ(infogain->count(), num_labels_*num_labels_);
+  sum_rows_H_.Reshape(vector<int>(1, num_labels_));
+  if (bottom.size() == 2) {
+    // H is provided as a parameter and will not change. sum rows once
+    sum_rows_of_H(infogain);
+  }
+  if (top.size() >= 2) {
+    // softmax output
+    top[1]->ReshapeLike(*bottom[0]);
+  }
+}
+
+template <typename Dtype>
+Dtype InfogainLossLayer<Dtype>::get_normalizer(
+    LossParameter_NormalizationMode normalization_mode, int valid_count) {
+  Dtype normalizer;
+  switch (normalization_mode) {
+    case LossParameter_NormalizationMode_FULL:
+      normalizer = Dtype(outer_num_ * inner_num_);
+      break;
+    case LossParameter_NormalizationMode_VALID:
+      if (valid_count == -1) {
+        normalizer = Dtype(outer_num_ * inner_num_);
+      } else {
+        normalizer = Dtype(valid_count);
+      }
+      break;
+    case LossParameter_NormalizationMode_BATCH_SIZE:
+      normalizer = Dtype(outer_num_);
+      break;
+    case LossParameter_NormalizationMode_NONE:
+      normalizer = Dtype(1);
+      break;
+    default:
+      LOG(FATAL) << "Unknown normalization mode: "
+          << LossParameter_NormalizationMode_Name(normalization_mode);
+  }
+  // Some users will have no labels for some examples in order to 'turn off' a
+  // particular loss in a multi-task setup. The max prevents NaNs in that case.
+  return std::max(Dtype(1.0), normalizer);
 }
 
+template <typename Dtype>
+void InfogainLossLayer<Dtype>::sum_rows_of_H(const Blob<Dtype>* H) {
+  CHECK_EQ(H->count(), num_labels_*num_labels_)
+    << "H must be " << num_labels_ << "x" << num_labels_;
+  const Dtype* infogain_mat = H->cpu_data();
+  Dtype* sum = sum_rows_H_.mutable_cpu_data();
+  for ( int row = 0; row < num_labels_ ; row++ ) {
+    sum[row] = 0;
+    for ( int col = 0; col < num_labels_ ; col++ ) {
+      sum[row] += infogain_mat[row*num_labels_+col];
+    }
+  }
+}
 
 template <typename Dtype>
 void InfogainLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
-  const Dtype* bottom_data = bottom[0]->cpu_data();
+  // The forward pass computes the softmax prob values.
+  softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_);
+  const Dtype* prob_data = prob_.cpu_data();
   const Dtype* bottom_label = bottom[1]->cpu_data();
   const Dtype* infogain_mat = NULL;
   if (bottom.size() < 3) {
@@ -54,17 +138,30 @@ void InfogainLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   } else {
     infogain_mat = bottom[2]->cpu_data();
   }
-  int num = bottom[0]->num();
-  int dim = bottom[0]->count() / bottom[0]->num();
+  int count = 0;
   Dtype loss = 0;
-  for (int i = 0; i < num; ++i) {
-    int label = static_cast<int>(bottom_label[i]);
-    for (int j = 0; j < dim; ++j) {
-      Dtype prob = std::max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD));
-      loss -= infogain_mat[label * dim + j] * log(prob);
+  for (int i = 0; i < outer_num_; ++i) {
+    for (int j = 0; j < inner_num_; j++) {
+      const int label_value =
+        static_cast<int>(bottom_label[i * inner_num_ + j]);
+      if (has_ignore_label_ && label_value == ignore_label_) {
+        continue;
+      }
+      DCHECK_GE(label_value, 0);
+      DCHECK_LT(label_value, num_labels_);
+      for (int l = 0; l < num_labels_; l++) {
+        loss -= infogain_mat[label_value * num_labels_ + l] *
+          log(std::max(
+                prob_data[i * inner_num_*num_labels_ + l * inner_num_ + j],
+                Dtype(kLOG_THRESHOLD)));
+      }
+      ++count;
     }
   }
-  top[0]->mutable_cpu_data()[0] = loss / num;
+  top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_, count);
+  if (top.size() == 2) {
+    top[1]->ShareData(prob_);
+  }
 }
 
 template <typename Dtype>
@@ -80,25 +177,44 @@ void InfogainLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
                << " Layer cannot backpropagate to infogain inputs.";
   }
   if (propagate_down[0]) {
-    const Dtype* bottom_data = bottom[0]->cpu_data();
+    const Dtype* prob_data = prob_.cpu_data();
     const Dtype* bottom_label = bottom[1]->cpu_data();
     const Dtype* infogain_mat = NULL;
     if (bottom.size() < 3) {
       infogain_mat = infogain_.cpu_data();
     } else {
       infogain_mat = bottom[2]->cpu_data();
+      // H is provided as a "bottom" and might change. sum rows every time.
+      sum_rows_of_H(bottom[2]);
     }
+    const Dtype* sum_rows_H = sum_rows_H_.cpu_data();
     Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
-    int num = bottom[0]->num();
-    int dim = bottom[0]->count() / bottom[0]->num();
-    const Dtype scale = - top[0]->cpu_diff()[0] / num;
-    for (int i = 0; i < num; ++i) {
-      const int label = static_cast<int>(bottom_label[i]);
-      for (int j = 0; j < dim; ++j) {
-        Dtype prob = std::max(bottom_data[i * dim + j], Dtype(kLOG_THRESHOLD));
-        bottom_diff[i * dim + j] = scale * infogain_mat[label * dim + j] / prob;
+    const int dim = bottom[0]->count() / outer_num_;
+    int count = 0;
+    for (int i = 0; i < outer_num_; ++i) {
+      for (int j = 0; j < inner_num_; ++j) {
+        const int label_value =
+          static_cast<int>(bottom_label[i * inner_num_ + j]);
+        DCHECK_GE(label_value, 0);
+        DCHECK_LT(label_value, num_labels_);
+        if (has_ignore_label_ && label_value == ignore_label_) {
+          for (int l = 0; l < num_labels_; ++l) {
+            bottom_diff[i * dim + l * inner_num_ + j] = 0;
+          }
+        } else {
+          for (int l = 0; l < num_labels_; ++l) {
+            bottom_diff[i * dim + l * inner_num_ + j] =
+               prob_data[i*dim + l*inner_num_ + j]*sum_rows_H[label_value]
+               - infogain_mat[label_value * num_labels_ + l];
+          }
+          ++count;
+        }
       }
     }
+    // Scale gradient
+    Dtype loss_weight = top[0]->cpu_diff()[0] /
+                        get_normalizer(normalization_, count);
+    caffe_scal(bottom[0]->count(), loss_weight, bottom_diff);
   }
 }
 
diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp
index e65349f0055..57fdbe1fac2 100644
--- a/src/caffe/layers/inner_product_layer.cpp
+++ b/src/caffe/layers/inner_product_layer.cpp
@@ -42,7 +42,7 @@ void InnerProductLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
     shared_ptr<Filler<Dtype> > weight_filler(GetFiller<Dtype>(
         this->layer_param_.inner_product_param().weight_filler()));
     weight_filler->Fill(this->blobs_[0].get());
-    // If necessary, intiialize and fill the bias term
+    // If necessary, initialize and fill the bias term
     if (bias_term_) {
       vector<int> bias_shape(1, N_);
       this->blobs_[1].reset(new Blob<Dtype>(bias_shape));
diff --git a/src/caffe/layers/lstm_unit_layer.cpp b/src/caffe/layers/lstm_unit_layer.cpp
index 277c031ad15..d1ab59c4bd1 100644
--- a/src/caffe/layers/lstm_unit_layer.cpp
+++ b/src/caffe/layers/lstm_unit_layer.cpp
@@ -31,7 +31,6 @@ void LSTMUnitLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
     CHECK_EQ(num_instances, bottom[i]->shape(1));
   }
   hidden_dim_ = bottom[0]->shape(2);
-  CHECK_EQ(num_instances, bottom[1]->shape(1));
   CHECK_EQ(4 * hidden_dim_, bottom[1]->shape(2));
   top[0]->ReshapeLike(*bottom[0]);
   top[1]->ReshapeLike(*bottom[0]);
diff --git a/src/caffe/layers/pooling_layer.cpp b/src/caffe/layers/pooling_layer.cpp
index 90897db0f45..32dc04824d8 100644
--- a/src/caffe/layers/pooling_layer.cpp
+++ b/src/caffe/layers/pooling_layer.cpp
@@ -35,6 +35,7 @@ void PoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       || (!pool_param.has_stride_h() && !pool_param.has_stride_w()))
       << "Stride is stride OR stride_h and stride_w are required.";
   global_pooling_ = pool_param.global_pooling();
+  round_mode_ = pool_param.round_mode();
   if (global_pooling_) {
     kernel_h_ = bottom[0]->height();
     kernel_w_ = bottom[0]->width();
@@ -87,10 +88,22 @@ void PoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
     kernel_h_ = bottom[0]->height();
     kernel_w_ = bottom[0]->width();
   }
-  pooled_height_ = static_cast<int>(ceil(static_cast<float>(
-      height_ + 2 * pad_h_ - kernel_h_) / stride_h_)) + 1;
-  pooled_width_ = static_cast<int>(ceil(static_cast<float>(
-      width_ + 2 * pad_w_ - kernel_w_) / stride_w_)) + 1;
+  switch (round_mode_) {
+  case PoolingParameter_RoundMode_CEIL:
+    pooled_height_ = static_cast<int>(ceil(static_cast<float>(
+        height_ + 2 * pad_h_ - kernel_h_) / stride_h_)) + 1;
+    pooled_width_ = static_cast<int>(ceil(static_cast<float>(
+        width_ + 2 * pad_w_ - kernel_w_) / stride_w_)) + 1;
+    break;
+  case PoolingParameter_RoundMode_FLOOR:
+    pooled_height_ = static_cast<int>(floor(static_cast<float>(
+        height_ + 2 * pad_h_ - kernel_h_) / stride_h_)) + 1;
+    pooled_width_ = static_cast<int>(floor(static_cast<float>(
+        width_ + 2 * pad_w_ - kernel_w_) / stride_w_)) + 1;
+    break;
+  default:
+    LOG(FATAL) << "Unknown rounding mode.";
+  }
   if (pad_h_ || pad_w_) {
     // If we have padding, ensure that the last pooling starts strictly
     // inside the image (instead of at the padding); otherwise clip the last.
@@ -132,7 +145,7 @@ void PoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
   const int top_count = top[0]->count();
   // We'll output the mask to top[1] if it's of size >1.
   const bool use_top_mask = top.size() > 1;
-  int* mask = NULL;  // suppress warnings about uninitalized variables
+  int* mask = NULL;  // suppress warnings about uninitialized variables
   Dtype* top_mask = NULL;
   // Different pooling methods. We explicitly do the switch outside the for
   // loop to save time, although this results in more code.
diff --git a/src/caffe/layers/pooling_layer.cu b/src/caffe/layers/pooling_layer.cu
index 1ea46cc81b1..46eddb94924 100644
--- a/src/caffe/layers/pooling_layer.cu
+++ b/src/caffe/layers/pooling_layer.cu
@@ -138,7 +138,7 @@ __global__ void StoPoolForwardTest(const int nthreads,
     const int wstart = pw * stride_w;
     const int wend = min(wstart + kernel_w, width);
     // We set cumsum to be 0 to avoid divide-by-zero problems
-    Dtype cumsum = FLT_MIN;
+    Dtype cumsum = 0.;
     Dtype cumvalues = 0.;
     const Dtype* const bottom_slice =
         bottom_data + (n * channels + c) * height * width;
@@ -149,7 +149,7 @@ __global__ void StoPoolForwardTest(const int nthreads,
         cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w];
       }
     }
-    top_data[index] = cumvalues / cumsum;
+    top_data[index] = (cumsum > 0.) ? cumvalues / cumsum : 0.;
   }
 }
 
diff --git a/src/caffe/layers/recurrent_layer.cpp b/src/caffe/layers/recurrent_layer.cpp
index e0c82773392..9cd3206f66a 100644
--- a/src/caffe/layers/recurrent_layer.cpp
+++ b/src/caffe/layers/recurrent_layer.cpp
@@ -214,8 +214,9 @@ void RecurrentLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
     const int bottom_offset = 2 + static_input_;
     for (int i = bottom_offset, j = 0; i < bottom.size(); ++i, ++j) {
       CHECK(recur_input_blobs_[j]->shape() == bottom[i]->shape())
-          << "bottom[" << i << "] shape must match hidden state input shape: "
-          << recur_input_blobs_[j]->shape_string();
+          << "shape mismatch - recur_input_blobs_[" << j << "]: "
+          << recur_input_blobs_[j]->shape_string()
+          << " vs. bottom[" << i << "]: " << bottom[i]->shape_string();
       recur_input_blobs_[j]->ShareData(*bottom[i]);
     }
   }
diff --git a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu
index b9877e6a3f6..7497e4aa47d 100644
--- a/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu
+++ b/src/caffe/layers/sigmoid_cross_entropy_loss_layer.cu
@@ -48,9 +48,8 @@ void SigmoidCrossEntropyLossLayer<Dtype>::Forward_gpu(
   // Stable version of loss computation from input data
   const Dtype* input_data = bottom[0]->gpu_data();
   const Dtype* target = bottom[1]->gpu_data();
-  // Since this memory is not used for anything until it is overwritten
-  // on the backward pass, we use it here to avoid having to allocate new GPU
-  // memory to accumulate intermediate results in the kernel.
+  // Since this memory is not used for anything, we use it here to avoid having
+  // to allocate new GPU memory to accumulate intermediate results.
   Dtype* loss_data = bottom[0]->mutable_gpu_diff();
   Dtype* count_data = bottom[1]->mutable_gpu_diff();
   Dtype valid_count;
@@ -69,6 +68,10 @@ void SigmoidCrossEntropyLossLayer<Dtype>::Forward_gpu(
   caffe_gpu_asum(count, loss_data, &loss);
   normalizer_ = get_normalizer(normalization_, valid_count);
   top[0]->mutable_cpu_data()[0] = loss / normalizer_;
+
+  // Clear scratch memory to prevent interfering with backward (see #6202).
+  caffe_gpu_set(bottom[0]->count(), Dtype(0), bottom[0]->mutable_gpu_diff());
+  caffe_gpu_set(bottom[1]->count(), Dtype(0), bottom[1]->mutable_gpu_diff());
 }
 
 template <typename Dtype>
diff --git a/src/caffe/layers/sigmoid_layer.cpp b/src/caffe/layers/sigmoid_layer.cpp
index 85fd9676812..f8aa769a174 100644
--- a/src/caffe/layers/sigmoid_layer.cpp
+++ b/src/caffe/layers/sigmoid_layer.cpp
@@ -7,7 +7,7 @@ namespace caffe {
 
 template <typename Dtype>
 inline Dtype sigmoid(Dtype x) {
-  return 1. / (1. + exp(-x));
+  return 0.5 * tanh(0.5 * x) + 0.5;
 }
 
 template <typename Dtype>
diff --git a/src/caffe/layers/sigmoid_layer.cu b/src/caffe/layers/sigmoid_layer.cu
index 184c61ede83..8a4ea6616e0 100644
--- a/src/caffe/layers/sigmoid_layer.cu
+++ b/src/caffe/layers/sigmoid_layer.cu
@@ -8,7 +8,7 @@ namespace caffe {
 template <typename Dtype>
 __global__ void SigmoidForward(const int n, const Dtype* in, Dtype* out) {
   CUDA_KERNEL_LOOP(index, n) {
-    out[index] = 1. / (1. + exp(-in[index]));
+    out[index] = 0.5 * tanh(0.5 * in[index]) + 0.5;
   }
 }
 
diff --git a/src/caffe/layers/slice_layer.cpp b/src/caffe/layers/slice_layer.cpp
index 759beafe0d9..64de0964483 100644
--- a/src/caffe/layers/slice_layer.cpp
+++ b/src/caffe/layers/slice_layer.cpp
@@ -41,7 +41,9 @@ void SliceLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
   int count = 0;
   if (slice_point_.size() != 0) {
     CHECK_EQ(slice_point_.size(), top.size() - 1);
-    CHECK_LE(top.size(), bottom_slice_axis);
+    CHECK_LE(top.size(), bottom_slice_axis)
+        << "slice axis: " << slice_axis_
+        << ", bottom[0] shape: " << bottom[0]->shape_string();
     int prev = 0;
     vector<int> slices;
     for (int i = 0; i < slice_point_.size(); ++i) {
diff --git a/src/caffe/layers/softmax_loss_layer.cu b/src/caffe/layers/softmax_loss_layer.cu
index 660e1b39fe0..b3c8ffa6b6c 100644
--- a/src/caffe/layers/softmax_loss_layer.cu
+++ b/src/caffe/layers/softmax_loss_layer.cu
@@ -36,9 +36,8 @@ void SoftmaxWithLossLayer<Dtype>::Forward_gpu(
   const Dtype* label = bottom[1]->gpu_data();
   const int dim = prob_.count() / outer_num_;
   const int nthreads = outer_num_ * inner_num_;
-  // Since this memory is not used for anything until it is overwritten
-  // on the backward pass, we use it here to avoid having to allocate new GPU
-  // memory to accumulate intermediate results in the kernel.
+  // Since this memory is not used for anything, we use it here to avoid having
+  // to allocate new GPU memory to accumulate intermediate results.
   Dtype* loss_data = bottom[0]->mutable_gpu_diff();
   // Similarly, this memory is never used elsewhere, and thus we can use it
   // to avoid having to allocate additional GPU memory.
@@ -61,6 +60,9 @@ void SoftmaxWithLossLayer<Dtype>::Forward_gpu(
   if (top.size() == 2) {
     top[1]->ShareData(prob_);
   }
+
+  // Clear scratch memory to prevent interfering with backward (see #6202).
+  caffe_gpu_set(bottom[0]->count(), Dtype(0), bottom[0]->mutable_gpu_diff());
 }
 
 template <typename Dtype>
diff --git a/src/caffe/layers/swish_layer.cpp b/src/caffe/layers/swish_layer.cpp
new file mode 100644
index 00000000000..28935679d00
--- /dev/null
+++ b/src/caffe/layers/swish_layer.cpp
@@ -0,0 +1,68 @@
+#include <cmath>
+#include <vector>
+
+#include "caffe/layers/swish_layer.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void SwishLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  NeuronLayer<Dtype>::LayerSetUp(bottom, top);
+  sigmoid_bottom_vec_.clear();
+  sigmoid_bottom_vec_.push_back(sigmoid_input_.get());
+  sigmoid_top_vec_.clear();
+  sigmoid_top_vec_.push_back(sigmoid_output_.get());
+  sigmoid_layer_->SetUp(sigmoid_bottom_vec_, sigmoid_top_vec_);
+}
+
+template <typename Dtype>
+void SwishLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
+      const vector<Blob<Dtype>*>& top) {
+  NeuronLayer<Dtype>::Reshape(bottom, top);
+  sigmoid_input_->ReshapeLike(*bottom[0]);
+  sigmoid_layer_->Reshape(sigmoid_bottom_vec_, sigmoid_top_vec_);
+}
+
+template <typename Dtype>
+void SwishLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->cpu_data();
+  Dtype* sigmoid_input_data = sigmoid_input_->mutable_cpu_data();
+  Dtype* top_data = top[0]->mutable_cpu_data();
+  const int count = bottom[0]->count();
+  Dtype beta = this->layer_param_.swish_param().beta();
+  caffe_copy(count, bottom_data, sigmoid_input_data);
+  caffe_scal(count, beta, sigmoid_input_data);
+  sigmoid_layer_->Forward(sigmoid_bottom_vec_, sigmoid_top_vec_);
+  caffe_mul(count, bottom_data, sigmoid_output_->cpu_data(), top_data);
+}
+
+template <typename Dtype>
+void SwishLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down,
+    const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[0]) {
+    const Dtype* top_data = top[0]->cpu_data();
+    const Dtype* top_diff = top[0]->cpu_diff();
+    const Dtype* sigmoid_output_data = sigmoid_output_->cpu_data();
+    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
+    const int count = bottom[0]->count();
+    Dtype beta = this->layer_param_.swish_param().beta();
+    for (int i = 0; i < count; ++i) {
+      const Dtype swish_x = top_data[i];
+      bottom_diff[i] = top_diff[i] * (beta * swish_x + sigmoid_output_data[i]
+          * (1. - beta * swish_x));
+    }
+  }
+}
+
+#ifdef CPU_ONLY
+STUB_GPU(SwishLayer);
+#endif
+
+INSTANTIATE_CLASS(SwishLayer);
+REGISTER_LAYER_CLASS(Swish);
+
+}  // namespace caffe
diff --git a/src/caffe/layers/swish_layer.cu b/src/caffe/layers/swish_layer.cu
new file mode 100644
index 00000000000..c4fef53bf3a
--- /dev/null
+++ b/src/caffe/layers/swish_layer.cu
@@ -0,0 +1,54 @@
+#include <cmath>
+#include <vector>
+
+#include "caffe/layers/swish_layer.hpp"
+#include "caffe/util/math_functions.hpp"
+
+namespace caffe {
+
+template <typename Dtype>
+void SwishLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+    const vector<Blob<Dtype>*>& top) {
+  const Dtype* bottom_data = bottom[0]->gpu_data();
+  Dtype* sigmoid_input_data = sigmoid_input_->mutable_gpu_data();
+  Dtype* top_data = top[0]->mutable_gpu_data();
+  const int count = bottom[0]->count();
+  Dtype beta = this->layer_param_.swish_param().beta();
+  caffe_copy(count, bottom_data, sigmoid_input_data);
+  caffe_gpu_scal(count, beta, sigmoid_input_data);
+  sigmoid_layer_->Forward(sigmoid_bottom_vec_, sigmoid_top_vec_);
+  caffe_gpu_mul(count, bottom_data, sigmoid_output_->gpu_data(), top_data);
+}
+
+template <typename Dtype>
+__global__ void SwishBackward(const int n, const Dtype* in_diff,
+    const Dtype* out_data, const Dtype* sigmoid_output_data, Dtype* out_diff,
+    const Dtype beta) {
+  CUDA_KERNEL_LOOP(index, n) {
+    const Dtype swish_x = out_data[index];
+    out_diff[index] = in_diff[index] * (beta * swish_x
+        + sigmoid_output_data[index] * (1 - beta * swish_x));
+  }
+}
+
+template <typename Dtype>
+void SwishLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
+    const vector<bool>& propagate_down,
+    const vector<Blob<Dtype>*>& bottom) {
+  if (propagate_down[0]) {
+    const Dtype* top_data = top[0]->gpu_data();
+    const Dtype* top_diff = top[0]->gpu_diff();
+    const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data();
+    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
+    const int count = bottom[0]->count();
+    Dtype beta = this->layer_param_.swish_param().beta();
+    // NOLINT_NEXT_LINE(whitespace/operators)
+    SwishBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
+        count, top_diff, top_data, sigmoid_output_data, bottom_diff, beta);
+    CUDA_POST_KERNEL_CHECK;
+  }
+}
+
+INSTANTIATE_LAYER_GPU_FUNCS(SwishLayer);
+
+}  // namespace caffe
diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
index 70d51806d8a..5e844b03899 100644
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@@ -5,7 +5,9 @@
 #include <utility>
 #include <vector>
 
+#ifdef USE_HDF5
 #include "hdf5.h"
+#endif  // USE_HDF5
 
 #include "caffe/common.hpp"
 #include "caffe/layer.hpp"
@@ -164,7 +166,7 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
   // loss.  We can skip backward computation for blobs that don't contribute
   // to the loss.
   // Also checks if all bottom blobs don't need backward computation (possible
-  // because the skip_propagate_down param) and so we can skip bacward
+  // because the skip_propagate_down param) and so we can skip backward
   // computation for the entire layer
   set<string> blobs_under_loss;
   set<string> blobs_skip_backp;
@@ -768,9 +770,8 @@ void Net<Dtype>::CopyTrainedLayersFrom(const NetParameter& param) {
 }
 
 template <typename Dtype>
-void Net<Dtype>::CopyTrainedLayersFrom(const string trained_filename) {
-  if (trained_filename.size() >= 3 &&
-      trained_filename.compare(trained_filename.size() - 3, 3, ".h5") == 0) {
+void Net<Dtype>::CopyTrainedLayersFrom(const string& trained_filename) {
+  if (H5Fis_hdf5(trained_filename.c_str())) {
     CopyTrainedLayersFromHDF5(trained_filename);
   } else {
     CopyTrainedLayersFromBinaryProto(trained_filename);
@@ -779,14 +780,15 @@ void Net<Dtype>::CopyTrainedLayersFrom(const string trained_filename) {
 
 template <typename Dtype>
 void Net<Dtype>::CopyTrainedLayersFromBinaryProto(
-    const string trained_filename) {
+    const string& trained_filename) {
   NetParameter param;
   ReadNetParamsFromBinaryFileOrDie(trained_filename, &param);
   CopyTrainedLayersFrom(param);
 }
 
 template <typename Dtype>
-void Net<Dtype>::CopyTrainedLayersFromHDF5(const string trained_filename) {
+void Net<Dtype>::CopyTrainedLayersFromHDF5(const string& trained_filename) {
+#ifdef USE_HDF5
   hid_t file_hid = H5Fopen(trained_filename.c_str(), H5F_ACC_RDONLY,
                            H5P_DEFAULT);
   CHECK_GE(file_hid, 0) << "Couldn't open " << trained_filename;
@@ -833,6 +835,10 @@ void Net<Dtype>::CopyTrainedLayersFromHDF5(const string trained_filename) {
   }
   H5Gclose(data_hid);
   H5Fclose(file_hid);
+#else
+  LOG(FATAL) << "CopyTrainedLayersFromHDF5 requires hdf5;"
+             << " compile with USE_HDF5.";
+#endif  // USE_HDF5
 }
 
 template <typename Dtype>
@@ -849,6 +855,8 @@ void Net<Dtype>::ToProto(NetParameter* param, bool write_diff) const {
 
 template <typename Dtype>
 void Net<Dtype>::ToHDF5(const string& filename, bool write_diff) const {
+// This code is taken from https://github.com/sh1r0/caffe-android-lib
+#ifdef USE_HDF5
   hid_t file_hid = H5Fcreate(filename.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT,
       H5P_DEFAULT);
   CHECK_GE(file_hid, 0)
@@ -902,6 +910,10 @@ void Net<Dtype>::ToHDF5(const string& filename, bool write_diff) const {
     H5Gclose(diff_hid);
   }
   H5Fclose(file_hid);
+// This code is taken from https://github.com/sh1r0/caffe-android-lib
+#else
+  LOG(FATAL) << "ToHDF5 requires hdf5; compile with USE_HDF5.";
+#endif  // USE_HDF5
 }
 
 template <typename Dtype>
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index a145c541957..3dcad697f6d 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -98,7 +98,7 @@ message NetParameter {
 // NOTE
 // Update the next available ID when you add a new SolverParameter field.
 //
-// SolverParameter next available ID: 42 (last added: layer_wise_reduce)
+// SolverParameter next available ID: 43 (last added: weights)
 message SolverParameter {
   //////////////////////////////////////////////////////////////////////////////
   // Specifying the train and test networks
@@ -186,7 +186,11 @@ message SolverParameter {
   optional float clip_gradients = 35 [default = -1];
 
   optional int32 snapshot = 14 [default = 0]; // The snapshot interval
-  optional string snapshot_prefix = 15; // The prefix for the snapshot.
+  // The prefix for the snapshot.
+  // If not set then is replaced by prototxt file path without extension.
+  // If is set to directory then is augmented by prototxt file name
+  // without extention.
+  optional string snapshot_prefix = 15;
   // whether to snapshot diff in the results or not. Snapshotting diff will help
   // debugging but the final protocol buffer size will be much larger.
   optional bool snapshot_diff = 16 [default = false];
@@ -241,6 +245,16 @@ message SolverParameter {
 
   // Overlap compute and communication for data parallel training
   optional bool layer_wise_reduce = 41 [default = true];
+
+  // Path to caffemodel file(s) with pretrained weights to initialize finetuning.
+  // Tha same as command line --weights parameter for caffe train command.
+  // If command line --weights parameter is specified, it has higher priority
+  // and overwrites this one(s).
+  // If --snapshot command line parameter is specified, this one(s) are ignored.
+  // If several model files are expected, they can be listed in a one 
+  // weights parameter separated by ',' (like in a command string) or
+  // in repeated weights parameters separately.
+  repeated string weights = 42;
 }
 
 // A message that stores the solver snapshots
@@ -308,7 +322,7 @@ message ParamSpec {
 // NOTE
 // Update the next available ID when you add a new LayerParameter field.
 //
-// LayerParameter next available layer-specific ID: 147 (last added: recurrent_param)
+// LayerParameter next available layer-specific ID: 149 (last added: clip_param)
 message LayerParameter {
   optional string name = 1; // the layer name
   optional string type = 2; // the layer type
@@ -364,6 +378,7 @@ message LayerParameter {
   optional ArgMaxParameter argmax_param = 103;
   optional BatchNormParameter batch_norm_param = 139;
   optional BiasParameter bias_param = 141;
+  optional ClipParameter clip_param = 148;
   optional ConcatParameter concat_param = 104;
   optional ContrastiveLossParameter contrastive_loss_param = 105;
   optional ConvolutionParameter convolution_param = 106;
@@ -401,6 +416,7 @@ message LayerParameter {
   optional SoftmaxParameter softmax_param = 125;
   optional SPPParameter spp_param = 132;
   optional SliceParameter slice_param = 126;
+  optional SwishParameter swish_param = 147;
   optional TanHParameter tanh_param = 127;
   optional ThresholdParameter threshold_param = 128;
   optional TileParameter tile_param = 138;
@@ -490,6 +506,12 @@ message ArgMaxParameter {
   optional int32 axis = 3;
 }
 
+// Message that stores parameters used by ClipLayer
+message ClipParameter {
+  required float min = 1;
+  required float max = 2;
+}
+
 message ConcatParameter {
   // The axis along which to concatenate -- may be negative to index from the
   // end (e.g., -1 for the last axis).  Other axes must have the
@@ -502,11 +524,21 @@ message ConcatParameter {
 }
 
 message BatchNormParameter {
-  // If false, accumulate global mean/variance values via a moving average. If
-  // true, use those accumulated values instead of computing mean/variance
-  // across the batch.
+  // If false, normalization is performed over the current mini-batch
+  // and global statistics are accumulated (but not yet used) by a moving
+  // average.
+  // If true, those accumulated mean and variance values are used for the
+  // normalization.
+  // By default, it is set to false when the network is in the training
+  // phase and true when the network is in the testing phase.
   optional bool use_global_stats = 1;
-  // How much does the moving average decay each iteration?
+  // What fraction of the moving average remains each iteration?
+  // Smaller values make the moving average decay faster, giving more
+  // weight to the recent values.
+  // Each iteration updates the moving average @f$S_{t-1}@f$ with the
+  // current mean @f$ Y_t @f$ by
+  // @f$ S_t = (1-\beta)Y_t + \beta \cdot S_{t-1} @f$, where @f$ \beta @f$
+  // is the moving_average_fraction parameter.
   optional float moving_average_fraction = 2 [default = .999];
   // Small value to add to the variance estimate so that we don't divide by
   // zero.
@@ -805,6 +837,7 @@ message ImageDataParameter {
 message InfogainLossParameter {
   // Specify the infogain matrix source.
   optional string source = 1;
+  optional int32 axis = 2 [default = 1]; // axis of prob
 }
 
 message InnerProductParameter {
@@ -910,6 +943,12 @@ message PoolingParameter {
   // If global_pooling then it will pool over the size of the bottom by doing
   // kernel_h = bottom->height and kernel_w = bottom->width
   optional bool global_pooling = 12 [default = false];
+  // How to calculate the output size - using ceil (default) or floor rounding.
+  enum RoundMode {
+    CEIL = 0;
+    FLOOR = 1;
+  }
+  optional RoundMode round_mode = 13 [default = CEIL];
 }
 
 message PowerParameter {
@@ -927,9 +966,7 @@ message PythonParameter {
   // string, dictionary in Python dict format, JSON, etc. You may parse this
   // string in `setup` method and use it in `forward` and `backward`.
   optional string param_str = 3 [default = ''];
-  // Whether this PythonLayer is shared among worker solvers during data parallelism.
-  // If true, each worker solver sequentially run forward from this layer.
-  // This value should be set true if you are using it as a data layer.
+  // DEPRECATED
   optional bool share_in_parallel = 4 [default = false];
 }
 
@@ -1133,6 +1170,15 @@ message SoftmaxParameter {
   optional int32 axis = 2 [default = 1];
 }
 
+// Message that stores parameters used by SwishLayer
+message SwishParameter {
+  // Beta parameter for the Swish activation function
+  // Described in:
+  // Prajit Ramachandran, Barret Zoph, Quoc V. Le. (2017). Searching for
+  // Activation Functions. https://arxiv.org/abs/1710.05941v2
+  optional float beta = 1 [default = 1];
+}
+
 message TanHParameter {
   enum Engine {
     DEFAULT = 0;
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index fd4c03724ef..842312e0b76 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -3,6 +3,7 @@
 #include <string>
 #include <vector>
 
+#include "boost/algorithm/string.hpp"
 #include "caffe/solver.hpp"
 #include "caffe/util/format.hpp"
 #include "caffe/util/hdf5.hpp"
@@ -51,19 +52,33 @@ void Solver<Dtype>::Init(const SolverParameter& param) {
   }
   // Scaffolding code
   InitTrainNet();
+  InitTestNets();
   if (Caffe::root_solver()) {
-    InitTestNets();
     LOG(INFO) << "Solver scaffolding done.";
   }
   iter_ = 0;
   current_step_ = 0;
 }
 
+// Load weights from the caffemodel(s) specified in "weights" solver parameter
+// into the train and test nets.
+template <typename Dtype>
+void LoadNetWeights(shared_ptr<Net<Dtype> > net,
+    const std::string& model_list) {
+  std::vector<std::string> model_names;
+  boost::split(model_names, model_list, boost::is_any_of(","));
+  for (int i = 0; i < model_names.size(); ++i) {
+    boost::trim(model_names[i]);
+    LOG(INFO) << "Finetuning from " << model_names[i];
+    net->CopyTrainedLayersFrom(model_names[i]);
+  }
+}
+
 template <typename Dtype>
 void Solver<Dtype>::InitTrainNet() {
   const int num_train_nets = param_.has_net() + param_.has_net_param() +
       param_.has_train_net() + param_.has_train_net_param();
-  const string& field_names = "net, net_param, train_net, train_net_param";
+  const string field_names = "net, net_param, train_net, train_net_param";
   CHECK_GE(num_train_nets, 1) << "SolverParameter must specify a train net "
       << "using one of these fields: " << field_names;
   CHECK_LE(num_train_nets, 1) << "SolverParameter must not contain more than "
@@ -98,11 +113,13 @@ void Solver<Dtype>::InitTrainNet() {
   net_state.MergeFrom(param_.train_state());
   net_param.mutable_state()->CopyFrom(net_state);
   net_.reset(new Net<Dtype>(net_param));
+  for (int w_idx = 0; w_idx < param_.weights_size(); ++w_idx) {
+    LoadNetWeights(net_, param_.weights(w_idx));
+  }
 }
 
 template <typename Dtype>
 void Solver<Dtype>::InitTestNets() {
-  CHECK(Caffe::root_solver());
   const bool has_net_param = param_.has_net_param();
   const bool has_net_file = param_.has_net();
   const int num_generic_nets = has_net_param + has_net_file;
@@ -174,6 +191,9 @@ void Solver<Dtype>::InitTestNets() {
         << "Creating test net (#" << i << ") specified by " << sources[i];
     test_nets_[i].reset(new Net<Dtype>(net_params[i]));
     test_nets_[i]->set_debug_info(param_.debug_info());
+    for (int w_idx = 0; w_idx < param_.weights_size(); ++w_idx) {
+      LoadNetWeights(test_nets_[i], param_.weights(w_idx));
+    }
   }
 }
 
@@ -246,10 +266,6 @@ void Solver<Dtype>::Step(int iters) {
     }
     ApplyUpdate();
 
-    // Increment the internal iter_ counter -- its value should always indicate
-    // the number of times the weights have been updated.
-    ++iter_;
-
     SolverAction::Enum request = GetRequestedAction();
 
     // Save a snapshot if needed.
@@ -431,13 +447,13 @@ void Solver<Dtype>::CheckSnapshotWritePermissions() {
     } else {
       LOG(FATAL) << "Cannot write to snapshot prefix '"
           << param_.snapshot_prefix() << "'.  Make sure "
-          << "that the directory exists and is writeable.";
+          << "that the directory exists and is writable.";
     }
   }
 }
 
 template <typename Dtype>
-string Solver<Dtype>::SnapshotFilename(const string extension) {
+string Solver<Dtype>::SnapshotFilename(const string& extension) {
   return param_.snapshot_prefix() + "_iter_" + caffe::format_int(iter_)
     + extension;
 }
diff --git a/src/caffe/solvers/sgd_solver.cpp b/src/caffe/solvers/sgd_solver.cpp
index ad6abe54a0a..081c47eb514 100644
--- a/src/caffe/solvers/sgd_solver.cpp
+++ b/src/caffe/solvers/sgd_solver.cpp
@@ -30,12 +30,16 @@ Dtype SGDSolver<Dtype>::GetLearningRate() {
   if (lr_policy == "fixed") {
     rate = this->param_.base_lr();
   } else if (lr_policy == "step") {
+    CHECK_GT(this->param_.stepsize(), 0);
     this->current_step_ = this->iter_ / this->param_.stepsize();
+    CHECK_GE(this->param_.gamma(), 0);
     rate = this->param_.base_lr() *
         pow(this->param_.gamma(), this->current_step_);
   } else if (lr_policy == "exp") {
+    CHECK_GE(this->param_.gamma(), 0);
     rate = this->param_.base_lr() * pow(this->param_.gamma(), this->iter_);
   } else if (lr_policy == "inv") {
+    CHECK_GE(this->param_.gamma(), 0);
     rate = this->param_.base_lr() *
         pow(Dtype(1) + this->param_.gamma() * this->iter_,
             - this->param_.power());
@@ -46,6 +50,7 @@ Dtype SGDSolver<Dtype>::GetLearningRate() {
       LOG(INFO) << "MultiStep Status: Iteration " <<
       this->iter_ << ", step = " << this->current_step_;
     }
+    CHECK_GE(this->param_.gamma(), 0);
     rate = this->param_.base_lr() *
         pow(this->param_.gamma(), this->current_step_);
   } else if (lr_policy == "poly") {
@@ -53,6 +58,8 @@ Dtype SGDSolver<Dtype>::GetLearningRate() {
         (Dtype(this->iter_) / Dtype(this->param_.max_iter())),
         this->param_.power());
   } else if (lr_policy == "sigmoid") {
+    CHECK_GE(this->param_.gamma(), 0);
+    CHECK_GT(this->param_.stepsize(), 0);
     rate = this->param_.base_lr() * (Dtype(1.) /
         (Dtype(1.) + exp(-this->param_.gamma() * (Dtype(this->iter_) -
           Dtype(this->param_.stepsize())))));
@@ -113,6 +120,10 @@ void SGDSolver<Dtype>::ApplyUpdate() {
     ComputeUpdateValue(param_id, rate);
   }
   this->net_->Update();
+
+  // Increment the internal iter_ counter -- its value should always indicate
+  // the number of times the weights have been updated.
+  ++this->iter_;
 }
 
 template <typename Dtype>
@@ -278,6 +289,8 @@ void SGDSolver<Dtype>::SnapshotSolverStateToBinaryProto(
 template <typename Dtype>
 void SGDSolver<Dtype>::SnapshotSolverStateToHDF5(
     const string& model_filename) {
+// This code is taken from https://github.com/sh1r0/caffe-android-lib
+#ifdef USE_HDF5
   string snapshot_filename =
       Solver<Dtype>::SnapshotFilename(".solverstate.h5");
   LOG(INFO) << "Snapshotting solver state to HDF5 file " << snapshot_filename;
@@ -299,6 +312,11 @@ void SGDSolver<Dtype>::SnapshotSolverStateToHDF5(
   }
   H5Gclose(history_hid);
   H5Fclose(file_hid);
+// This code is taken from https://github.com/sh1r0/caffe-android-lib
+#else
+  LOG(FATAL) << "SnapshotSolverStateToHDF5 requires hdf5;"
+             << " compile with USE_HDF5.";
+#endif  // USE_HDF5
 }
 
 template <typename Dtype>
@@ -323,6 +341,7 @@ void SGDSolver<Dtype>::RestoreSolverStateFromBinaryProto(
 
 template <typename Dtype>
 void SGDSolver<Dtype>::RestoreSolverStateFromHDF5(const string& state_file) {
+#ifdef USE_HDF5
   hid_t file_hid = H5Fopen(state_file.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT);
   CHECK_GE(file_hid, 0) << "Couldn't open solver state file " << state_file;
   this->iter_ = hdf5_load_int(file_hid, "iter");
@@ -344,6 +363,10 @@ void SGDSolver<Dtype>::RestoreSolverStateFromHDF5(const string& state_file) {
   }
   H5Gclose(history_hid);
   H5Fclose(file_hid);
+#else
+  LOG(FATAL) << "RestoreSolverStateFromHDF5 requires hdf5;"
+             << " compile with USE_HDF5.";
+#endif  // USE_HDF5
 }
 
 INSTANTIATE_CLASS(SGDSolver);
diff --git a/src/caffe/test/test_accuracy_layer.cpp b/src/caffe/test/test_accuracy_layer.cpp
index 6fe808bd5c5..e5cc9d5e85f 100644
--- a/src/caffe/test/test_accuracy_layer.cpp
+++ b/src/caffe/test/test_accuracy_layer.cpp
@@ -13,8 +13,10 @@
 
 namespace caffe {
 
-template <typename Dtype>
-class AccuracyLayerTest : public CPUDeviceTest<Dtype> {
+template <typename TypeParam>
+class AccuracyLayerTest : public MultiDeviceTest<TypeParam> {
+  typedef typename TypeParam::Dtype Dtype;
+
  protected:
   AccuracyLayerTest()
       : blob_bottom_data_(new Blob<Dtype>()),
@@ -69,11 +71,12 @@ class AccuracyLayerTest : public CPUDeviceTest<Dtype> {
   int top_k_;
 };
 
-TYPED_TEST_CASE(AccuracyLayerTest, TestDtypes);
+TYPED_TEST_CASE(AccuracyLayerTest, TestDtypesAndDevices);
 
 TYPED_TEST(AccuracyLayerTest, TestSetup) {
+  typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
-  AccuracyLayer<TypeParam> layer(layer_param);
+  AccuracyLayer<Dtype> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
   EXPECT_EQ(this->blob_top_->num(), 1);
   EXPECT_EQ(this->blob_top_->channels(), 1);
@@ -82,11 +85,12 @@ TYPED_TEST(AccuracyLayerTest, TestSetup) {
 }
 
 TYPED_TEST(AccuracyLayerTest, TestSetupTopK) {
+  typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
   AccuracyParameter* accuracy_param =
       layer_param.mutable_accuracy_param();
   accuracy_param->set_top_k(5);
-  AccuracyLayer<TypeParam> layer(layer_param);
+  AccuracyLayer<Dtype> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
   EXPECT_EQ(this->blob_top_->num(), 1);
   EXPECT_EQ(this->blob_top_->channels(), 1);
@@ -95,8 +99,9 @@ TYPED_TEST(AccuracyLayerTest, TestSetupTopK) {
 }
 
 TYPED_TEST(AccuracyLayerTest, TestSetupOutputPerClass) {
+  typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
-  AccuracyLayer<TypeParam> layer(layer_param);
+  AccuracyLayer<Dtype> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, this->blob_top_per_class_vec_);
   EXPECT_EQ(this->blob_top_->num(), 1);
   EXPECT_EQ(this->blob_top_->channels(), 1);
@@ -108,33 +113,39 @@ TYPED_TEST(AccuracyLayerTest, TestSetupOutputPerClass) {
   EXPECT_EQ(this->blob_top_per_class_->width(), 1);
 }
 
-TYPED_TEST(AccuracyLayerTest, TestForwardCPU) {
+TYPED_TEST(AccuracyLayerTest, TestForward) {
+  typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
-  AccuracyLayer<TypeParam> layer(layer_param);
+  AccuracyLayer<Dtype> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
-
-  TypeParam max_value;
-  int max_id;
-  int num_correct_labels = 0;
-  for (int i = 0; i < 100; ++i) {
-    max_value = -FLT_MAX;
-    max_id = 0;
-    for (int j = 0; j < 10; ++j) {
-      if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) {
-        max_value = this->blob_bottom_data_->data_at(i, j, 0, 0);
-        max_id = j;
+
+  // repeat the forward
+  for (int iter = 0; iter < 3; iter++) {
+    layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+
+    Dtype max_value;
+    int max_id;
+    int num_correct_labels = 0;
+    for (int i = 0; i < 100; ++i) {
+      max_value = -FLT_MAX;
+      max_id = 0;
+      for (int j = 0; j < 10; ++j) {
+        if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) {
+          max_value = this->blob_bottom_data_->data_at(i, j, 0, 0);
+          max_id = j;
+        }
+      }
+      if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
+        ++num_correct_labels;
       }
     }
-    if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
-      ++num_correct_labels;
-    }
+    EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
+                num_correct_labels / Dtype(100.0), 1e-4);
   }
-  EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
-              num_correct_labels / 100.0, 1e-4);
 }
 
 TYPED_TEST(AccuracyLayerTest, TestForwardWithSpatialAxes) {
+  typedef typename TypeParam::Dtype Dtype;
   this->blob_bottom_data_->Reshape(2, 10, 4, 5);
   vector<int> label_shape(3);
   label_shape[0] = 2; label_shape[1] = 4; label_shape[2] = 5;
@@ -142,195 +153,218 @@ TYPED_TEST(AccuracyLayerTest, TestForwardWithSpatialAxes) {
   this->FillBottoms();
   LayerParameter layer_param;
   layer_param.mutable_accuracy_param()->set_axis(1);
-  AccuracyLayer<TypeParam> layer(layer_param);
+  AccuracyLayer<Dtype> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
-
-  TypeParam max_value;
-  const int num_labels = this->blob_bottom_label_->count();
-  int max_id;
-  int num_correct_labels = 0;
-  vector<int> label_offset(3);
-  for (int n = 0; n < this->blob_bottom_data_->num(); ++n) {
-    for (int h = 0; h < this->blob_bottom_data_->height(); ++h) {
-      for (int w = 0; w < this->blob_bottom_data_->width(); ++w) {
-        max_value = -FLT_MAX;
-        max_id = 0;
-        for (int c = 0; c < this->blob_bottom_data_->channels(); ++c) {
-          const TypeParam pred_value =
-              this->blob_bottom_data_->data_at(n, c, h, w);
-          if (pred_value > max_value) {
-            max_value = pred_value;
-            max_id = c;
+
+  // repeat the forward
+  for (int iter = 0; iter < 3; iter++) {
+    layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+
+    Dtype max_value;
+    const int num_labels = this->blob_bottom_label_->count();
+    int max_id;
+    int num_correct_labels = 0;
+    vector<int> label_offset(3);
+    for (int n = 0; n < this->blob_bottom_data_->num(); ++n) {
+      for (int h = 0; h < this->blob_bottom_data_->height(); ++h) {
+        for (int w = 0; w < this->blob_bottom_data_->width(); ++w) {
+          max_value = -FLT_MAX;
+          max_id = 0;
+          for (int c = 0; c < this->blob_bottom_data_->channels(); ++c) {
+            const Dtype pred_value =
+                this->blob_bottom_data_->data_at(n, c, h, w);
+            if (pred_value > max_value) {
+              max_value = pred_value;
+              max_id = c;
+            }
+          }
+          label_offset[0] = n; label_offset[1] = h; label_offset[2] = w;
+          const int correct_label =
+              static_cast<int>(this->blob_bottom_label_->data_at(label_offset));
+          if (max_id == correct_label) {
+            ++num_correct_labels;
           }
-        }
-        label_offset[0] = n; label_offset[1] = h; label_offset[2] = w;
-        const int correct_label =
-            static_cast<int>(this->blob_bottom_label_->data_at(label_offset));
-        if (max_id == correct_label) {
-          ++num_correct_labels;
         }
       }
     }
+    EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
+                num_correct_labels / Dtype(num_labels), 1e-4);
   }
-  EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
-              num_correct_labels / TypeParam(num_labels), 1e-4);
 }
 
 TYPED_TEST(AccuracyLayerTest, TestForwardIgnoreLabel) {
+  typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
-  const TypeParam kIgnoreLabelValue = -1;
+  const Dtype kIgnoreLabelValue = -1;
   layer_param.mutable_accuracy_param()->set_ignore_label(kIgnoreLabelValue);
-  AccuracyLayer<TypeParam> layer(layer_param);
+  AccuracyLayer<Dtype> layer(layer_param);
   // Manually set some labels to the ignore label value (-1).
   this->blob_bottom_label_->mutable_cpu_data()[2] = kIgnoreLabelValue;
   this->blob_bottom_label_->mutable_cpu_data()[5] = kIgnoreLabelValue;
   this->blob_bottom_label_->mutable_cpu_data()[32] = kIgnoreLabelValue;
   layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
-
-  TypeParam max_value;
-  int max_id;
-  int num_correct_labels = 0;
-  int count = 0;
-  for (int i = 0; i < 100; ++i) {
-    if (kIgnoreLabelValue == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
-      continue;
-    }
-    ++count;
-    max_value = -FLT_MAX;
-    max_id = 0;
-    for (int j = 0; j < 10; ++j) {
-      if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) {
-        max_value = this->blob_bottom_data_->data_at(i, j, 0, 0);
-        max_id = j;
+
+  // repeat the forward
+  for (int iter = 0; iter < 3; iter++) {
+    layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+
+    Dtype max_value;
+    int max_id;
+    int num_correct_labels = 0;
+    int count = 0;
+    for (int i = 0; i < 100; ++i) {
+      if (kIgnoreLabelValue == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
+        continue;
+      }
+      ++count;
+      max_value = -FLT_MAX;
+      max_id = 0;
+      for (int j = 0; j < 10; ++j) {
+        if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) {
+          max_value = this->blob_bottom_data_->data_at(i, j, 0, 0);
+          max_id = j;
+        }
+      }
+      if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
+        ++num_correct_labels;
       }
     }
-    if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
-      ++num_correct_labels;
-    }
+    EXPECT_EQ(count, 97);  // We set 3 out of 100 labels to kIgnoreLabelValue.
+    EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
+                num_correct_labels / Dtype(count), 1e-4);
   }
-  EXPECT_EQ(count, 97);  // We set 3 out of 100 labels to kIgnoreLabelValue.
-  EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
-              num_correct_labels / TypeParam(count), 1e-4);
 }
 
-TYPED_TEST(AccuracyLayerTest, TestForwardCPUTopK) {
+TYPED_TEST(AccuracyLayerTest, TestForwardTopK) {
+  typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
   AccuracyParameter* accuracy_param = layer_param.mutable_accuracy_param();
   accuracy_param->set_top_k(this->top_k_);
-  AccuracyLayer<TypeParam> layer(layer_param);
+  AccuracyLayer<Dtype> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
-  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
-
-  TypeParam current_value;
-  int current_rank;
-  int num_correct_labels = 0;
-  for (int i = 0; i < 100; ++i) {
-    for (int j = 0; j < 10; ++j) {
-      current_value = this->blob_bottom_data_->data_at(i, j, 0, 0);
-      current_rank = 0;
-      for (int k = 0; k < 10; ++k) {
-        if (this->blob_bottom_data_->data_at(i, k, 0, 0) > current_value) {
-          ++current_rank;
+
+  // repeat the forward
+  for (int iter = 0; iter < 3; iter++) {
+    layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+
+    Dtype current_value;
+    int current_rank;
+    int num_correct_labels = 0;
+    for (int i = 0; i < 100; ++i) {
+      for (int j = 0; j < 10; ++j) {
+        current_value = this->blob_bottom_data_->data_at(i, j, 0, 0);
+        current_rank = 0;
+        for (int k = 0; k < 10; ++k) {
+          if (this->blob_bottom_data_->data_at(i, k, 0, 0) > current_value) {
+            ++current_rank;
+          }
+        }
+        if (current_rank < this->top_k_ &&
+            j == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
+          ++num_correct_labels;
         }
-      }
-      if (current_rank < this->top_k_ &&
-          j == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
-        ++num_correct_labels;
       }
     }
-  }
 
-  EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
-              num_correct_labels / 100.0, 1e-4);
+    EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
+                num_correct_labels / Dtype(100.0), 1e-4);
+  }
 }
 
-TYPED_TEST(AccuracyLayerTest, TestForwardCPUPerClass) {
+TYPED_TEST(AccuracyLayerTest, TestForwardPerClass) {
+  typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
-  AccuracyLayer<TypeParam> layer(layer_param);
+  AccuracyLayer<Dtype> layer(layer_param);
   layer.SetUp(this->blob_bottom_vec_, this->blob_top_per_class_vec_);
-  layer.Forward(this->blob_bottom_vec_, this->blob_top_per_class_vec_);
-
-  TypeParam max_value;
-  int max_id;
-  int num_correct_labels = 0;
-  const int num_class = this->blob_top_per_class_->num();
-  vector<int> correct_per_class(num_class, 0);
-  vector<int> num_per_class(num_class, 0);
-  for (int i = 0; i < 100; ++i) {
-    max_value = -FLT_MAX;
-    max_id = 0;
-    for (int j = 0; j < 10; ++j) {
-      if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) {
-        max_value = this->blob_bottom_data_->data_at(i, j, 0, 0);
-        max_id = j;
+  // repeat the forward
+  for (int iter = 0; iter < 3; iter++) {
+    layer.Forward(this->blob_bottom_vec_, this->blob_top_per_class_vec_);
+
+    Dtype max_value;
+    int max_id;
+    int num_correct_labels = 0;
+    const int num_class = this->blob_top_per_class_->num();
+    vector<int> correct_per_class(num_class, 0);
+    vector<int> num_per_class(num_class, 0);
+    for (int i = 0; i < 100; ++i) {
+      max_value = -FLT_MAX;
+      max_id = 0;
+      for (int j = 0; j < 10; ++j) {
+        if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) {
+          max_value = this->blob_bottom_data_->data_at(i, j, 0, 0);
+          max_id = j;
+        }
+      }
+      ++num_per_class[this->blob_bottom_label_->data_at(i, 0, 0, 0)];
+      if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
+        ++num_correct_labels;
+        ++correct_per_class[max_id];
       }
     }
-    ++num_per_class[this->blob_bottom_label_->data_at(i, 0, 0, 0)];
-    if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
-      ++num_correct_labels;
-      ++correct_per_class[max_id];
+    EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
+                num_correct_labels / 100.0, 1e-4);
+    for (int i = 0; i < num_class; ++i) {
+      Dtype accuracy_per_class = (num_per_class[i] > 0 ?
+         static_cast<Dtype>(correct_per_class[i]) / num_per_class[i] : 0);
+      EXPECT_NEAR(this->blob_top_per_class_->data_at(i, 0, 0, 0),
+                  accuracy_per_class, 1e-4);
     }
   }
-  EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
-              num_correct_labels / 100.0, 1e-4);
-  for (int i = 0; i < num_class; ++i) {
-    TypeParam accuracy_per_class = (num_per_class[i] > 0 ?
-       static_cast<TypeParam>(correct_per_class[i]) / num_per_class[i] : 0);
-    EXPECT_NEAR(this->blob_top_per_class_->data_at(i, 0, 0, 0),
-                accuracy_per_class, 1e-4);
-  }
 }
 
 
-TYPED_TEST(AccuracyLayerTest, TestForwardCPUPerClassWithIgnoreLabel) {
+TYPED_TEST(AccuracyLayerTest, TestForwardPerClassWithIgnoreLabel) {
+  typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
-  const TypeParam kIgnoreLabelValue = -1;
+  const Dtype kIgnoreLabelValue = -1;
   layer_param.mutable_accuracy_param()->set_ignore_label(kIgnoreLabelValue);
-  AccuracyLayer<TypeParam> layer(layer_param);
+  AccuracyLayer<Dtype> layer(layer_param);
   // Manually set some labels to the ignore label value (-1).
   this->blob_bottom_label_->mutable_cpu_data()[2] = kIgnoreLabelValue;
   this->blob_bottom_label_->mutable_cpu_data()[5] = kIgnoreLabelValue;
   this->blob_bottom_label_->mutable_cpu_data()[32] = kIgnoreLabelValue;
   layer.SetUp(this->blob_bottom_vec_, this->blob_top_per_class_vec_);
-  layer.Forward(this->blob_bottom_vec_, this->blob_top_per_class_vec_);
-
-  TypeParam max_value;
-  int max_id;
-  int num_correct_labels = 0;
-  const int num_class = this->blob_top_per_class_->num();
-  vector<int> correct_per_class(num_class, 0);
-  vector<int> num_per_class(num_class, 0);
-  int count = 0;
-  for (int i = 0; i < 100; ++i) {
-    if (kIgnoreLabelValue == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
-      continue;
-    }
-    ++count;
-    max_value = -FLT_MAX;
-    max_id = 0;
-    for (int j = 0; j < 10; ++j) {
-      if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) {
-        max_value = this->blob_bottom_data_->data_at(i, j, 0, 0);
-        max_id = j;
+
+  // repeat the forward
+  for (int iter = 0; iter < 3; iter++) {
+    layer.Forward(this->blob_bottom_vec_, this->blob_top_per_class_vec_);
+
+    Dtype max_value;
+    int max_id;
+    int num_correct_labels = 0;
+    const int num_class = this->blob_top_per_class_->num();
+    vector<int> correct_per_class(num_class, 0);
+    vector<int> num_per_class(num_class, 0);
+    int count = 0;
+    for (int i = 0; i < 100; ++i) {
+      if (kIgnoreLabelValue == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
+        continue;
+      }
+      ++count;
+      max_value = -FLT_MAX;
+      max_id = 0;
+      for (int j = 0; j < 10; ++j) {
+        if (this->blob_bottom_data_->data_at(i, j, 0, 0) > max_value) {
+          max_value = this->blob_bottom_data_->data_at(i, j, 0, 0);
+          max_id = j;
+        }
+      }
+      ++num_per_class[this->blob_bottom_label_->data_at(i, 0, 0, 0)];
+      if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
+        ++num_correct_labels;
+        ++correct_per_class[max_id];
       }
     }
-    ++num_per_class[this->blob_bottom_label_->data_at(i, 0, 0, 0)];
-    if (max_id == this->blob_bottom_label_->data_at(i, 0, 0, 0)) {
-      ++num_correct_labels;
-      ++correct_per_class[max_id];
+    EXPECT_EQ(count, 97);
+    EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
+                num_correct_labels / Dtype(count), 1e-4);
+    for (int i = 0; i < 10; ++i) {
+      Dtype accuracy_per_class = (num_per_class[i] > 0 ?
+         static_cast<Dtype>(correct_per_class[i]) / num_per_class[i] : 0);
+      EXPECT_NEAR(this->blob_top_per_class_->data_at(i, 0, 0, 0),
+                  accuracy_per_class, 1e-4);
     }
   }
-  EXPECT_EQ(count, 97);
-  EXPECT_NEAR(this->blob_top_->data_at(0, 0, 0, 0),
-              num_correct_labels / TypeParam(count), 1e-4);
-  for (int i = 0; i < 10; ++i) {
-    TypeParam accuracy_per_class = (num_per_class[i] > 0 ?
-       static_cast<TypeParam>(correct_per_class[i]) / num_per_class[i] : 0);
-    EXPECT_NEAR(this->blob_top_per_class_->data_at(i, 0, 0, 0),
-                accuracy_per_class, 1e-4);
-  }
 }
 
 }  // namespace caffe
diff --git a/src/caffe/test/test_caffe_main.cpp b/src/caffe/test/test_caffe_main.cpp
index 6473b74d0a6..8f333bd7105 100644
--- a/src/caffe/test/test_caffe_main.cpp
+++ b/src/caffe/test/test_caffe_main.cpp
@@ -15,7 +15,7 @@ int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   caffe::GlobalInit(&argc, &argv);
 #ifndef CPU_ONLY
-  // Before starting testing, let's first print out a few cuda defice info.
+  // Before starting testing, let's first print out a few cuda device info.
   int device;
   cudaGetDeviceCount(&device);
   cout << "Cuda number of devices: " << device << endl;
diff --git a/src/caffe/test/test_convolution_layer.cpp b/src/caffe/test/test_convolution_layer.cpp
index 9bb19d13592..85c10a29483 100644
--- a/src/caffe/test/test_convolution_layer.cpp
+++ b/src/caffe/test/test_convolution_layer.cpp
@@ -695,7 +695,7 @@ TYPED_TEST(ConvolutionLayerTest, TestNDAgainst2D) {
   }
   ASSERT_EQ(backward_result_nd.count(), backward_result_2d.count());
   for (int i = 0; i < backward_result_2d.count(); ++i) {
-    EXPECT_EQ(backward_result_2d.cpu_diff()[i],
+    EXPECT_FLOAT_EQ(backward_result_2d.cpu_diff()[i],
               backward_result_nd.cpu_diff()[i]);
   }
   ASSERT_EQ(backward_weight_result_nd.count(),
diff --git a/src/caffe/test/test_deconvolution_layer.cpp b/src/caffe/test/test_deconvolution_layer.cpp
index c4b09ad555a..1a022d6f7bc 100644
--- a/src/caffe/test/test_deconvolution_layer.cpp
+++ b/src/caffe/test/test_deconvolution_layer.cpp
@@ -5,6 +5,7 @@
 #include "caffe/blob.hpp"
 #include "caffe/common.hpp"
 #include "caffe/filler.hpp"
+#include "caffe/layers/cudnn_deconv_layer.hpp"
 #include "caffe/layers/deconv_layer.hpp"
 
 #include "caffe/test/test_caffe_main.hpp"
@@ -301,4 +302,268 @@ TYPED_TEST(DeconvolutionLayerTest, TestGradient3D) {
       this->blob_top_vec_);
 }
 
+#ifdef USE_CUDNN
+
+// Since ConvolutionLayerTest checks the shared conv/deconv code in detail,
+// we'll just do a simple forward test and a gradient check.
+template <typename TypeParam>
+class CuDNNDeconvolutionLayerTest : public MultiDeviceTest<TypeParam> {
+  typedef typename TypeParam::Dtype Dtype;
+
+ protected:
+  CuDNNDeconvolutionLayerTest()
+      : blob_bottom_(new Blob<Dtype>(2, 3, 6, 4)),
+        blob_bottom_2_(new Blob<Dtype>(2, 3, 6, 4)),
+        blob_top_(new Blob<Dtype>()),
+        blob_top_2_(new Blob<Dtype>()) {}
+  virtual void SetUp() {
+    // fill the values
+    FillerParameter filler_param;
+    filler_param.set_value(1.);
+    GaussianFiller<Dtype> filler(filler_param);
+    filler.Fill(this->blob_bottom_);
+    filler.Fill(this->blob_bottom_2_);
+    blob_bottom_vec_.push_back(blob_bottom_);
+    blob_top_vec_.push_back(blob_top_);
+  }
+
+  virtual ~CuDNNDeconvolutionLayerTest() {
+    delete blob_bottom_;
+    delete blob_bottom_2_;
+    delete blob_top_;
+    delete blob_top_2_;
+  }
+
+  Blob<Dtype>* const blob_bottom_;
+  Blob<Dtype>* const blob_bottom_2_;
+  Blob<Dtype>* const blob_top_;
+  Blob<Dtype>* const blob_top_2_;
+  vector<Blob<Dtype>*> blob_bottom_vec_;
+  vector<Blob<Dtype>*> blob_top_vec_;
+};
+
+TYPED_TEST_CASE(CuDNNDeconvolutionLayerTest, TestDtypesAndDevices);
+
+TYPED_TEST(CuDNNDeconvolutionLayerTest, TestSetup) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
+  convolution_param->set_num_output(4);
+  this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
+  this->blob_top_vec_.push_back(this->blob_top_2_);
+  shared_ptr<Layer<Dtype> > layer(
+      new CuDNNDeconvolutionLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  EXPECT_EQ(this->blob_top_->num(), 2);
+  EXPECT_EQ(this->blob_top_->channels(), 4);
+  EXPECT_EQ(this->blob_top_->height(), 13);
+  EXPECT_EQ(this->blob_top_->width(), 9);
+  EXPECT_EQ(this->blob_top_2_->num(), 2);
+  EXPECT_EQ(this->blob_top_2_->channels(), 4);
+  EXPECT_EQ(this->blob_top_2_->height(), 13);
+  EXPECT_EQ(this->blob_top_2_->width(), 9);
+  // setting group should not change the shape
+  convolution_param->set_num_output(3);
+  convolution_param->set_group(3);
+  layer.reset(new CuDNNDeconvolutionLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  EXPECT_EQ(this->blob_top_->num(), 2);
+  EXPECT_EQ(this->blob_top_->channels(), 3);
+  EXPECT_EQ(this->blob_top_->height(), 13);
+  EXPECT_EQ(this->blob_top_->width(), 9);
+  EXPECT_EQ(this->blob_top_2_->num(), 2);
+  EXPECT_EQ(this->blob_top_2_->channels(), 3);
+  EXPECT_EQ(this->blob_top_2_->height(), 13);
+  EXPECT_EQ(this->blob_top_2_->width(), 9);
+}
+
+TYPED_TEST(CuDNNDeconvolutionLayerTest, TestSimpleCuDNNDeconvolution) {
+  typedef typename TypeParam::Dtype Dtype;
+  this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
+  this->blob_top_vec_.push_back(this->blob_top_2_);
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  convolution_param->add_kernel_size(3);
+  convolution_param->add_stride(2);
+  convolution_param->set_num_output(4);
+  convolution_param->mutable_weight_filler()->set_type("constant");
+  convolution_param->mutable_weight_filler()->set_value(1);
+  convolution_param->mutable_bias_filler()->set_type("constant");
+  convolution_param->mutable_bias_filler()->set_value(0.1);
+  shared_ptr<Layer<Dtype> > layer(
+      new CuDNNDeconvolutionLayer<Dtype>(layer_param));
+  layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  // constant-fill the bottom blobs
+  FillerParameter filler_param;
+  filler_param.set_value(1.);
+  ConstantFiller<Dtype> filler(filler_param);
+  filler.Fill(this->blob_bottom_);
+  filler.Fill(this->blob_bottom_2_);
+  layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // simply check that accumulation works with overlapping filters
+  const Dtype* top_data = this->blob_top_->cpu_data();
+  for (int n = 0; n < this->blob_top_->num(); ++n) {
+    for (int c = 0; c < this->blob_top_->channels(); ++c) {
+      for (int h = 0; h < this->blob_top_->height(); ++h) {
+        for (int w = 0; w < this->blob_top_->width(); ++w) {
+          Dtype expected = 3.1;
+          bool h_overlap = h % 2 == 0 && h > 0
+            && h < this->blob_top_->height() - 1;
+          bool w_overlap = w % 2 == 0 && w > 0
+            && w < this->blob_top_->width() - 1;
+          if (h_overlap && w_overlap) {
+            expected += 9;
+          } else if (h_overlap || w_overlap) {
+            expected += 3;
+          }
+          EXPECT_NEAR(top_data[this->blob_top_->offset(n, c, h, w)],
+              expected, 1e-4);
+        }
+      }
+    }
+  }
+}
+
+TYPED_TEST(CuDNNDeconvolutionLayerTest, TestGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
+  this->blob_top_vec_.push_back(this->blob_top_2_);
+  convolution_param->add_kernel_size(2);
+  convolution_param->add_stride(1);
+  convolution_param->set_num_output(1);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  convolution_param->mutable_bias_filler()->set_type("gaussian");
+  CuDNNDeconvolutionLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3);
+  checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+TYPED_TEST(CuDNNDeconvolutionLayerTest, TestNDAgainst2D) {
+  typedef typename TypeParam::Dtype Dtype;
+  const int kernel_h = 11;
+  const int kernel_w = 13;
+  vector<int> bottom_shape(4);
+  bottom_shape[0] = 15;
+  bottom_shape[1] = 12;
+  bottom_shape[2] = kernel_h * 2;
+  bottom_shape[3] = kernel_w * 2;
+  FillerParameter filler_param;
+  GaussianFiller<Dtype> filler(filler_param);
+  for (int i = 0; i < this->blob_bottom_vec_.size(); ++i) {
+    this->blob_bottom_vec_[i]->Reshape(bottom_shape);
+    filler.Fill(this->blob_bottom_vec_[i]);
+  }
+  LayerParameter layer_param;
+  ConvolutionParameter* convolution_param =
+      layer_param.mutable_convolution_param();
+  convolution_param->set_num_output(18);
+  convolution_param->set_bias_term(false);
+  convolution_param->set_group(6);
+  convolution_param->set_kernel_h(kernel_h);
+  convolution_param->set_kernel_w(kernel_w);
+  convolution_param->mutable_weight_filler()->set_type("gaussian");
+  Blob<Dtype> weights;
+  Blob<Dtype> top_diff;
+  // Shape and fill weights and top_diff.
+  bool copy_diff;
+  bool reshape;
+  {
+    CuDNNDeconvolutionLayer<Dtype> layer(layer_param);
+    layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+    top_diff.ReshapeLike(*this->blob_top_);
+    filler.Fill(&top_diff);
+    ASSERT_EQ(1, layer.blobs().size());
+    copy_diff = false; reshape = true;
+    weights.CopyFrom(*layer.blobs()[0], copy_diff, reshape);
+  }
+  vector<bool> propagate_down(1, true);
+  Blob<Dtype> result_2d;
+  Blob<Dtype> backward_result_2d;
+  Blob<Dtype> backward_weight_result_2d;
+  // Test with 2D im2col
+  {
+    caffe_set(this->blob_top_->count(), Dtype(0),
+              this->blob_top_->mutable_cpu_data());
+    caffe_set(this->blob_bottom_->count(), Dtype(0),
+              this->blob_bottom_->mutable_cpu_diff());
+    caffe_set(weights.count(), Dtype(0), weights.mutable_cpu_diff());
+    // Do SetUp and Forward; save Forward result in result_2d.
+    convolution_param->set_force_nd_im2col(false);
+    CuDNNDeconvolutionLayer<Dtype> layer_2d(layer_param);
+    layer_2d.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+    ASSERT_EQ(1, layer_2d.blobs().size());
+    copy_diff = false; reshape = false;
+    layer_2d.blobs()[0]->CopyFrom(weights, copy_diff, reshape);
+    layer_2d.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+    copy_diff = false; reshape = true;
+    result_2d.CopyFrom(*this->blob_top_, copy_diff, reshape);
+    // Copy pre-generated top diff into actual top diff;
+    // do Backward and save result in backward_result_2d.
+    ASSERT_EQ(this->blob_top_->shape(), top_diff.shape());
+    caffe_copy(top_diff.count(), top_diff.cpu_data(),
+               this->blob_top_->mutable_cpu_diff());
+    layer_2d.Backward(this->blob_top_vec_, propagate_down,
+                      this->blob_bottom_vec_);
+    copy_diff = true; reshape = true;
+    backward_result_2d.CopyFrom(*this->blob_bottom_, copy_diff, reshape);
+    backward_weight_result_2d.CopyFrom(weights, copy_diff, reshape);
+  }
+  Blob<Dtype> result_nd;
+  Blob<Dtype> backward_result_nd;
+  Blob<Dtype> backward_weight_result_nd;
+  // Test with ND im2col
+  {
+    caffe_set(this->blob_top_->count(), Dtype(0),
+              this->blob_top_->mutable_cpu_data());
+    caffe_set(this->blob_bottom_->count(), Dtype(0),
+              this->blob_bottom_->mutable_cpu_diff());
+    caffe_set(weights.count(), Dtype(0), weights.mutable_cpu_diff());
+    // Do SetUp and Forward; save Forward result in result_nd.
+    convolution_param->set_force_nd_im2col(true);
+    CuDNNDeconvolutionLayer<Dtype> layer_nd(layer_param);
+    layer_nd.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+    ASSERT_EQ(1, layer_nd.blobs().size());
+    copy_diff = false; reshape = false;
+    layer_nd.blobs()[0]->CopyFrom(weights, copy_diff, reshape);
+    layer_nd.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+    copy_diff = false; reshape = true;
+    result_nd.CopyFrom(*this->blob_top_, copy_diff, reshape);
+    // Copy pre-generated top diff into actual top diff;
+    // do Backward and save result in backward_result_nd.
+    ASSERT_EQ(this->blob_top_->shape(), top_diff.shape());
+    caffe_copy(top_diff.count(), top_diff.cpu_data(),
+               this->blob_top_->mutable_cpu_diff());
+    layer_nd.Backward(this->blob_top_vec_, propagate_down,
+                      this->blob_bottom_vec_);
+    copy_diff = true; reshape = true;
+    backward_result_nd.CopyFrom(*this->blob_bottom_, copy_diff, reshape);
+    backward_weight_result_nd.CopyFrom(weights, copy_diff, reshape);
+  }
+  ASSERT_EQ(result_nd.count(), result_2d.count());
+  for (int i = 0; i < result_2d.count(); ++i)  {
+    EXPECT_NEAR(result_2d.cpu_data()[i], result_nd.cpu_data()[i], 1e-4);
+  }
+  ASSERT_EQ(backward_result_nd.count(), backward_result_2d.count());
+  for (int i = 0; i < backward_result_2d.count(); ++i) {
+    EXPECT_EQ(backward_result_2d.cpu_diff()[i],
+              backward_result_nd.cpu_diff()[i]);
+  }
+  ASSERT_EQ(backward_weight_result_nd.count(),
+            backward_weight_result_2d.count());
+  for (int i = 0; i < backward_weight_result_2d.count(); ++i) {
+    EXPECT_EQ(backward_weight_result_2d.cpu_diff()[i],
+              backward_weight_result_nd.cpu_diff()[i]);
+  }
+}
+
+#endif
+
 }  // namespace caffe
diff --git a/src/caffe/test/test_filler.cpp b/src/caffe/test/test_filler.cpp
index 26e9b217e35..34f7007d9c7 100644
--- a/src/caffe/test/test_filler.cpp
+++ b/src/caffe/test/test_filler.cpp
@@ -1,3 +1,5 @@
+#include <vector>
+
 #include "gtest/gtest.h"
 
 #include "caffe/filler.hpp"
@@ -10,11 +12,20 @@ template <typename Dtype>
 class ConstantFillerTest : public ::testing::Test {
  protected:
   ConstantFillerTest()
-      : blob_(new Blob<Dtype>(2, 3, 4, 5)),
+      : blob_(new Blob<Dtype>()),
         filler_param_() {
     filler_param_.set_value(10.);
     filler_.reset(new ConstantFiller<Dtype>(filler_param_));
+  }
+  virtual void test_params(const vector<int>& shape) {
+    EXPECT_TRUE(blob_);
+    blob_->Reshape(shape);
     filler_->Fill(blob_);
+    const int count = blob_->count();
+    const Dtype* data = blob_->cpu_data();
+    for (int i = 0; i < count; ++i) {
+      EXPECT_EQ(data[i], filler_param_.value());
+    }
   }
   virtual ~ConstantFillerTest() { delete blob_; }
   Blob<Dtype>* const blob_;
@@ -25,12 +36,34 @@ class ConstantFillerTest : public ::testing::Test {
 TYPED_TEST_CASE(ConstantFillerTest, TestDtypes);
 
 TYPED_TEST(ConstantFillerTest, TestFill) {
-  EXPECT_TRUE(this->blob_);
-  const int count = this->blob_->count();
-  const TypeParam* data = this->blob_->cpu_data();
-  for (int i = 0; i < count; ++i) {
-    EXPECT_GE(data[i], this->filler_param_.value());
-  }
+  vector<int> blob_shape;
+  blob_shape.push_back(2);
+  blob_shape.push_back(3);
+  blob_shape.push_back(4);
+  blob_shape.push_back(5);
+  this->test_params(blob_shape);
+}
+
+TYPED_TEST(ConstantFillerTest, TestFill1D) {
+  vector<int> blob_shape(1, 15);
+  this->test_params(blob_shape);
+}
+
+TYPED_TEST(ConstantFillerTest, TestFill2D) {
+  vector<int> blob_shape;
+  blob_shape.push_back(8);
+  blob_shape.push_back(3);
+  this->test_params(blob_shape);
+}
+
+TYPED_TEST(ConstantFillerTest, TestFill5D) {
+  vector<int> blob_shape;
+  blob_shape.push_back(2);
+  blob_shape.push_back(3);
+  blob_shape.push_back(4);
+  blob_shape.push_back(5);
+  blob_shape.push_back(2);
+  this->test_params(blob_shape);
 }
 
 
@@ -38,12 +71,22 @@ template <typename Dtype>
 class UniformFillerTest : public ::testing::Test {
  protected:
   UniformFillerTest()
-      : blob_(new Blob<Dtype>(2, 3, 4, 5)),
+      : blob_(new Blob<Dtype>()),
         filler_param_() {
     filler_param_.set_min(1.);
     filler_param_.set_max(2.);
     filler_.reset(new UniformFiller<Dtype>(filler_param_));
+  }
+  virtual void test_params(const vector<int>& shape) {
+    EXPECT_TRUE(blob_);
+    blob_->Reshape(shape);
     filler_->Fill(blob_);
+    const int count = blob_->count();
+    const Dtype* data = blob_->cpu_data();
+    for (int i = 0; i < count; ++i) {
+      EXPECT_GE(data[i], filler_param_.min());
+      EXPECT_LE(data[i], filler_param_.max());
+    }
   }
   virtual ~UniformFillerTest() { delete blob_; }
   Blob<Dtype>* const blob_;
@@ -54,23 +97,64 @@ class UniformFillerTest : public ::testing::Test {
 TYPED_TEST_CASE(UniformFillerTest, TestDtypes);
 
 TYPED_TEST(UniformFillerTest, TestFill) {
-  EXPECT_TRUE(this->blob_);
-  const int count = this->blob_->count();
-  const TypeParam* data = this->blob_->cpu_data();
-  for (int i = 0; i < count; ++i) {
-    EXPECT_GE(data[i], this->filler_param_.min());
-    EXPECT_LE(data[i], this->filler_param_.max());
-  }
+  vector<int> blob_shape;
+  blob_shape.push_back(2);
+  blob_shape.push_back(3);
+  blob_shape.push_back(4);
+  blob_shape.push_back(5);
+  this->test_params(blob_shape);
+}
+
+TYPED_TEST(UniformFillerTest, TestFill1D) {
+  vector<int> blob_shape(1, 15);
+  this->test_params(blob_shape);
+}
+
+TYPED_TEST(UniformFillerTest, TestFill2D) {
+  vector<int> blob_shape;
+  blob_shape.push_back(8);
+  blob_shape.push_back(3);
+  this->test_params(blob_shape);
+}
+
+TYPED_TEST(UniformFillerTest, TestFill5D) {
+  vector<int> blob_shape;
+  blob_shape.push_back(2);
+  blob_shape.push_back(3);
+  blob_shape.push_back(4);
+  blob_shape.push_back(5);
+  blob_shape.push_back(2);
+  this->test_params(blob_shape);
 }
 
 template <typename Dtype>
 class PositiveUnitballFillerTest : public ::testing::Test {
  protected:
   PositiveUnitballFillerTest()
-      : blob_(new Blob<Dtype>(2, 3, 4, 5)),
+      : blob_(new Blob<Dtype>()),
         filler_param_() {
     filler_.reset(new PositiveUnitballFiller<Dtype>(filler_param_));
+  }
+  virtual void test_params(const vector<int>& shape) {
+    EXPECT_TRUE(blob_);
+    blob_->Reshape(shape);
     filler_->Fill(blob_);
+    const int num = blob_->shape(0);
+    const int count = blob_->count();
+    const int dim = count / num;
+    const Dtype* data = blob_->cpu_data();
+    for (int i = 0; i < count; ++i) {
+      EXPECT_GE(data[i], 0);
+      EXPECT_LE(data[i], 1);
+    }
+    for (int i = 0; i < num; ++i) {
+      Dtype sum = Dtype(0);
+      for (int j = 0; j < dim; ++j) {
+        sum += data[i * dim + j];
+      }
+      EXPECT_GE(sum, 0.999);
+      EXPECT_LE(sum, 1.001);
+    }
   }
   virtual ~PositiveUnitballFillerTest() { delete blob_; }
   Blob<Dtype>* const blob_;
@@ -81,35 +165,78 @@ class PositiveUnitballFillerTest : public ::testing::Test {
 TYPED_TEST_CASE(PositiveUnitballFillerTest, TestDtypes);
 
 TYPED_TEST(PositiveUnitballFillerTest, TestFill) {
-  EXPECT_TRUE(this->blob_);
-  const int num = this->blob_->num();
-  const int count = this->blob_->count();
-  const int dim = count / num;
-  const TypeParam* data = this->blob_->cpu_data();
-  for (int i = 0; i < count; ++i) {
-    EXPECT_GE(data[i], 0);
-    EXPECT_LE(data[i], 1);
-  }
-  for (int i = 0; i < num; ++i) {
-    TypeParam sum = 0;
-    for (int j = 0; j < dim; ++j) {
-      sum += data[i * dim + j];
-    }
-    EXPECT_GE(sum, 0.999);
-    EXPECT_LE(sum, 1.001);
-  }
+  vector<int> blob_shape;
+  blob_shape.push_back(2);
+  blob_shape.push_back(3);
+  blob_shape.push_back(4);
+  blob_shape.push_back(5);
+  this->test_params(blob_shape);
+}
+
+TYPED_TEST(PositiveUnitballFillerTest, TestFill1D) {
+  vector<int> blob_shape(1, 15);
+  this->test_params(blob_shape);
+}
+
+TYPED_TEST(PositiveUnitballFillerTest, TestFill2D) {
+  vector<int> blob_shape;
+  blob_shape.push_back(8);
+  blob_shape.push_back(3);
+  this->test_params(blob_shape);
+}
+
+TYPED_TEST(PositiveUnitballFillerTest, TestFill5D) {
+  vector<int> blob_shape;
+  blob_shape.push_back(2);
+  blob_shape.push_back(3);
+  blob_shape.push_back(4);
+  blob_shape.push_back(5);
+  blob_shape.push_back(2);
+  this->test_params(blob_shape);
 }
 
 template <typename Dtype>
 class GaussianFillerTest : public ::testing::Test {
  protected:
   GaussianFillerTest()
-      : blob_(new Blob<Dtype>(2, 3, 4, 5)),
+      : blob_(new Blob<Dtype>()),
         filler_param_() {
     filler_param_.set_mean(10.);
     filler_param_.set_std(0.1);
     filler_.reset(new GaussianFiller<Dtype>(filler_param_));
+  }
+  virtual void test_params(const vector<int>& shape,
+      const Dtype tolerance = Dtype(5), const int repetitions = 100) {
+    // Tests for statistical properties should be ran multiple times.
+    EXPECT_TRUE(blob_);
+    blob_->Reshape(shape);
+    for (int i = 0; i < repetitions; ++i) {
+      test_params_iter(shape, tolerance);
+    }
+  }
+  virtual void test_params_iter(const vector<int>& shape,
+      const Dtype tolerance) {
+    // This test has a configurable tolerance parameter - by default it was
+    // equal to 5.0 which is very loose - allowing some tuning (e.g. for tests
+    // on smaller blobs the actual variance will be larger than desired, so the
+    // tolerance can be increased to account for that).
     filler_->Fill(blob_);
+    const int count = blob_->count();
+    const Dtype* data = blob_->cpu_data();
+    Dtype mean = Dtype(0);
+    Dtype var = Dtype(0);
+    for (int i = 0; i < count; ++i) {
+      mean += data[i];
+      var += data[i] * data[i];
+    }
+    mean /= count;
+    var /= count;
+    var -= mean*mean;
+    EXPECT_GE(mean, filler_param_.mean() - filler_param_.std() * tolerance);
+    EXPECT_LE(mean, filler_param_.mean() + filler_param_.std() * tolerance);
+    Dtype target_var = filler_param_.std() * filler_param_.std();
+    EXPECT_GE(var, target_var / tolerance);
+    EXPECT_LE(var, target_var * tolerance);
   }
   virtual ~GaussianFillerTest() { delete blob_; }
   Blob<Dtype>* const blob_;
@@ -120,41 +247,62 @@ class GaussianFillerTest : public ::testing::Test {
 TYPED_TEST_CASE(GaussianFillerTest, TestDtypes);
 
 TYPED_TEST(GaussianFillerTest, TestFill) {
-  EXPECT_TRUE(this->blob_);
-  const int count = this->blob_->count();
-  const TypeParam* data = this->blob_->cpu_data();
-  TypeParam mean = 0.;
-  TypeParam var = 0.;
-  for (int i = 0; i < count; ++i) {
-    mean += data[i];
-    var += (data[i] - this->filler_param_.mean()) *
-        (data[i] - this->filler_param_.mean());
-  }
-  mean /= count;
-  var /= count;
-  // Very loose test.
-  EXPECT_GE(mean, this->filler_param_.mean() - this->filler_param_.std() * 5);
-  EXPECT_LE(mean, this->filler_param_.mean() + this->filler_param_.std() * 5);
-  TypeParam target_var = this->filler_param_.std() * this->filler_param_.std();
-  EXPECT_GE(var, target_var / 5.);
-  EXPECT_LE(var, target_var * 5.);
+  vector<int> blob_shape;
+  blob_shape.push_back(2);
+  blob_shape.push_back(3);
+  blob_shape.push_back(4);
+  blob_shape.push_back(5);
+  const TypeParam tolerance = TypeParam(3);  // enough for a 120-element blob
+  this->test_params(blob_shape, tolerance);
+}
+
+TYPED_TEST(GaussianFillerTest, TestFill1D) {
+  vector<int> blob_shape(1, 125);
+  const TypeParam tolerance = TypeParam(3);
+  this->test_params(blob_shape, tolerance);
+}
+
+TYPED_TEST(GaussianFillerTest, TestFill2D) {
+  vector<int> blob_shape;
+  blob_shape.push_back(8);
+  blob_shape.push_back(15);
+  const TypeParam tolerance = TypeParam(3);
+  this->test_params(blob_shape, tolerance);
+}
+
+TYPED_TEST(GaussianFillerTest, TestFill5D) {
+  vector<int> blob_shape;
+  blob_shape.push_back(2);
+  blob_shape.push_back(3);
+  blob_shape.push_back(4);
+  blob_shape.push_back(5);
+  blob_shape.push_back(2);
+  const TypeParam tolerance = TypeParam(2);
+  this->test_params(blob_shape, tolerance);
 }
 
 template <typename Dtype>
 class XavierFillerTest : public ::testing::Test {
  protected:
   XavierFillerTest()
-      : blob_(new Blob<Dtype>(1000, 2, 4, 5)),
+      : blob_(new Blob<Dtype>()),
         filler_param_() {
   }
   virtual void test_params(FillerParameter_VarianceNorm variance_norm,
+      Dtype n, const vector<int>& shape, const int repetitions = 100) {
+    EXPECT_TRUE(blob_);
+    blob_->Reshape(shape);
+    for (int i = 0; i < repetitions; ++i) {
+      test_params_iter(variance_norm, n);
+    }
+  }
+  virtual void test_params_iter(FillerParameter_VarianceNorm variance_norm,
       Dtype n) {
-    this->filler_param_.set_variance_norm(variance_norm);
-    this->filler_.reset(new XavierFiller<Dtype>(this->filler_param_));
-    this->filler_->Fill(blob_);
-    EXPECT_TRUE(this->blob_);
-    const int count = this->blob_->count();
-    const Dtype* data = this->blob_->cpu_data();
+    filler_param_.set_variance_norm(variance_norm);
+    filler_.reset(new XavierFiller<Dtype>(filler_param_));
+    filler_->Fill(blob_);
+    const int count = blob_->count();
+    const Dtype* data = blob_->cpu_data();
     Dtype mean = 0.;
     Dtype ex2 = 0.;
     for (int i = 0; i < count; ++i) {
@@ -177,33 +325,92 @@ class XavierFillerTest : public ::testing::Test {
 TYPED_TEST_CASE(XavierFillerTest, TestDtypes);
 
 TYPED_TEST(XavierFillerTest, TestFillFanIn) {
+  vector<int> blob_shape;
+  blob_shape.push_back(1000);
+  blob_shape.push_back(2);
+  blob_shape.push_back(4);
+  blob_shape.push_back(5);
   TypeParam n = 2*4*5;
-  this->test_params(FillerParameter_VarianceNorm_FAN_IN, n);
+  this->test_params(FillerParameter_VarianceNorm_FAN_IN, n, blob_shape);
 }
+
 TYPED_TEST(XavierFillerTest, TestFillFanOut) {
+  vector<int> blob_shape;
+  blob_shape.push_back(1000);
+  blob_shape.push_back(2);
+  blob_shape.push_back(4);
+  blob_shape.push_back(5);
   TypeParam n = 1000*4*5;
-  this->test_params(FillerParameter_VarianceNorm_FAN_OUT, n);
+  this->test_params(FillerParameter_VarianceNorm_FAN_OUT, n, blob_shape);
 }
+
 TYPED_TEST(XavierFillerTest, TestFillAverage) {
+  vector<int> blob_shape;
+  blob_shape.push_back(1000);
+  blob_shape.push_back(2);
+  blob_shape.push_back(4);
+  blob_shape.push_back(5);
   TypeParam n = (2*4*5 + 1000*4*5) / 2.0;
-  this->test_params(FillerParameter_VarianceNorm_AVERAGE, n);
+  this->test_params(FillerParameter_VarianceNorm_AVERAGE, n, blob_shape);
+}
+
+TYPED_TEST(XavierFillerTest, TestFill1D) {
+  // This makes little sense but at least we will know that we can fill it
+  EXPECT_TRUE(this->blob_);
+  vector<int> blob_shape(1, 25);
+  this->blob_->Reshape(blob_shape);
+  this->filler_param_.set_variance_norm(FillerParameter_VarianceNorm_AVERAGE);
+  this->filler_.reset(new XavierFiller<TypeParam>(this->filler_param_));
+  this->filler_->Fill(this->blob_);
+}
+
+TYPED_TEST(XavierFillerTest, TestFill2D) {
+  EXPECT_TRUE(this->blob_);
+  vector<int> blob_shape;
+  blob_shape.push_back(8);
+  blob_shape.push_back(3);
+  this->blob_->Reshape(blob_shape);
+  this->filler_param_.set_variance_norm(FillerParameter_VarianceNorm_AVERAGE);
+  this->filler_.reset(new XavierFiller<TypeParam>(this->filler_param_));
+  this->filler_->Fill(this->blob_);
+}
+
+TYPED_TEST(XavierFillerTest, TestFill5D) {
+  EXPECT_TRUE(this->blob_);
+  vector<int> blob_shape;
+  blob_shape.push_back(2);
+  blob_shape.push_back(3);
+  blob_shape.push_back(4);
+  blob_shape.push_back(5);
+  blob_shape.push_back(2);
+  this->blob_->Reshape(blob_shape);
+  this->filler_param_.set_variance_norm(FillerParameter_VarianceNorm_AVERAGE);
+  this->filler_.reset(new XavierFiller<TypeParam>(this->filler_param_));
+  this->filler_->Fill(this->blob_);
 }
 
 template <typename Dtype>
 class MSRAFillerTest : public ::testing::Test {
  protected:
   MSRAFillerTest()
-      : blob_(new Blob<Dtype>(1000, 2, 4, 5)),
+      : blob_(new Blob<Dtype>()),
         filler_param_() {
   }
   virtual void test_params(FillerParameter_VarianceNorm variance_norm,
+      Dtype n, const vector<int>& shape, const int repetitions = 100) {
+    EXPECT_TRUE(blob_);
+    blob_->Reshape(shape);
+    for (int i = 0; i < repetitions; ++i) {
+      test_params_iter(variance_norm, n);
+    }
+  }
+  virtual void test_params_iter(FillerParameter_VarianceNorm variance_norm,
       Dtype n) {
-    this->filler_param_.set_variance_norm(variance_norm);
-    this->filler_.reset(new MSRAFiller<Dtype>(this->filler_param_));
-    this->filler_->Fill(blob_);
-    EXPECT_TRUE(this->blob_);
-    const int count = this->blob_->count();
-    const Dtype* data = this->blob_->cpu_data();
+    filler_param_.set_variance_norm(variance_norm);
+    filler_.reset(new MSRAFiller<Dtype>(filler_param_));
+    filler_->Fill(blob_);
+    const int count = blob_->count();
+    const Dtype* data = blob_->cpu_data();
     Dtype mean = 0.;
     Dtype ex2 = 0.;
     for (int i = 0; i < count; ++i) {
@@ -226,16 +433,123 @@ class MSRAFillerTest : public ::testing::Test {
 TYPED_TEST_CASE(MSRAFillerTest, TestDtypes);
 
 TYPED_TEST(MSRAFillerTest, TestFillFanIn) {
+  vector<int> blob_shape;
+  blob_shape.push_back(1000);
+  blob_shape.push_back(2);
+  blob_shape.push_back(4);
+  blob_shape.push_back(5);
   TypeParam n = 2*4*5;
-  this->test_params(FillerParameter_VarianceNorm_FAN_IN, n);
+  this->test_params(FillerParameter_VarianceNorm_FAN_IN, n, blob_shape);
 }
+
 TYPED_TEST(MSRAFillerTest, TestFillFanOut) {
+  vector<int> blob_shape;
+  blob_shape.push_back(1000);
+  blob_shape.push_back(2);
+  blob_shape.push_back(4);
+  blob_shape.push_back(5);
   TypeParam n = 1000*4*5;
-  this->test_params(FillerParameter_VarianceNorm_FAN_OUT, n);
+  this->test_params(FillerParameter_VarianceNorm_FAN_OUT, n, blob_shape);
 }
+
 TYPED_TEST(MSRAFillerTest, TestFillAverage) {
+  vector<int> blob_shape;
+  blob_shape.push_back(1000);
+  blob_shape.push_back(2);
+  blob_shape.push_back(4);
+  blob_shape.push_back(5);
   TypeParam n = (2*4*5 + 1000*4*5) / 2.0;
-  this->test_params(FillerParameter_VarianceNorm_AVERAGE, n);
+  this->test_params(FillerParameter_VarianceNorm_AVERAGE, n, blob_shape);
+}
+
+TYPED_TEST(MSRAFillerTest, TestFill1D) {
+  // Like with Xavier - no checking for correctness, just if it can be filled.
+  EXPECT_TRUE(this->blob_);
+  vector<int> blob_shape(1, 25);
+  this->blob_->Reshape(blob_shape);
+  this->filler_param_.set_variance_norm(FillerParameter_VarianceNorm_AVERAGE);
+  this->filler_.reset(new MSRAFiller<TypeParam>(this->filler_param_));
+  this->filler_->Fill(this->blob_);
+}
+
+TYPED_TEST(MSRAFillerTest, TestFill2D) {
+  EXPECT_TRUE(this->blob_);
+  vector<int> blob_shape;
+  blob_shape.push_back(8);
+  blob_shape.push_back(3);
+  this->blob_->Reshape(blob_shape);
+  this->filler_param_.set_variance_norm(FillerParameter_VarianceNorm_AVERAGE);
+  this->filler_.reset(new MSRAFiller<TypeParam>(this->filler_param_));
+  this->filler_->Fill(this->blob_);
+}
+
+TYPED_TEST(MSRAFillerTest, TestFill5D) {
+  EXPECT_TRUE(this->blob_);
+  vector<int> blob_shape;
+  blob_shape.push_back(2);
+  blob_shape.push_back(3);
+  blob_shape.push_back(4);
+  blob_shape.push_back(5);
+  blob_shape.push_back(2);
+  this->blob_->Reshape(blob_shape);
+  this->filler_param_.set_variance_norm(FillerParameter_VarianceNorm_AVERAGE);
+  this->filler_.reset(new MSRAFiller<TypeParam>(this->filler_param_));
+  this->filler_->Fill(this->blob_);
+}
+
+template <typename Dtype>
+class BilinearFillerTest : public ::testing::Test {
+ protected:
+  BilinearFillerTest()
+    : blob_(new Blob<Dtype>()),
+      filler_param_() {
+  }
+  virtual void test_params(const vector<int>& shape) {
+    EXPECT_TRUE(blob_);
+    blob_->Reshape(shape);
+    filler_.reset(new BilinearFiller<Dtype>(filler_param_));
+    filler_->Fill(blob_);
+    CHECK_EQ(blob_->num_axes(), 4);
+    const int outer_num = blob_->count(0, 2);
+    const int inner_num = blob_->count(2, 4);
+    const Dtype* data = blob_->cpu_data();
+    int f = ceil(blob_->shape(3) / 2.);
+    Dtype c = (blob_->shape(3) - 1) / (2. * f);
+    for (int i = 0; i < outer_num; ++i) {
+      for (int j = 0; j < inner_num; ++j) {
+        Dtype x = j % blob_->shape(3);
+        Dtype y = (j / blob_->shape(3)) % blob_->shape(2);
+        Dtype expected_value = (1 - fabs(x / f - c)) * (1 - fabs(y / f - c));
+        const Dtype actual_value = data[i * inner_num + j];
+        EXPECT_NEAR(expected_value, actual_value, 0.01);
+      }
+    }
+  }
+  virtual ~BilinearFillerTest() { delete blob_; }
+  Blob<Dtype>* blob_;
+  FillerParameter filler_param_;
+  shared_ptr<BilinearFiller<Dtype> > filler_;
+};
+
+TYPED_TEST_CASE(BilinearFillerTest, TestDtypes);
+
+TYPED_TEST(BilinearFillerTest, TestFillOdd) {
+  const int n = 7;
+  vector<int> blob_shape;
+  blob_shape.push_back(1000);
+  blob_shape.push_back(2);
+  blob_shape.push_back(n);
+  blob_shape.push_back(n);
+  this->test_params(blob_shape);
+}
+TYPED_TEST(BilinearFillerTest, TestFillEven) {
+  const int n = 6;
+  vector<int> blob_shape;
+  blob_shape.push_back(1000);
+  blob_shape.push_back(2);
+  blob_shape.push_back(n);
+  blob_shape.push_back(n);
+  this->test_params(blob_shape);
 }
 
 }  // namespace caffe
diff --git a/src/caffe/test/test_gradient_based_solver.cpp b/src/caffe/test/test_gradient_based_solver.cpp
index 6ad0d8f6544..f4395f5311c 100644
--- a/src/caffe/test/test_gradient_based_solver.cpp
+++ b/src/caffe/test/test_gradient_based_solver.cpp
@@ -28,7 +28,7 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
       seed_(1701), num_(4), channels_(3), height_(10), width_(10),
       share_(false) {
         input_file_ = new string(
-        CMAKE_SOURCE_DIR "caffe/test/test_data/solver_data_list.txt" CMAKE_EXT);
+        ABS_TEST_DATA_DIR "/solver_data_list.txt");
       }
   ~GradientBasedSolverTest() {
     delete input_file_;
@@ -558,9 +558,11 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
     const vector<Blob<Dtype>*>& params = solver_->net()->learnable_params();
     for (int i = 0; i < params.size(); ++i) {
       for (int j = 0; j < params[i]->count(); ++j) {
-        EXPECT_EQ(param_copies[i]->cpu_data()[j], params[i]->cpu_data()[j])
+        EXPECT_FLOAT_EQ(param_copies[i]->cpu_data()[j],
+            params[i]->cpu_data()[j])
             << "param " << i << " data differed at dim " << j;
-        EXPECT_EQ(param_copies[i]->cpu_diff()[j], params[i]->cpu_diff()[j])
+        EXPECT_FLOAT_EQ(param_copies[i]->cpu_diff()[j],
+            params[i]->cpu_diff()[j])
             << "param " << i << " diff differed at dim " << j;
       }
     }
@@ -569,9 +571,11 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
     const vector<shared_ptr<Blob<Dtype> > >& history = solver_->history();
     for (int i = 0; i < history.size(); ++i) {
       for (int j = 0; j < history[i]->count(); ++j) {
-        EXPECT_EQ(history_copies[i]->cpu_data()[j], history[i]->cpu_data()[j])
+        EXPECT_FLOAT_EQ(history_copies[i]->cpu_data()[j],
+            history[i]->cpu_data()[j])
             << "history blob " << i << " data differed at dim " << j;
-        EXPECT_EQ(history_copies[i]->cpu_diff()[j], history[i]->cpu_diff()[j])
+        EXPECT_FLOAT_EQ(history_copies[i]->cpu_diff()[j],
+            history[i]->cpu_diff()[j])
             << "history blob " << i << " diff differed at dim " << j;
       }
     }
diff --git a/src/caffe/test/test_hdf5_output_layer.cpp b/src/caffe/test/test_hdf5_output_layer.cpp
index 2bc2de1e647..11d52310cad 100644
--- a/src/caffe/test/test_hdf5_output_layer.cpp
+++ b/src/caffe/test/test_hdf5_output_layer.cpp
@@ -1,3 +1,4 @@
+#ifdef USE_HDF5
 #include <string>
 #include <vector>
 
@@ -20,8 +21,7 @@ class HDF5OutputLayerTest : public MultiDeviceTest<TypeParam> {
 
  protected:
   HDF5OutputLayerTest()
-      : input_file_name_(
-        CMAKE_SOURCE_DIR "caffe/test/test_data/sample_data.h5"),
+      : input_file_name_(ABS_TEST_DATA_DIR "/sample_data.h5"),
         blob_data_(new Blob<Dtype>()),
         blob_label_(new Blob<Dtype>()),
         num_(5),
@@ -121,3 +121,4 @@ TYPED_TEST(HDF5OutputLayerTest, TestForward) {
 }
 
 }  // namespace caffe
+#endif  // USE_HDF5
diff --git a/src/caffe/test/test_hdf5data_layer.cpp b/src/caffe/test/test_hdf5data_layer.cpp
index 487f5176caf..0e5c398f966 100644
--- a/src/caffe/test/test_hdf5data_layer.cpp
+++ b/src/caffe/test/test_hdf5data_layer.cpp
@@ -1,3 +1,4 @@
+#ifdef USE_HDF5
 #include <string>
 #include <vector>
 
@@ -30,8 +31,7 @@ class HDF5DataLayerTest : public MultiDeviceTest<TypeParam> {
     blob_top_vec_.push_back(blob_top_label2_);
 
     // Check out generate_sample_data.py in the same directory.
-    filename = new string(
-    CMAKE_SOURCE_DIR "caffe/test/test_data/sample_data_list.txt" CMAKE_EXT);
+    filename = new string(ABS_TEST_DATA_DIR "/sample_data_list.txt");
     LOG(INFO)<< "Using sample HDF5 data file " << filename;
   }
 
@@ -164,3 +164,4 @@ TYPED_TEST(HDF5DataLayerTest, TestSkip) {
 }
 
 }  // namespace caffe
+#endif  // USE_HDF5
diff --git a/src/caffe/test/test_infogain_loss_layer.cpp b/src/caffe/test/test_infogain_loss_layer.cpp
index a24ac683dc5..34f21271a62 100644
--- a/src/caffe/test/test_infogain_loss_layer.cpp
+++ b/src/caffe/test/test_infogain_loss_layer.cpp
@@ -1,3 +1,4 @@
+#include <algorithm>
 #include <vector>
 
 #include "gtest/gtest.h"
@@ -18,17 +19,22 @@ class InfogainLossLayerTest : public MultiDeviceTest<TypeParam> {
 
  protected:
   InfogainLossLayerTest()
-      : blob_bottom_data_(new Blob<Dtype>(10, 5, 1, 1)),
-        blob_bottom_label_(new Blob<Dtype>(10, 1, 1, 1)),
+      : blob_bottom_data_(new Blob<Dtype>(4, 2, 5, 2)),
+        blob_bottom_label_(new Blob<Dtype>(4, 2, 1, 2)),
         blob_bottom_infogain_(new Blob<Dtype>(1, 1, 5, 5)),
-        blob_top_loss_(new Blob<Dtype>()) {
+        blob_top_loss_(new Blob<Dtype>()),
+        blob_top_prob_(new Blob<Dtype>()),
+        inner_(2), outer_(4*2), num_labels_(5) {
     Caffe::set_random_seed(1701);
     FillerParameter filler_param;
-    PositiveUnitballFiller<Dtype> filler(filler_param);
+    filler_param.set_min(-0.5);
+    filler_param.set_max(2.0);
+    UniformFiller<Dtype> filler(filler_param);
     filler.Fill(this->blob_bottom_data_);
     blob_bottom_vec_.push_back(blob_bottom_data_);
     for (int i = 0; i < blob_bottom_label_->count(); ++i) {
-      blob_bottom_label_->mutable_cpu_data()[i] = caffe_rng_rand() % 5;
+      blob_bottom_label_->mutable_cpu_data()[i] =
+        caffe_rng_rand() % num_labels_;
     }
     blob_bottom_vec_.push_back(blob_bottom_label_);
     filler_param.set_min(0.1);
@@ -37,29 +43,94 @@ class InfogainLossLayerTest : public MultiDeviceTest<TypeParam> {
     infogain_filler.Fill(this->blob_bottom_infogain_);
     blob_bottom_vec_.push_back(blob_bottom_infogain_);
     blob_top_vec_.push_back(blob_top_loss_);
+    blob_top_vec_.push_back(blob_top_prob_);
   }
   virtual ~InfogainLossLayerTest() {
     delete blob_bottom_data_;
     delete blob_bottom_label_;
     delete blob_bottom_infogain_;
     delete blob_top_loss_;
+    delete blob_top_prob_;
   }
   Blob<Dtype>* const blob_bottom_data_;
   Blob<Dtype>* const blob_bottom_label_;
   Blob<Dtype>* const blob_bottom_infogain_;
   Blob<Dtype>* const blob_top_loss_;
+  Blob<Dtype>* const blob_top_prob_;
   vector<Blob<Dtype>*> blob_bottom_vec_;
   vector<Blob<Dtype>*> blob_top_vec_;
+  int inner_, outer_, num_labels_;
 };
 
 TYPED_TEST_CASE(InfogainLossLayerTest, TestDtypesAndDevices);
 
+TYPED_TEST(InfogainLossLayerTest, TestInfogainLoss) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  layer_param.mutable_infogain_loss_param()->set_axis(2);
+  layer_param.clear_loss_weight();
+  layer_param.add_loss_weight(1);
+  layer_param.add_loss_weight(0);
+  /*vector<float>* lw = layer_param.mutable_loss_weight();
+  lw->clear();
+  lw->push_back(1);
+  lw->push_back(1);*/
+  InfogainLossLayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Now, check values
+  const Dtype* data = this->blob_bottom_vec_[0]->cpu_data();
+  const Dtype* prob = this->blob_top_vec_[1]->cpu_data();
+  const Dtype* labels = this->blob_bottom_vec_[1]->cpu_data();
+  const Dtype* H = this->blob_bottom_vec_[2]->cpu_data();
+  // first. test the prob top
+  CHECK_EQ(this->blob_bottom_vec_[0]->num_axes(),
+    this->blob_top_vec_[1]->num_axes())
+      << "prob top shape not match bottom data";
+  for (int ai = 0 ; ai < this->blob_bottom_vec_[0]->num_axes(); ai++) {
+    CHECK_EQ(this->blob_bottom_vec_[0]->shape(ai),
+      this->blob_top_vec_[1]->shape(ai))
+        << "prob top shape not match bottom data";
+  }
+  vector<Dtype> est_prob(this->num_labels_, 0);
+  for ( int i = 0 ; i < this->outer_; i++ ) {
+    for ( int j = 0; j < this->inner_; j++ ) {
+      Dtype den = 0;
+      for ( int  l = 0; l < this->num_labels_; l++ ) {
+        est_prob[l] = std::exp(
+          data[i*this->num_labels_*this->inner_ + l*this->inner_ + j]);
+        den += est_prob[l];
+      }
+      for ( int l = 0; l < this->num_labels_; l++ ) {
+        EXPECT_NEAR(prob[i*this->num_labels_*this->inner_ + l*this->inner_ + j],
+          est_prob[l]/den, 1e-6);
+      }
+    }
+  }
+  Dtype loss = 0;  // loss from prob top
+  for ( int i = 0 ; i < this->outer_; i++ ) {
+    for ( int j = 0; j < this->inner_; j++ ) {
+      int gt = static_cast<int>(labels[i*this->inner_+j]);
+      for ( int l = 0; l < this->num_labels_; l++ ) {
+        loss -= H[gt*this->num_labels_ + l] *
+          log(std::max(
+            prob[i*this->num_labels_*this->inner_ + l*this->inner_ + j],
+            Dtype(kLOG_THRESHOLD)));
+      }
+    }
+  }
+  EXPECT_NEAR(this->blob_top_loss_->cpu_data()[0],
+    loss/(this->outer_*this->inner_), 1e-6);
+}
 
 TYPED_TEST(InfogainLossLayerTest, TestGradient) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
+  layer_param.mutable_infogain_loss_param()->set_axis(2);
   InfogainLossLayer<Dtype> layer(layer_param);
-  GradientChecker<Dtype> checker(1e-4, 2e-2, 1701, 1, 0.01);
+  this->blob_top_vec_.clear();  // ignore prob top.
+  this->blob_top_vec_.push_back(this->blob_top_loss_);
+  GradientChecker<Dtype> checker(1e-4, 2e-2, 1701);  // no "kink"
   checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
       this->blob_top_vec_, 0);
 }
diff --git a/src/caffe/test/test_neuron_layer.cpp b/src/caffe/test/test_neuron_layer.cpp
index 342f825cec3..d1ecc37b661 100644
--- a/src/caffe/test/test_neuron_layer.cpp
+++ b/src/caffe/test/test_neuron_layer.cpp
@@ -10,6 +10,7 @@
 
 #include "caffe/layers/absval_layer.hpp"
 #include "caffe/layers/bnll_layer.hpp"
+#include "caffe/layers/clip_layer.hpp"
 #include "caffe/layers/dropout_layer.hpp"
 #include "caffe/layers/elu_layer.hpp"
 #include "caffe/layers/exp_layer.hpp"
@@ -19,6 +20,7 @@
 #include "caffe/layers/prelu_layer.hpp"
 #include "caffe/layers/relu_layer.hpp"
 #include "caffe/layers/sigmoid_layer.hpp"
+#include "caffe/layers/swish_layer.hpp"
 #include "caffe/layers/tanh_layer.hpp"
 #include "caffe/layers/threshold_layer.hpp"
 
@@ -205,6 +207,66 @@ TYPED_TEST(NeuronLayerTest, TestAbsGradient) {
       this->blob_top_vec_);
 }
 
+TYPED_TEST(NeuronLayerTest, TestClip) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  CHECK(google::protobuf::TextFormat::ParseFromString(
+      "clip_param { min: -1, max: 2 }", &layer_param));
+  ClipLayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Now, check values
+  const Dtype* bottom_data = this->blob_bottom_->cpu_data();
+  const Dtype* top_data = this->blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    EXPECT_GE(top_data[i], -1);
+    EXPECT_LE(top_data[i], 2);
+    EXPECT_TRUE(bottom_data[i] > -1 || top_data[i] == -1);
+    EXPECT_TRUE(bottom_data[i] < 2 || top_data[i] == 2);
+    EXPECT_TRUE(!(bottom_data[i] >= -1 && bottom_data[i] <= 2)
+            || top_data[i] == bottom_data[i]);
+  }
+}
+
+TYPED_TEST(NeuronLayerTest, TestClipGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  CHECK(google::protobuf::TextFormat::ParseFromString(
+      "clip_param { min: -1, max: 2 }", &layer_param));
+  ClipLayer<Dtype> layer(layer_param);
+  // Unfortunately, it might happen that an input value lands exactly within
+  // the discontinuity region of the Clip function. In this case the numeric
+  // gradient is likely to differ significantly (i.e. by a value larger than
+  // checker tolerance) from the computed gradient. To handle such cases, we
+  // eliminate such values from the input blob before the gradient check.
+  const Dtype epsilon = 1e-2;
+  const Dtype min_range_start = layer_param.clip_param().min() - epsilon;
+  const Dtype min_range_end   = layer_param.clip_param().min() + epsilon;
+  const Dtype max_range_start = layer_param.clip_param().max() - epsilon;
+  const Dtype max_range_end   = layer_param.clip_param().max() + epsilon;
+  // The input blob is owned by the NeuronLayerTest object, so we begin with
+  // creating a temporary blob and copying the input data there.
+  Blob<Dtype> temp_bottom;
+  temp_bottom.ReshapeLike(*this->blob_bottom_);
+  const Dtype* bottom_data = this->blob_bottom_->cpu_data();
+  Dtype* temp_data_mutable = temp_bottom.mutable_cpu_data();
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    if (bottom_data[i] >= min_range_start &&
+        bottom_data[i] <= min_range_end) {
+      temp_data_mutable[i] = bottom_data[i] - epsilon;
+    } else if (bottom_data[i] >= max_range_start &&
+               bottom_data[i] <= max_range_end) {
+      temp_data_mutable[i] = bottom_data[i] + epsilon;
+    } else {
+      temp_data_mutable[i] = bottom_data[i];
+    }
+  }
+  vector<Blob<Dtype>*> temp_bottom_vec;
+  temp_bottom_vec.push_back(&temp_bottom);
+  GradientChecker<Dtype> checker(epsilon, 1e-3);
+  checker.CheckGradientEltwise(&layer, temp_bottom_vec, this->blob_top_vec_);
+}
+
 TYPED_TEST(NeuronLayerTest, TestReLU) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
@@ -344,6 +406,84 @@ TYPED_TEST(NeuronLayerTest, TestSigmoidGradient) {
       this->blob_top_vec_);
 }
 
+TYPED_TEST(NeuronLayerTest, TestSwish) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  SwishLayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Now, check values
+  const Dtype* bottom_data = this->blob_bottom_->cpu_data();
+  const Dtype* top_data = this->blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    EXPECT_FLOAT_EQ(top_data[i], bottom_data[i] / (1. + exp(-bottom_data[i])));
+  }
+}
+
+TYPED_TEST(NeuronLayerTest, TestSwishWithBeta) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  CHECK(google::protobuf::TextFormat::ParseFromString(
+      "swish_param { beta: 1.5 }", &layer_param));
+  SwishLayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Now, check values
+  const Dtype* bottom_data = this->blob_bottom_->cpu_data();
+  const Dtype* top_data = this->blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    EXPECT_FLOAT_EQ(top_data[i], bottom_data[i] / (1. + exp(-1.5 *
+        bottom_data[i])));
+  }
+}
+
+TYPED_TEST(NeuronLayerTest, TestSwishAsLinear) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  CHECK(google::protobuf::TextFormat::ParseFromString(
+      "swish_param { beta: 0.0 }", &layer_param));
+  SwishLayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Now, check values
+  const Dtype* bottom_data = this->blob_bottom_->cpu_data();
+  const Dtype* top_data = this->blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    EXPECT_FLOAT_EQ(top_data[i], bottom_data[i] / 2.0);
+  }
+}
+
+TYPED_TEST(NeuronLayerTest, TestSwishGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  SwishLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3, 1701, 0., 0.01);
+  checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+TYPED_TEST(NeuronLayerTest, TestSwishWithBetaGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  CHECK(google::protobuf::TextFormat::ParseFromString(
+      "swish_param { beta: 1.5 }", &layer_param));
+  SwishLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3, 1701, 0., 0.01);
+  checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+TYPED_TEST(NeuronLayerTest, TestSwishAsLinearGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  CHECK(google::protobuf::TextFormat::ParseFromString(
+      "swish_param { beta: 0.0 }", &layer_param));
+  SwishLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3, 1701, 0., 0.01);
+  checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
 TYPED_TEST(NeuronLayerTest, TestTanH) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;
@@ -791,16 +931,19 @@ TYPED_TEST(NeuronLayerTest, TestPReLUInPlace) {
   ip2.Backward(blob_middle_vec_2, propagate_down, blob_bottom_vec_2);
   // Check numbers
   for (int s = 0; s < blob_bottom_2->count(); ++s) {
-    EXPECT_EQ(this->blob_bottom_->cpu_diff()[s], blob_bottom_2->cpu_diff()[s]);
+    EXPECT_FLOAT_EQ(this->blob_bottom_->cpu_diff()[s],
+        blob_bottom_2->cpu_diff()[s]);
   }
   for (int s = 0; s < ip.blobs()[0]->count(); ++s) {
-    EXPECT_EQ(ip.blobs()[0]->cpu_diff()[s], ip2.blobs()[0]->cpu_diff()[s]);
+    EXPECT_FLOAT_EQ(ip.blobs()[0]->cpu_diff()[s],
+        ip2.blobs()[0]->cpu_diff()[s]);
   }
   for (int s = 0; s < ip.blobs()[1]->count(); ++s) {
-    EXPECT_EQ(ip.blobs()[1]->cpu_diff()[s], ip2.blobs()[1]->cpu_diff()[s]);
+    EXPECT_FLOAT_EQ(ip.blobs()[1]->cpu_diff()[s],
+        ip2.blobs()[1]->cpu_diff()[s]);
   }
   for (int s = 0; s < prelu.blobs()[0]->count(); ++s) {
-    EXPECT_EQ(prelu.blobs()[0]->cpu_diff()[s],
+    EXPECT_FLOAT_EQ(prelu.blobs()[0]->cpu_diff()[s],
         prelu2.blobs()[0]->cpu_diff()[s]);
   }
 }
diff --git a/src/caffe/test/test_syncedmem.cpp b/src/caffe/test/test_syncedmem.cpp
index 16dfb58230f..2ca9ca2f998 100644
--- a/src/caffe/test/test_syncedmem.cpp
+++ b/src/caffe/test/test_syncedmem.cpp
@@ -80,7 +80,7 @@ TEST_F(SyncedMemoryTest, TestGPURead) {
   char* recovered_value = new char[10];
   caffe_gpu_memcpy(10, gpu_data, recovered_value);
   for (int i = 0; i < mem.size(); ++i) {
-    EXPECT_EQ((static_cast<char*>(recovered_value))[i], 1);
+    EXPECT_EQ(recovered_value[i], 1);
   }
   // do another round
   cpu_data = mem.mutable_cpu_data();
@@ -94,7 +94,7 @@ TEST_F(SyncedMemoryTest, TestGPURead) {
   // check if values are the same
   caffe_gpu_memcpy(10, gpu_data, recovered_value);
   for (int i = 0; i < mem.size(); ++i) {
-    EXPECT_EQ((static_cast<char*>(recovered_value))[i], 2);
+    EXPECT_EQ(recovered_value[i], 2);
   }
   delete[] recovered_value;
 }
diff --git a/src/caffe/test/test_upgrade_proto.cpp b/src/caffe/test/test_upgrade_proto.cpp
index 9dcc2aa55ec..769112ebfe7 100644
--- a/src/caffe/test/test_upgrade_proto.cpp
+++ b/src/caffe/test/test_upgrade_proto.cpp
@@ -2952,6 +2952,8 @@ TEST_F(SolverTypeUpgradeTest, TestSimple) {
   for (int i = 0; i < 6; ++i) {
     const string& input_proto =
         "net: 'examples/mnist/lenet_train_test.prototxt' "
+        "weights: 'examples/mnist/lenet_train_test1.caffemodel' "
+        "weights: 'examples/mnist/lenet_train_test2.caffemodel' "
         "test_iter: 100 "
         "test_interval: 500 "
         "base_lr: 0.01 "
@@ -2968,6 +2970,8 @@ TEST_F(SolverTypeUpgradeTest, TestSimple) {
         "solver_type: " + std::string(old_type_vec[i]) + " ";
     const string& expected_output_proto =
         "net: 'examples/mnist/lenet_train_test.prototxt' "
+        "weights: 'examples/mnist/lenet_train_test1.caffemodel' "
+        "weights: 'examples/mnist/lenet_train_test2.caffemodel' "
         "test_iter: 100 "
         "test_interval: 500 "
         "base_lr: 0.01 "
diff --git a/src/caffe/util/hdf5.cpp b/src/caffe/util/hdf5.cpp
index ed73742937f..cefd853dff4 100644
--- a/src/caffe/util/hdf5.cpp
+++ b/src/caffe/util/hdf5.cpp
@@ -1,3 +1,4 @@
+#ifdef USE_HDF5
 #include "caffe/util/hdf5.hpp"
 
 #include <string>
@@ -207,3 +208,4 @@ string hdf5_get_name_by_idx(hid_t loc_id, int idx) {
 }
 
 }  // namespace caffe
+#endif  // USE_HDF5
diff --git a/src/caffe/util/io.cpp b/src/caffe/util/io.cpp
index 835d2d4e4ff..5295d9dddb9 100644
--- a/src/caffe/util/io.cpp
+++ b/src/caffe/util/io.cpp
@@ -106,7 +106,7 @@ cv::Mat ReadImageToCVMat(const string& filename) {
 static bool matchExt(const std::string & fn,
                      std::string en) {
   size_t p = fn.rfind('.');
-  std::string ext = p != fn.npos ? fn.substr(p) : fn;
+  std::string ext = p != fn.npos ? fn.substr(p+1) : fn;
   std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
   std::transform(en.begin(), en.end(), en.begin(), ::tolower);
   if ( ext == en )
diff --git a/src/caffe/util/math_functions.cpp b/src/caffe/util/math_functions.cpp
index 71c02274a75..59625bc05ce 100644
--- a/src/caffe/util/math_functions.cpp
+++ b/src/caffe/util/math_functions.cpp
@@ -196,6 +196,16 @@ void caffe_sqr<double>(const int n, const double* a, double* y) {
   vdSqr(n, a, y);
 }
 
+template <>
+void caffe_sqrt<float>(const int n, const float* a, float* y) {
+  vsSqrt(n, a, y);
+}
+
+template <>
+void caffe_sqrt<double>(const int n, const double* a, double* y) {
+  vdSqrt(n, a, y);
+}
+
 template <>
 void caffe_exp<float>(const int n, const float* a, float* y) {
   vsExp(n, a, y);
diff --git a/src/caffe/util/math_functions.cu b/src/caffe/util/math_functions.cu
index 6d001026082..314e6ba0f63 100644
--- a/src/caffe/util/math_functions.cu
+++ b/src/caffe/util/math_functions.cu
@@ -387,6 +387,27 @@ void caffe_gpu_powx<double>(const int N, const double* a,
       N, a, alpha, y);
 }
 
+template <typename Dtype>
+__global__ void sqrt_kernel(const int n, const Dtype* a, Dtype* y) {
+  CUDA_KERNEL_LOOP(index, n) {
+    y[index] = sqrt(a[index]);
+  }
+}
+
+template <>
+void caffe_gpu_sqrt<float>(const int N, const float* a, float* y) {
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  sqrt_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+      N, a, y);
+}
+
+template <>
+void caffe_gpu_sqrt<double>(const int N, const double* a, double* y) {
+  // NOLINT_NEXT_LINE(whitespace/operators)
+  sqrt_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
+      N, a, y);
+}
+
 DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index])
                                       - (x[index] < Dtype(0)));
 DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index]));
diff --git a/src/caffe/util/signal_handler.cpp b/src/caffe/util/signal_handler.cpp
index 5d764ec524f..9658fb390ea 100644
--- a/src/caffe/util/signal_handler.cpp
+++ b/src/caffe/util/signal_handler.cpp
@@ -48,7 +48,7 @@ namespace {
   void UnhookHandler() {
     if (already_hooked_up) {
       struct sigaction sa;
-      // Setup the sighub handler
+      // Setup the sighup handler
       sa.sa_handler = SIG_DFL;
       // Restart the system call, if at all possible
       sa.sa_flags = SA_RESTART;
diff --git a/src/caffe/util/upgrade_proto.cpp b/src/caffe/util/upgrade_proto.cpp
index 94771c8c050..ad40b73d295 100644
--- a/src/caffe/util/upgrade_proto.cpp
+++ b/src/caffe/util/upgrade_proto.cpp
@@ -2,6 +2,8 @@
 #include <google/protobuf/io/zero_copy_stream_impl.h>
 #include <google/protobuf/text_format.h>
 
+#include <boost/filesystem.hpp>
+
 #include <map>
 #include <string>
 
@@ -1095,12 +1097,31 @@ bool UpgradeSolverAsNeeded(const string& param_file, SolverParameter* param) {
   return success;
 }
 
+// Replaces snapshot_prefix of SolverParameter if it is not specified
+// or is set to directory
+void UpgradeSnapshotPrefixProperty(const string& param_file,
+                                   SolverParameter* param) {
+  using boost::filesystem::path;
+  using boost::filesystem::is_directory;
+  if (!param->has_snapshot_prefix()) {
+    param->set_snapshot_prefix(path(param_file).replace_extension().string());
+    LOG(INFO) << "snapshot_prefix was not specified and is set to "
+                + param->snapshot_prefix();
+  } else if (is_directory(param->snapshot_prefix())) {
+    param->set_snapshot_prefix((path(param->snapshot_prefix()) /
+                               path(param_file).stem()).string());
+    LOG(INFO) << "snapshot_prefix was a directory and is replaced to "
+                + param->snapshot_prefix();
+  }
+}
+
 // Read parameters from a file into a SolverParameter proto message.
 void ReadSolverParamsFromTextFileOrDie(const string& param_file,
                                        SolverParameter* param) {
   CHECK(ReadProtoFromTextFile(param_file, param))
       << "Failed to parse SolverParameter file: " << param_file;
   UpgradeSolverAsNeeded(param_file, param);
+  UpgradeSnapshotPrefixProperty(param_file, param);
 }
 
 }  // namespace caffe
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 02fbd5cadd8..3789450555e 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -25,5 +25,6 @@ foreach(source ${srcs})
   endif()
 
   # Install
-  install(TARGETS ${name} DESTINATION bin)
+  install(TARGETS ${name} DESTINATION ${CMAKE_INSTALL_BINDIR})
+
 endforeach(source)
diff --git a/tools/caffe.cpp b/tools/caffe.cpp
index 3587d8aa1be..389cfb8a99e 100644
--- a/tools/caffe.cpp
+++ b/tools/caffe.cpp
@@ -146,20 +146,6 @@ int device_query() {
 }
 RegisterBrewFunction(device_query);
 
-// Load the weights from the specified caffemodel(s) into the train and
-// test nets.
-void CopyLayers(caffe::Solver<float>* solver, const std::string& model_list) {
-  std::vector<std::string> model_names;
-  boost::split(model_names, model_list, boost::is_any_of(",") );
-  for (int i = 0; i < model_names.size(); ++i) {
-    LOG(INFO) << "Finetuning from " << model_names[i];
-    solver->net()->CopyTrainedLayersFrom(model_names[i]);
-    for (int j = 0; j < solver->test_nets().size(); ++j) {
-      solver->test_nets()[j]->CopyTrainedLayersFrom(model_names[i]);
-    }
-  }
-}
-
 // Translate the signal effect the user specified on the command-line to the
 // corresponding enumeration.
 caffe::SolverAction::Enum GetRequestedAction(
@@ -233,6 +219,13 @@ int train() {
         GetRequestedAction(FLAGS_sigint_effect),
         GetRequestedAction(FLAGS_sighup_effect));
 
+  if (FLAGS_snapshot.size()) {
+    solver_param.clear_weights();
+  } else if (FLAGS_weights.size()) {
+    solver_param.clear_weights();
+    solver_param.add_weights(FLAGS_weights);
+  }
+
   shared_ptr<caffe::Solver<float> >
       solver(caffe::SolverRegistry<float>::CreateSolver(solver_param));
 
@@ -241,8 +234,6 @@ int train() {
   if (FLAGS_snapshot.size()) {
     LOG(INFO) << "Resuming from " << FLAGS_snapshot;
     solver->Restore(FLAGS_snapshot.c_str());
-  } else if (FLAGS_weights.size()) {
-    CopyLayers(solver.get(), FLAGS_weights);
   }
 
   LOG(INFO) << "Starting Optimization";
diff --git a/tools/compute_image_mean.cpp b/tools/compute_image_mean.cpp
index 2035d515195..417f5e4c622 100644
--- a/tools/compute_image_mean.cpp
+++ b/tools/compute_image_mean.cpp
@@ -22,9 +22,11 @@ DEFINE_string(backend, "lmdb",
         "The backend {leveldb, lmdb} containing the images");
 
 int main(int argc, char** argv) {
+#ifdef USE_OPENCV
   ::google::InitGoogleLogging(argv[0]);
+  // Print output to stderr (while still logging)
+  FLAGS_alsologtostderr = 1;
 
-#ifdef USE_OPENCV
 #ifndef GFLAGS_GFLAGS_H_
   namespace gflags = google;
 #endif
@@ -65,7 +67,7 @@ int main(int argc, char** argv) {
   for (int i = 0; i < size_in_datum; ++i) {
     sum_blob.add_data(0.);
   }
-  LOG(INFO) << "Starting Iteration";
+  LOG(INFO) << "Starting iteration";
   while (cursor->valid()) {
     Datum datum;
     datum.ParseFromString(cursor->value());
@@ -114,7 +116,7 @@ int main(int argc, char** argv) {
     for (int i = 0; i < dim; ++i) {
       mean_values[c] += sum_blob.data(dim * c + i);
     }
-    LOG(INFO) << "mean_value channel [" << c << "]:" << mean_values[c] / dim;
+    LOG(INFO) << "mean_value channel [" << c << "]: " << mean_values[c] / dim;
   }
 #else
   LOG(FATAL) << "This tool requires OpenCV; compile with USE_OPENCV.";
diff --git a/tools/convert_imageset.cpp b/tools/convert_imageset.cpp
index 90cdb15d427..9c5d09f9bef 100644
--- a/tools/convert_imageset.cpp
+++ b/tools/convert_imageset.cpp
@@ -115,7 +115,7 @@ int main(int argc, char** argv) {
       size_t p = fn.rfind('.');
       if ( p == fn.npos )
         LOG(WARNING) << "Failed to guess the encoding of '" << fn << "'";
-      enc = fn.substr(p);
+      enc = fn.substr(p+1);
       std::transform(enc.begin(), enc.end(), enc.begin(), ::tolower);
     }
     status = ReadImageToDatum(root_folder + lines[line_id].first,
diff --git a/tools/device_query.cpp b/tools/device_query.cpp
deleted file mode 100644
index 03799e52b53..00000000000
--- a/tools/device_query.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#include "caffe/common.hpp"
-
-int main(int argc, char** argv) {
-  LOG(FATAL) << "Deprecated. Use caffe device_query "
-                "[--device_id=0] instead.";
-  return 0;
-}
diff --git a/tools/extra/parse_log.sh b/tools/extra/parse_log.sh
index 9892c897682..122eb9e6eed 100755
--- a/tools/extra/parse_log.sh
+++ b/tools/extra/parse_log.sh
@@ -39,7 +39,7 @@ rm aux.txt aux0.txt aux1.txt aux2.txt aux3.txt aux4.txt
 grep '] Solving ' $1 > aux.txt
 grep ', loss = ' $1 >> aux.txt
 grep 'Iteration ' aux.txt | sed  's/.*Iteration \([[:digit:]]*\).*/\1/g' > aux0.txt
-grep ', loss = ' $1 | awk '{print $9}' > aux1.txt
+grep ', loss = ' $1 | awk -F = '{print $2}' > aux1.txt
 grep ', lr = ' $1 | awk '{print $9}' > aux2.txt
 
 # Extracting elapsed seconds
diff --git a/tools/extra/resize_and_crop_images.py b/tools/extra/resize_and_crop_images.py
index c844f590c06..fd2c3134edb 100755
--- a/tools/extra/resize_and_crop_images.py
+++ b/tools/extra/resize_and_crop_images.py
@@ -101,7 +101,7 @@ def map(self, key, value):
         yield value, FLAGS.output_folder
 
 mapreducer.REGISTER_DEFAULT_MAPPER(ResizeCropImagesMapper)
-
+mapreducer.REGISTER_DEFAULT_REDUCER(mapreducer.NoPassReducer)
 mapreducer.REGISTER_DEFAULT_READER(mapreducer.FileReader)
 mapreducer.REGISTER_DEFAULT_WRITER(mapreducer.FileWriter)
  
diff --git a/tools/finetune_net.cpp b/tools/finetune_net.cpp
deleted file mode 100644
index 81c0c354dbf..00000000000
--- a/tools/finetune_net.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#include "caffe/caffe.hpp"
-
-int main(int argc, char** argv) {
-  LOG(FATAL) << "Deprecated. Use caffe train --solver=... "
-                "[--weights=...] instead.";
-  return 0;
-}
diff --git a/tools/net_speed_benchmark.cpp b/tools/net_speed_benchmark.cpp
deleted file mode 100644
index cd16e8d0984..00000000000
--- a/tools/net_speed_benchmark.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#include "caffe/caffe.hpp"
-
-int main(int argc, char** argv) {
-  LOG(FATAL) << "Deprecated. Use caffe time --model=... "
-             "[--iterations=50] [--gpu] [--device_id=0]";
-  return 0;
-}
diff --git a/tools/test_net.cpp b/tools/test_net.cpp
deleted file mode 100644
index 92e14eeebaf..00000000000
--- a/tools/test_net.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#include "caffe/caffe.hpp"
-
-int main(int argc, char** argv) {
-  LOG(FATAL) << "Deprecated. Use caffe test --model=... "
-      "--weights=... instead.";
-  return 0;
-}
diff --git a/tools/train_net.cpp b/tools/train_net.cpp
deleted file mode 100644
index 622bca311c8..00000000000
--- a/tools/train_net.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#include "caffe/caffe.hpp"
-
-int main(int argc, char** argv) {
-  LOG(FATAL) << "Deprecated. Use caffe train --solver=... "
-                "[--snapshot=...] instead.";
-  return 0;
-}